def do_GET(self):
    """Handle GET: convert an amount of roubles into the requested currency.

    Reads the ``currency_code`` and (optional) amount query parameters,
    answers 200 with a JSON conversion payload on success, otherwise 404
    with a JSON ``error`` message.
    """
    GET_parameters = parse_qs(urlparse.urlparse(self.path).query)
    currency_code = GET_parameters.get(CURRENCY_CODE_PARAMETER)
    # parse_qs maps each key to a list of values; keep only the first one
    currency_code = currency_code[0] if currency_code else None
    amount = GET_parameters.get(REQUESTED_AMOUNT_PARAMETER)
    # default amount is the int 1 (not the string '1')
    # NOTE(review): isfloat() below is therefore also fed a non-string — confirm it accepts that
    amount = amount[0] if amount else 1
    if currency_code == USD_CODE and isfloat(amount):
        converted_value = currencies_parser.convert_roubles(
            currency_code, float(amount))
        self.send_response(200)
        self._set_headers()
        self._json_response({
            'currency_code': USD_CODE,
            'requested_amount': float(amount),
            'result_amount': converted_value
        })
    else:
        # pick the most specific error description for the failure
        msg = ''
        if currency_code is None:
            msg = 'Not found "currency_code" parameter'
        elif currency_code and currency_code != USD_CODE:
            msg = 'ValueError in "currency_code" parameter.'
        elif not isfloat(amount):
            msg = 'ValueError: "amount" parameter must be a number'
        self.send_response(404)
        self._set_headers()
        self._json_response({'error': msg})
def main():
    """Load the Iris CSV, run a correlation-matrix SVD and render the PCA."""
    # get file handle
    path = '/home/orcudy/Desktop/cs170a/data/Iris.csv'
    fd = open(path, 'r')
    lines = list(fd)
    fd.close()
    # retrieve flower data
    featureIDs = csv.generateFeatureIDList(lines, ',')
    # remove whitespace from IDs
    # NOTE(review): filter() on a str yields a str only on Python 2; under
    # Python 3 this stores filter objects — confirm the target interpreter.
    for index in range(len(featureIDs)):
        featureIDs[index] = filter(lambda x: x.isalnum(), featureIDs[index])
    # generate feature map and convert to appropriate type
    # non-numeric cells are replaced by the sentinel -1.0
    floatConversionUnit = (featureIDs,
                           lambda x: float(x) if utils.isfloat(x) else -1.0)
    dataMap = csv.convertType(
        csv.generateDataMap(lines, featureIDs), [floatConversionUnit])
    # set parameters for analysis
    species = csv.generateArray(dataMap, ['Species']).T
    basePath = '/home/orcudy/Desktop/cs170a/logs'
    ID = 'iris'
    data = csv.generateArray(dataMap, featureIDs)
    correlation = linalg.computeCorrelationMatrix(data, featureIDs, True,
                                                  basePath, ID)
    u, s, vt = linalg.computeSVD(correlation, featureIDs, True, basePath, ID)
    renderIrisPCA(u, data, species)
def _import_dx(filename):
    """Parse an OpenDX (.dx) scalar-field file.

    Returns (origin, delta, data): the grid origin, grid spacing values,
    and the data array reshaped to the grid dimensions.
    """
    origin = delta = data = dims = None
    counter = 0  # index of the current 3-value data row
    with open(filename, 'r') as dxfile:
        for row in dxfile:
            row = row.strip().split()
            if not row:
                continue
            # NOTE(review): only matches a lone '#' token; a comment written
            # as '#foo' with no space would fall through — confirm inputs.
            if row[0] == '#':
                continue
            elif row[0] == 'origin':
                origin = np.array(row[1:], dtype=float)
            elif row[0] == 'delta':
                # NOTE(review): row[2:] drops the field right after 'delta',
                # and each successive 'delta' line overwrites the previous —
                # verify this against the writer that produces these files.
                delta = np.array(row[2:], dtype=float)
            elif row[0] == 'object':
                if row[1] == '1':
                    # "object 1" header: grid counts are its last 3 fields
                    dims = np.array(row[-3:], dtype=int)
                    data = np.empty(np.prod(dims))
            elif isfloat(row[0]):
                # value rows: up to 3 floats each, written sequentially
                data[3 * counter:min(3 * (counter + 1), len(data))] = np.array(
                    row, dtype=float)
                counter += 1
    data = data.reshape(dims)
    return origin, delta, data
def classify_headers(headers):
    """Guess whether a header row lists data, varieties, locations or measures."""
    # A row that is numeric everywhere past the first cell is a data row.
    if all(isfloat(cell) for cell in headers[1:]):
        return HeaderTypes.Data

    # Otherwise fuzzy-match the row against names known to the database.
    variety_names = [variety.name for variety in models.Variety.objects.all()]
    location_names = [location.name for location in models.Location.objects.all()]
    measure_names = models.TrialEntry.measures + tuple(measure_aliases.keys())

    scored = (
        (fuzz.token_sort_ratio(headers, ' '.join(variety_names)),
         HeaderTypes.Varieties),
        (fuzz.token_sort_ratio(headers, ' '.join(location_names)),
         HeaderTypes.Locations),
        (fuzz.token_sort_ratio(headers, ' '.join(measure_names)),
         HeaderTypes.Measures),
    )
    best_ratio, best_type = max(scored, key=lambda pair: pair[0])
    # Too weak a best match means the row cannot be classified.
    if best_ratio < 40:
        return None
    return best_type
def parseGetCarriedObjects(ch, argL, cmd):
    """ Parse and return the items carried by the player, listed in argL.
    Supports:
        cmd item
        cmd i.item
        cmd n item
        cmd all item
        cmd money
        cmd n currency
        cmd all currency

    Returns (objL, i) on success, 1 when a non-positive count is rejected,
    or None when the arguments or carried objects do not match.
    """
    if utils.isnumber(argL[0]):
        # integer count prefix: "cmd n item"
        n = int(argL[0])
        i = 1
        if n <= 0:
            ch.writeToSelf("Heh heh.. sure.. why don't you try that in some alternative universe?\r\n")
            return 1
    elif utils.isfloat(argL[0]):
        # fractional count prefix (used for currency amounts)
        n = string.atof(argL[0])
        i = 1
        if n <= 0:
            ch.writeToSelf("Heh heh.. sure.. why don't you try that in some alternative universe?\r\n")
            return 1
    elif argL[0] == "all":
        n = -1  # sentinel: take everything that matches
        i = 1
    else:
        n = 1
        i = 0
    if len(argL)-1 < i:
        # BUGFIX: this prompt addresses the acting player, so it must be
        # writeToSelf; it was writeToOthers (broadcast to the room).
        ch.writeToSelf("How many of what do you want to "+cmd+"?\r\n")
        return None
    if n == 1:
        obj = ch.findCarriedObject(argL[i])
        if not obj:
            ch.writeToSelf("You don't have "+argL[i]+".\r\n")
            return None
        objL = [obj]
    else:
        objL = ch.findCarriedObjects(argL[i], n)
        if not objL:
            ch.writeToSelf("You don't have "+argL[i]+".\r\n")
            return None
        # money stacks may legitimately return fewer objects than asked for
        if len(objL) != n and not isinstance(objL[0], Money):
            ch.writeToSelf("You don't have that many "+argL[i]+".\r\n")
            return None
    return (objL, i)
def test_isfloat(self):
    """utils.isfloat accepts numeric strings and rejects non-numeric ones."""
    for text in ('0', '1', '1.02', '100000', '-1'):
        self.assertTrue(utils.isfloat(text))
    for text in ('lorem', '2t2'):
        self.assertFalse(utils.isfloat(text))
def summary(self,*vars):
    """Display a summary of all recorded notes, checks, records."""
    if vars:
        # echo any explicitly supplied variables first
        print("Values Used:")
        print("============")
        print()
        show(*vars,depth=1)
    var = self.var
    # build and underline the heading
    hd = 'Summary of'
    if var is not None:
        hd += ' '+var+' for '
    hd += self.__class__.__name__
    if self.title:
        hd += ': '+str(self.title)
    print()
    print(hd)
    print('=' * len(hd))
    print()
    if self._notes:
        print('Notes:')
        print('------')
        for txt in self._notes:
            print(' -',txt)
        print()
    if self._checks:
        print('Checks:')
        print('-------')
        # pad every check label to the widest one
        width = max([len(l) for f,l,v,d in self._checks])
        for chk in self._checks:
            print(self.fmt_check(chk,width=width+2))
        print()
    hd = 'Values'
    if self.var:
        hd += ' of '+self.var
    hd += ':'
    print(hd)
    print('-'*len(hd))
    width = max([len(l) for l,v,d in self._record])
    govval = None
    if var:
        # governing value: chosen by the instance's selector over all records
        govval = self.selector([d[var] for l,v,d in self._record])
    for rec in self._record:
        print(self.fmt_record(rec,var=var,width=width+1,govval=govval,nsigfigs=self.nsigfigs))
    if govval is not None:
        print()
        h = 'Governing Value:'
        print('    ',h)
        print('    ','-'*len(h))
        # round numeric governing values; repr() everything else
        print('      ','{0} = {1}'.format(var,(sfrounds(govval,self.nsigfigs) if isfloat(govval) else "{0!r}".format(govval))), self.units if self.units is not None else '')
def on_main_button_click(self, sender, sender_name):
    """Linearly remap the start times of the selected subtitles.

    The first and last items' start times are shifted by ms1/ms2 and every
    other start time is interpolated between them via grad(); durations are
    preserved.  Closes the dialog afterwards.
    """
    if sender_name != 'OK':
        self.destroy()
        return
    # both millisecond fields must be numeric before any conversion
    if not( isfloat(self.res['ms1']) and isfloat(self.res['ms2']) ):
        return
    # BUGFIX: the fields are validated with isfloat(), so '250.5' is
    # accepted — but plain int('250.5') raises ValueError.  Convert
    # through float() first; integer strings are unaffected.
    ms1 = int(float(self.res['ms1']))
    ms2 = int(float(self.res['ms2']))
    # Create list of subtitles to apply changes
    applyItems = []
    if self.res['applyToSubs'] == 'all lines':
        for item in self.subtitleModel.get_model():
            applyItems.append(item[0])
    else:
        self.get_tv_selection()
        if len(self.tvSelectionList) == 0:
            self.destroy()
            return
        applyItems = self.tvSelectionList[:]
    # ROBUSTNESS: an empty model previously crashed on applyItems[0]
    if not applyItems:
        self.destroy()
        return
    # anchors: first/last start times before (A0,B0) and after (A,B) shifting
    A0 = int(applyItems[0].startTime)
    B0 = int(applyItems[-1].startTime)
    A = int(applyItems[0].startTime) + ms1 * (1 if self.res['op1'] == 'Add' else -1)
    B = int(applyItems[-1].startTime) + ms2 * (1 if self.res['op2'] == 'Add' else -1)
    if A0 == B0:
        # degenerate span: interpolation is undefined, bail out
        return
    for item in applyItems:
        duration = int(item.duration)
        new_start_time = int( grad(A0, B0, A, B, int(item.startTime)) )
        new_stop_time = int(new_start_time) + duration
        # record (item, old start, old stop, new start, new stop) for undo
        self.changeList.append( (item, int(item.startTime), int(item.stopTime), int(new_start_time), int(new_stop_time)) )
        item.startTime = new_start_time
        item.stopTime = new_stop_time
    self.destroy()
def _validate(value):
    """
    Ensures that a candidate token contains only valid characters and that
    a single token contains only letters or only numbers

    :param value: string that will be the value of the token
    """
    if not LEGAL_CHARS.match(value):
        raise ValueError('Contains illegal characters')
    # A token must be a pure number or a pure alphabetic word — never a mix.
    looks_numeric = isfloat(value)
    looks_alpha = value.isalpha()
    if not (looks_numeric or looks_alpha):
        raise ValueError('Token contains letters and numbers')
def load(filepath):
    """Load a ``key: value`` attribute file into a Dict.

    Values are decoded as None, int, float, or kept as strings, in that
    order of preference.
    """
    attrs = {}
    with open(filepath, "r") as f:
        for line in f:
            # ROBUSTNESS: split only on the first ':' so values may
            # themselves contain colons (e.g. timestamps, paths)
            s = [i.strip() for i in line.split(":", 1)]
            if s[1] == "None":
                attrs[s[0]] = None
            elif s[1].isdigit():
                # BUGFIX: test int before float.  isfloat() also accepts
                # integer strings, so the isdigit branch was unreachable
                # and every non-negative int round-tripped as a float.
                attrs[s[0]] = int(s[1])
            elif utils.isfloat(s[1]):
                attrs[s[0]] = float(s[1])
            else:
                attrs[s[0]] = s[1]
    return Dict(**attrs)
def findoutliers(database):
    """Detect per-column outliers in a tab-separated dataset with Spark.

    Samples each column to guess its type, dispatches to the matching
    outlier finder, and saves (header, outliers) pairs to 'x.out'.
    """
    spark = SparkSession.builder.appName(
        "Python Spark SQL basic example").config("spark.some.config.option",
                                                 "some-value").getOrCreate()
    sc = spark.sparkContext
    # ship the helper modules to the executors
    sc.addFile("utils.py")
    sc.addFile("onumeric.py")
    sc.addFile("ostring.py")
    lines = sc.textFile(database)
    headers = lines.first()
    rows = lines
    # NOTE(review): the header row is NOT filtered out of the data (the
    # filter below is commented), only dropped per-column via valuelist[1:]
    #rows = lines.filter(lambda x: x!=headers)
    headers = headers.split("\t")
    rows = rows.map(lambda l: l.split("\t"))
    rowsdf = spark.createDataFrame(rows)
    outlierslist = []
    for i in sc.range(len(headers)).collect():
        header = headers[i]
        # createDataFrame auto-names untyped columns _1, _2, ...
        valuelistrdd = rowsdf.select("_" + str(i + 1)).rdd.flatMap(lambda x: x)
        valuelist = valuelistrdd.collect()
        valuelist = valuelist[1:]
        # sample 10 cells to guess the column type
        # NOTE(review): raises ValueError when a column has < 10 rows
        rand_smpl = [
            valuelist[i]
            for i in sorted(random.sample(range(len(valuelist)), 10))
        ]
        smpl_type = utils.gettype(rand_smpl)
        if smpl_type == 'Numeric':
            # null-style outliers first, then numeric outliers on clean values
            valuelistnew = onull.find_outliers(valuelistrdd)
            valuelistrdd = valuelistrdd.filter(lambda x: utils.isfloat(x))
            valuelistnew = valuelistnew + onumeric.find_outliers(valuelistrdd)
        elif smpl_type == 'String':
            valuelistnew = []
            valuelistnew = ostring.find_outliers(valuelistrdd)
        elif smpl_type == 'Date':
            valuelistnew = onull.find_outliers(valuelistrdd)
            valuelistrdd = valuelistrdd.filter(lambda x: utils.isdate(x))
            valuelistnew = valuelistnew + odate.find_outliers(valuelistrdd)
        elif smpl_type == 'None':
            valuelistnew = []
        outlierslist.append((header, valuelistnew))
    database = sc.parallelize(outlierslist)
    database.map(lambda x: "{0} \t {1}".format(x[0], x[1])).saveAsTextFile(
        "x.out")
def parse_dirty_dataset(path):
    """Read a messy CSV, coercing known numeric fields and filling gaps."""
    rows = []
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            record = {}
            for field_name, value in row.items():
                if field_name in field_map:
                    # normalise whitespace and decimal commas first
                    value = value.strip().replace(',', '.')
                    if utils.isfloat(value):
                        value = float(value)
                    elif value in ('-', ''):
                        # missing datum: substitute a computed average
                        value = make_average_value(field_name)
                    elif field_name in text_fields:
                        pass
                    else:
                        raise Exception("unknown field '%s'" % repr(field_name))
                record[field_name] = value
            rows.append(record)
    return rows
def keywords(self):
    # Build a dictionary of every non-stopword in lemmatized form:
    #   key   = lemmatized word
    #   value = list of all word instances sharing that lemmatization
    stopw = self.parent.parent.stopw
    word_dict = {}
    for wd in self.words:
        if wd.lemmatized in stopw:
            continue
        if wd.text in string.punctuation or isfloat(wd.text):
            continue
        word_dict.setdefault(wd.lemmatized, []).append(wd)
    return word_dict
def main():
    """Run a PCA over each questionnaire category of the survey CSV."""
    # get file handle
    path = '/home/orcudy/Desktop/cs170a/data/CSV.csv'
    fd = open(path, 'r')
    lines = list(fd)
    fd.close()
    # cleanup first and last question IDs (others are already clean)
    lines[0] = lines[0].split(',')
    for index in [0,-1]:
        lines[0][index] = filter(lambda x: x.isalnum(), lines[0][index])
    lines[0] = ','.join(lines[0])
    # NOTE(review): filter() returning a joinable str assumes Python 2;
    # under Python 3 this breaks — confirm the target interpreter.
    # retrieve question IDs
    questionIDs = csv.generateFeatureIDList(lines, ',')
    # data dynamics: which columns are text, float, or int
    chars = ['usr']
    floats = ['weight', 'standwt']
    ints = filter(lambda x: x not in chars + floats, questionIDs)
    # convert feature map to appropriate type (-1/-1.0 marks unparseable)
    floatConversionUnit = (floats,
                           lambda x: float(x) if utils.isfloat(x) else -1.0)
    intConversionUnit = (ints, lambda x: int(x) if x.isdigit() else -1)
    responseMap = csv.convertType(
        csv.generateDataMap(lines, questionIDs),
        [floatConversionUnit, intConversionUnit])
    # mask "Don't know" and "Refused" responses
    maskValues = [-1, 8, 9, 98, 99, 998, 999, 9998, 9999, 99999]
    position = csv.generateArray(responseMap, keys.position).T
    basePath = '/home/orcudy/Desktop/cs170a/logs'
    # begin analysis: one PCA per question category
    datakeys = [keys.demographic, keys.phone, keys.control, keys.internet,
                keys.social, keys.government, keys.exclusion]
    IDs = ['PCA on Demographic Category', 'PCA on Phone Category',
           'PCA on Control Category', 'PCA on Internet category',
           'PCA on Social Category', 'PCA on Government Category',
           'PCA on Exclusion Category']
    for index in range(len(datakeys)):
        key = datakeys[index]
        ID = IDs[index]
        data = csv.generateMaskedArray(responseMap, key, maskValues)
        correlation = linalg.computeCorrelationMatrix(data, keys=key,
                                                      log=True, path=basePath,
                                                      keyname=ID)
        u, s, vt = linalg.computeSVD(correlation, keys=key, log=True,
                                     path=basePath, keyname=ID)
        renderPCA(u, data, position, ID)
def check_orthography(self, sub):
    """Spell-check a subtitle's text and record misspellings in sub.info."""
    if not self.dict:
        return
    # tokenize on non-word characters after stripping markup tags
    tokens = re.sub("[\W]", " ", remove_tags(sub.text), flags=re.UNICODE).split()
    misspelled = [token for token in tokens
                  if not self.dict.check(token) and not isfloat(token)]
    coordinates = []
    for word in misspelled:
        coordinates.extend(find_all_str(sub.text, word))
    if misspelled:
        sub.info = ('Text-Error-Orthography',
                    (u'Ορθογραφία: ' + ','.join(w for w in misspelled),
                     coordinates))
    else:
        sub.info = ('Text-Error-Orthography', '')
def fmt_record(self,rec,width=None,var=None,govval=None,nsigfigs=4):
    """Format a computation record for display.

    rec is a (label, varlist, vars-dict) triple; if var is set its value
    is shown rounded (when numeric), tagged with units, and marked
    '<-- governs' when it equals govval.  Remaining vars are appended.
    """
    label,_varlist,_vars = rec
    _vars = _vars.copy()  # pop() below must not mutate the stored record
    if width is None:
        width = len(label)
    if var is None:
        var = self.var
    ans = "    {label:<{width}} ".format(label=label+':',width=width)
    if var:
        val = _vars.pop(var)
        ##print(val, type(val))
        # round numeric values to nsigfigs; repr() everything else
        ans += '{0} = {1}'.format(var,(sfrounds(val,nsigfigs) if isfloat(val) else "{0!r}".format(val)))
        if self.units:
            ans += ' '+str(self.units)
        if govval is not None:
            if val == govval:
                ans += ' <-- governs'
    if _vars:
        # show the remaining (non-primary) variables on a second line
        ans += '\n          ('+fmt_dict(_vars)+')'
    return ans
def atom(self, tokens):
    """Consume one token and build the corresponding leaf AST node, if any."""
    token = tokens.pop()
    node = None
    # branch order matters: numeric literals are claimed before keywords
    if isfloat(token.value):
        node = Number(token.value)
    elif self.match(token, (TokenType.TRUE, TokenType.FALSE)):
        node = Boolean(token.value)
    elif self.match(token, (TokenType.EMPTY, )):
        node = Empty()
    elif self.match(token, (TokenType.CONST, )):
        node = String(token.value)
    elif self.match(token, (TokenType.IDENT, )):
        # an identifier followed by a store/open-paren is a STORE target;
        # a bare identifier (not a function declaration) is a LOAD
        if self.match(tokens.peek(), (TokenType.STORE, TokenType.OPEN_PAREN)):
            node = Name(token.value, Context.STORE)
        elif not self.match(tokens.peek(), (TokenType.OPEN_PAREN, )):
            node = Name(token.value, Context.LOAD)
    # only tokens that produced a node are pushed onto the call stack
    if node:
        self.callstack.push(token)
    return node
def classify_headers(headers):
    """Classify a header row as data, varieties, locations or measures."""
    # an all-numeric tail (ignoring the leading cell) marks a data row
    if all(isfloat(header) for header in headers[1:]):
        return HeaderTypes.Data

    # fuzzy-match the row against the names already in the database
    variety_text = " ".join(v.name for v in models.Variety.objects.all())
    location_text = " ".join(l.name for l in models.Location.objects.all())
    measure_text = " ".join(models.TrialEntry.measures + tuple(measure_aliases.keys()))

    ranked = [
        (fuzz.token_sort_ratio(headers, variety_text), HeaderTypes.Varieties),
        (fuzz.token_sort_ratio(headers, location_text), HeaderTypes.Locations),
        (fuzz.token_sort_ratio(headers, measure_text), HeaderTypes.Measures),
    ]
    ratio, header_type = max(ranked, key=lambda item: item[0])
    # a weak best match is treated as unclassifiable
    return None if ratio < 40 else header_type
def on_main_button_click(self, sender, sender_name):
    """Shift the start/stop times of the selected subtitles and close.

    milliseconds1 is applied to the time(s) named by applyToTime1;
    milliseconds2 shifts stop times in 'Start Time' mode.  Changes that
    would overlap neighbours (within a 120ms gap), go negative, or leave
    a duration under 1000ms are skipped.
    """
    if sender_name != 'OK':
        self.destroy()
        return
    # all primary inputs must be present
    if self.res['applyToSubs'] is None or self.res['milliseconds1'] is None or self.res['operation1'] is None or self.res['applyToTime1'] is None:
        self.destroy()
        return
    # in Start Time mode a second operation also needs a numeric field
    if self.res['applyToTime1'] == 'Start Time' and (self.res['operation2'] is not None and not isfloat(self.res['milliseconds2'])):
        self.destroy()
        return
    # NOTE(review): validated with isfloat() but converted with int()
    # below — a value like '250.5' passes here and crashes later; confirm
    # the widget only ever yields integer strings.
    if not isfloat(self.res['milliseconds1']):
        self.destroy()
        return
    # Create list of subtitles to apply changes
    applyItems = []
    if self.res['applyToSubs'] == 'all lines':
        for item in self.subtitleModel.get_model():
            applyItems.append(item[0])
    else:
        self.get_tv_selection()
        if len(self.tvSelectionList) == 0:
            self.destroy()
            return
        applyItems = self.tvSelectionList[:]
    # signed deltas: 'add' means positive, anything else negative
    if self.res['milliseconds1'] is not None:
        milliseconds1 = int(self.res['milliseconds1']) * (1 if self.res['operation1'] == 'add' else -1)
    else:
        milliseconds1 = 0
    # NOTE(review): milliseconds2's sign is taken from operation1, not
    # operation2 — looks like a copy/paste slip; confirm intended.
    if self.res['milliseconds2'] is not None:
        milliseconds2 = int(self.res['milliseconds2']) * (1 if self.res['operation1'] == 'add' else -1)
    else:
        milliseconds2 = 0
    # NOTE(review): operation1Items is built and reversed but never used
    operation1Items = applyItems[:]
    if milliseconds1 > 0:
        operation1Items.reverse()
    for item in applyItems:
        newBound = [None, None]  # [new start, new stop]
        if self.res['applyToTime1'] == 'Start Time':
            newBound[0] = int(item.startTime) + milliseconds1
            newBound[1] = int(item.stopTime) + milliseconds2
        elif self.res['applyToTime1'] == 'Stop Time':
            newBound[0] = int(item.startTime)
            newBound[1] = int(item.stopTime) + milliseconds1
        elif self.res['applyToTime1'] == 'Both Start and Stop Times':
            newBound[0] = int(item.startTime) + milliseconds1
            newBound[1] = int(item.stopTime) + milliseconds1
        else:
            continue
        nextSub = self.subtitleModel.get_next(item)
        prevSub = self.subtitleModel.get_prev(item)
        # a negative start aborts the whole operation (not just this item)
        if newBound[0] < 0:
            return
        # keep at least a 120ms gap from both neighbours
        if nextSub is not None and newBound[1] > nextSub.startTime - 120:
            continue
        if prevSub is not None and newBound[0] < prevSub.stopTime + 120:
            continue
        # reject inverted spans and durations under one second
        if newBound[0] > newBound[1] or newBound[1] - newBound[0] < 1000:
            continue
        # record (item, old start, old stop, new start, new stop) for undo
        self.changeList.append((item, int(item.startTime), int(item.stopTime), int(newBound[0]), int(newBound[1])))
        item.startTime = int(newBound[0])
        item.stopTime = int(newBound[1])
    self.destroy()
def on_main_button_click(self, sender, sender_name):
    """Adjust the duration of subtitles whose current duration satisfies
    the chosen condition, clamping to minimum-duration and 120ms-gap rules,
    then close the dialog.
    """
    # BUGFIX: both millisecond fields are converted below, so BOTH must be
    # numeric — the old check used `or`, accepting one invalid field and
    # crashing on the conversion.
    if sender_name != 'OK' or any(entry is None for entry in self.res.itervalues()) or self.subtitleModel.is_empty() or not(isfloat(self.res['originalMS']) and isfloat(self.res['targetMS'])):
        self.destroy()
        return
    # Create list of subtitles to apply changes
    applyItems = []
    if self.res['applyTo'] == 'all lines':
        for item in self.subtitleModel.get_model():
            applyItems.append(item[0])
    else:
        self.get_tv_selection()
        if len(self.tvSelectionList) == 0:
            self.destroy()
            return
        applyItems = self.tvSelectionList[:]
    # BUGFIX: convert through float() — isfloat() admits strings such as
    # '1.5' that a bare int() would reject with ValueError.
    origMS = int(float(self.res['originalMS']))
    targetMS = int(float(self.res['targetMS']))
    for item in applyItems:
        # skip items whose duration fails the selected comparison
        # BUGFIX: the '<=' branch previously tested '<' instead of '<='
        if (self.res['condition'] == '<' and not(item.duration < origMS)) or (self.res['condition'] == '<=' and not(item.duration <= origMS)) or (self.res['condition'] == '>' and not(item.duration > origMS)) or (self.res['condition'] == '>=' and not(item.duration >= origMS)) or (self.res['condition'] == '=' and not(item.duration == origMS)):
            continue
        # Check if new duration is less than the minimum allowed
        minDuration = int ( (20.0 * (item.calc_target_duration() - 500) / 26.9999999999) + 500 ) + 1
        targetDuration = item.duration + targetMS * (1 if self.res['operation'] == 'add' else -1)
        targetDuration = targetDuration if targetDuration >= minDuration else minDuration
        targetDuration = targetDuration if targetDuration >= 1000 else 1000
        # Check if gap is not violated
        newStop = int(item.stopTime) + (int(targetDuration) - int(item.duration))
        nextSub = self.subtitleModel.get_next(item)
        if nextSub is not None and newStop > nextSub.startTime - 120:
            newStop = nextSub.startTime - 120
        # Check if change in stopTime violates min duration
        if int(newStop) - int(item.startTime) < int(minDuration):
            continue
        # Finally Apply newStop (record old/new bounds for undo)
        self.changeList.append( (item, int(item.startTime), int(item.stopTime), int(item.startTime), int(newStop)) )
        item.stopTime = int(newStop)
    self.destroy()
def readData(class_name, class_questionnaire='Q92510', data_path=None, missing_input='none', dummy=False, transform_numeric=False, use_text=False, skip_class_questionnaire=True):
    """Read a questionnaire CSV and return a numeric pandas DataFrame.

    Each column is sniffed (date / text / float / int / dummy-binary),
    transformed accordingly, optionally imputed, and optionally expanded
    into dummy or bag-of-words columns.  The class column (class_name) is
    always factorized to ints.
    """
    # attributes are separated by commas (',')
    # "nan" is assigned to fields with 'N/A' or 'None'
    print('Reading data...')
    data = pd.read_csv(data_path, header=0, delimiter=",", na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'], quoting=0, encoding='utf8')
    # dummy expansion only makes sense when categoricals become numeric
    if (not transform_numeric):
        dummy = False
    #print(data.columns[-1])
    # data = data.dropna(subset=[class_name])
    # data = data.drop(np.where([e == 'NAAI' or e == 'NINA' for e in data[data.columns[-1]]])[0])
    # print(data.shape)
    # drop identifier / metadata columns by suffix
    data = data.drop(data.columns[data.columns.str.endswith('id')], 1)
    data = data.drop(data.columns[data.columns.str.endswith('token')], 1)
    data = (data.drop(data.columns[data.columns.str.endswith('ipaddr')], 1))
    data = (data.drop(data.columns[data.columns.str.endswith('date')], 1))
    data = (data.drop(data.columns[data.columns.str.endswith('stamp')], 1))
    #data = (data.drop(data.columns[data.columns.str.endswith('participant_code')],1))
    data = (data.drop(data.columns[data.columns.str.endswith('datLesao')], 1))
    data = (data.drop(data.columns[data.columns.str.endswith('datNasc')], 1))
    # data = (data.drop(data.columns[data.columns.str.endswith('Origem')],1))
    # data = (data.drop(data.columns[data.columns.str.endswith('Cidade')],1))
    # data = (data.drop(data.columns[data.columns.str.endswith('Estado')],1))
    # data = (data.drop(data.columns[data.columns.str.endswith('País')],1))
    # data = (data.drop(data.columns[data.columns.str.endswith('participant_code')],1))
    # data = (data.drop(data.columns[data.columns.str.endswith('Natural de')],1))
    #data = ((((data.T).drop_duplicates(keep='first')).dropna(how='all')).T)
    #dropping columns that are constant
    #data = data.loc[:,data.apply(pd.Series.nunique) != 1]
    ## data = pp.preprocess(data_path,class_name)
    n_samples = data.shape[0]
    n_features = data.shape[1]
    # ISO-like date, optionally followed by a HH:MM:SS time
    regex_date = re.compile(
        '(\d{4})-(\d{2})-(\d{2})\s?((\d{2}):(\d{2}):(\d{2}))?')
    # per-column treatment tag: 'date' | 'text' | 'float' | 'int' | 'bin'
    treatment = np.empty(n_features, dtype='U5')
    attributes = []
    categories = []
    transformedData = []
    index = 0
    si = 0
    print('Transforming data...')
    ### representing the categories with numbers
    for attribute in data.columns:
        # skip columns of the class questionnaire (except the class itself)
        if skip_class_questionnaire and class_questionnaire is not None and class_questionnaire in attribute and class_name not in attribute:
            index += 1
            continue
        # else:
        #     if class_questionnaire in attribute and 'Ombro' not in attribute:
        #         index+=1
        #         continue
        # factorize: t[0] = codes, t[1] = unique values (see comment below)
        t = pd.factorize(data[attribute].values, sort=True)
        #temp = t[0]
        # sniff the type from the first non-NaN cell
        i = utils.firstNotNan(data[attribute].values)
        try:
            result = regex_date.match(data[attribute].values[i])
            if (result):
                treatment[index] = 'date'
            elif (len(t[1]) > 0.9 * n_samples and len(t[1][0]) > 50):
                # almost-all-unique long strings => free text
                # if(attribute == 'participant_code'):
                #     temp = t[0]
                #     treatment[index] = 'int'
                # else:
                treatment[index] = 'text'
            else:
                if (utils.isfloat(data[attribute].values[i])):
                    # index+=1
                    # continue
                    #temp = [float(x) for x in t[0]]
                    treatment[index] = 'float'
                elif (not dummy):
                    if (transform_numeric or utils.isint(data[attribute].values[i])):
                        temp = t[0]
                        # if not utils.isint(data[attribute].values[i]):
                        #     index += 1
                        #     continue
                    else:
                        temp = data[attribute].values
                    treatment[index] = 'int'
                else:
                    treatment[index] = 'bin'
        except TypeError:
            # regex .match raised: the cell is not a string
            if (utils.isfloat(data[attribute].values[i])):
                # index+=1
                # continue
                temp = np.array(data[attribute].values).reshape(-1, 1)
                treatment[index] = 'float'
            elif (utils.isint(data[attribute][i])):
                # index+=1
                # continue
                temp = (np.array(data[attribute].values) * 1).reshape(-1, 1)
                treatment[index] = 'int'
            else:
                print("could not identify type of attribute %s" % attribute)
                exit(-1)
        #treatment of class attribute
        if (class_name in attribute):
            temp = t[0]
            treatment[index] = 'int'
        if (treatment[index] == 'float'):
            if (missing_input != 'none'):
                imp = preprocessing.Imputer(strategy=missing_input, axis=0)
                temp = imp.fit_transform(
                    X=np.array(data[attribute].values).reshape(-1, 1))
            else:
                temp = data[attribute].values
            #print(np.array(list((float(x) for x in temp))).reshape(-1,1).shape)
            transformedData.append(
                np.array(list((float(x) for x in temp))).reshape(-1, 1))
        else:
            # t[0] corresponds to the translated numeric data
            # t[1] corresponds to a list with the possible values for each feature'
            # (different values in a column, e.g. [sim, não]).
            # the index of that value in the list corresponds to its numeric representation
            # (e.g. [sim, não] -> sim is represented by 0 and não by 1).
            # if(missing_input != 'none' and treatment[index] != 'bin'):
            #     imp = preprocessing.Imputer(missing_values=-1,strategy=missing_input,axis=0)
            #     temp = imp.fit_transform(X=temp.reshape(-1,1))
            if (treatment[index] == 'bin'):
                #imp = preprocessing.Imputer(missing_values=-1,strategy='mean',axis=0)
                #temp = imp.fit_transform(X=np.array(temp).reshape(-1,1))
                # one dummy 0/1 column per distinct value
                temp = pd.get_dummies(np.ravel(data[attribute].values))
                for x in temp.columns:
                    attributes.append(attribute + '=' + x)
                    #print(temp[x].reshape(-1,1).shape)
                    transformedData.append(temp[x].reshape(-1, 1))
            elif (treatment[index] == 'int'):
                if (not transform_numeric):
                    # keep raw values, but normalise string case
                    temp = data[attribute].values
                    for temp_index in range(len(temp)):
                        if (isinstance(temp[temp_index], str)):
                            temp[temp_index] = temp[temp_index].upper()
                i = utils.firstNotNan(data[attribute].values)
                if (utils.isint(data[attribute].values[i]) and missing_input != 'none'):
                    # mark coded missing answers with -1 before imputing
                    temp[data[attribute].values == 'NAAI'] = -1
                    temp[np.isnan(
                        np.array(data[attribute].values, dtype=float))] = -1
                    imp = preprocessing.Imputer(missing_values=-1,
                                                strategy=missing_input,
                                                axis=0)
                    temp = imp.fit_transform(
                        X=np.array(list(int(x) for x in temp)).reshape(-1, 1))
                elif (missing_input != 'none'):
                    imp = preprocessing.Imputer(missing_values=np.nan,
                                                strategy=missing_input,
                                                axis=0)
                    temp = imp.fit_transform(X=np.array(temp).reshape(-1, 1))
                #print(np.array(temp).reshape(-1,1).shape)
                transformedData.append(np.array(temp).reshape(-1, 1))
            elif (treatment[index] == 'date'):
                # keep only the year; -1 marks missing (float NaN) cells
                temp = []
                for date in data[attribute].values:
                    if (not isinstance(date, float)):
                        temp.append(int(date[:4]))
                    else:
                        temp.append(-1)
                if (missing_input != 'none'):
                    imp = preprocessing.Imputer(strategy='most_frequent', axis=0)
                    temp = imp.fit_transform(X=np.array(temp).reshape(-1, 1))
                #print(np.array(temp).reshape(-1,1).shape)
                transformedData.append(np.array(temp).reshape(-1, 1))
            elif (use_text and treatment[index] == 'text'):
                #try:
                # expand free text into n-gram count columns
                bigword = ''
                #print(attribute)
                try:
                    bag_of_words = CountVectorizer(min_df=0.25,
                                                   stop_words=sw,
                                                   ngram_range=(1, 4))
                    #print(data[attribute])
                    words = np.array(
                        bag_of_words.fit_transform(
                            ((data[attribute].values))).todense())
                    c = 0
                    for word in bag_of_words.get_feature_names():
                        bigword = bigword + word + ' '
                        attributes.append(attribute + ' termo: ' + word)
                        transformedData.append(words[:, c].reshape(-1, 1))
                        c += 1
                    # wordcloud = WordCloud(stopwords=sw,background_color='white').generate(bigword,)
                    # plt.imshow(wordcloud)
                    # plt.axis('off')
                    # plt.show()
                except (ValueError, AttributeError):
                    index += 1
                    continue
            else:
                index += 1
                continue
            # else:
            #     print('undefined option for pre processing: (%s, %s) ' % (categ) )
            #     exit(-1)
        categories.append(t[1])
        # text/bin columns already appended their own expanded names above
        if (treatment[index] != 'text' and treatment[index] != 'bin'):
            attributes.append(attribute)
        index += 1
    # stack all transformed columns back into a (n_samples x n_cols) frame
    data = np.array(transformedData).reshape(-1, n_samples).T
    data = pd.DataFrame(data, columns=attributes)
    # pd.DataFrame(data,columns=attributes).to_csv('out.csv', index=False)
    # f = open('DorR.csv', 'w')
    # f.write(','.join(np.array(attributes, dtype=object)))
    # for d in data:
    #     f.write('\n')
    #     f.write(','.join(str(dd) for dd in d))
    # exit()
    return data
def write_database(self):
    """Persist the wizard's collected progress to the database.

    Creates SignificanceEntry rows from the statistics cells, the plant
    and harvest Date rows for the entered year, creates/updates one
    TrialEntry per (location, planting method, variety, harvest date),
    links significance entries to their trials, and records what was
    submitted back into self.progress.
    """
    # create SignificanceEntry
    significance_entries = []
    name_sigentry = {}  # header name -> list of its SignificanceEntry rows
    statistics = self.progress["statistics"]
    # TODO: BUG: if a location has no entries, but has stat entries,
    # TODO: it will not show up in the preview but will be written here
    # TODO: (kinda harmless though, since it won't have any trials mapped to it)
    for name in statistics:
        if not statistics[name]:
            continue
        fieldname = statistics[name]["fieldname"]
        alpha = statistics[name]["alpha"]
        if not alpha:
            alpha = None
        comparing = statistics[name]["comparing"]
        try:
            cells = self.progress["cells"][name]
        except KeyError:
            cells = []
        for header, value in cells:
            # only numeric, non-empty cells become entries
            if not value or not isfloat(value):
                continue
            sigentry = models.SignificanceEntry(comparing=comparing,
                                                method=fieldname,
                                                alpha=alpha,
                                                value=value)
            significance_entries.append(sigentry)
            try:
                name_sigentry[header].append(sigentry)
            except KeyError:
                name_sigentry[header] = [sigentry]
    for sigentry in significance_entries:
        sigentry.save()
    # create Date: planting assumed May 1st, harvest August 1st
    year = int(self.progress["date"])
    date_pd = datetime.date(year=year, month=5, day=1)
    date_hd = datetime.date(year=year, month=8, day=1)
    try:
        plant_date = models.Date.objects.get(date=date_pd)
    except:
        plant_date = models.Date(date=date_pd)
        plant_date.save()
    try:
        harvest_date = models.Date.objects.get(date=date_hd)
    except:
        harvest_date = models.Date(date=date_hd)
        harvest_date.save()
    # create/update TrialEntry objects
    entries = []
    lpvh_entries = {}  # (location, planting, variety, harvest) -> TrialEntry
    location_names = []
    variety_names = []
    planting_methods = []
    for trial in models.TrialEntry.objects.filter(harvest_date=harvest_date):
        lpvh = (trial.location.pk, trial.planting_method_tags.pk,
                trial.variety.pk, trial.harvest_date.pk)
        lpvh_entries[lpvh] = trial
    for entry in self.progress["trial_entries"]:
        location = entry["location"]["pk"]
        # NOTE(review): this rebinds the planting_methods list above to the
        # entry's value, yet .append() is called on it further down —
        # verify the stored value supports that (variable-reuse smell).
        planting_methods = entry["location"]["planting_methods"]
        variety = entry["variety"]["pk"]
        lpvh = (location, planting_methods, variety, harvest_date.pk)
        try:
            trial = lpvh_entries[lpvh]
        except KeyError:
            trial = models.TrialEntry(
                location_id=location,
                variety_id=variety,
                planting_method_tags_id=planting_methods,
                harvest_date=harvest_date,
                plant_date=plant_date,
            )
            lpvh_entries[lpvh] = trial
        measure = entry["measure"]["fieldname"]
        value = entry["measure"]["value"]
        if value is None:
            continue
        setattr(trial, measure, value)
        entries.append(trial)
        trial.save()
        # grab info for a later requery of these trials
        location_names.append(trial.location.name)
        variety_names.append(trial.variety.name)
        planting_methods.append(trial.planting_method_tags.planting_method)
    # attach TrialEntry objects to SignificanceEntry objects
    for name, sigentries in name_sigentry.items():
        # name is a user-entered string, lookup in our mappings
        # TODO: assume name is a location for now
        location = self.progress["locations"][name]["pk"]
        planting_methods = self.progress["locations"][name]["planting_methods"]
        # TODO: avoid the following database call
        trials = list(
            models.TrialEntry.objects.filter(location_id=location)
            .filter(planting_method_tags_id=planting_methods)
            .filter(harvest_date=harvest_date)
        )
        for sigentry in sigentries:
            sigentry.trials.add(*trials)
            sigentry.save()
    # save TrialEntry query for later editing
    self.progress["submitted"]["locations"] = list(set(location_names))
    self.progress["submitted"]["varieties"] = list(set(variety_names))
    self.progress["submitted"]["planting_methods"] = list(set(planting_methods))
    self.progress["submitted"]["harvest_date"] = {
        "year": harvest_date.date.year,
        "month": harvest_date.date.month,
        "day": harvest_date.date.day,
    }
    self.save()
    self.model.submitted = True
    self.model.save()
def prepare_table(self):
    """Build preview tables from the wizard's collected ``self.progress``.

    Returns a tuple ``(year, table, summary, prettystatnames, locations,
    varieties)`` where ``table`` is indexed [variety][location][measure]
    and ``summary`` holds one row per statistic across locations.

    NOTE: the tuple-parameter lambdas below (``lambda (ln, vn, mn): ...``)
    are Python 2-only syntax; this module cannot run under Python 3 as is.
    """
    ## collect entered data
    trial_entries = {}
    summary_entries = {}
    locations = []
    varieties = []
    measures = []

    # locations, varieties, measures
    lnames = self.progress["headers"][HeaderTypes.Locations]
    vnames = self.progress["headers"][HeaderTypes.Varieties]
    mnames = self.progress["headers"][HeaderTypes.Measures]
    # With two or more header lists missing we leave everything untouched;
    # only when at most one is missing is it replaced by the [None] wildcard.
    if not lnames and not vnames and not mnames:
        pass
    elif not lnames and not vnames:
        pass
    elif not vnames and not mnames:
        pass
    elif not lnames and not mnames:
        pass
    else:
        if not lnames:
            lnames = [None]
        if not vnames:
            vnames = [None]
        if not mnames:
            mnames = [None]

    # statistics
    snames = self.progress["headers"][HeaderTypes.Statistics]
    if not snames:
        snames = []

    # define get_cell
    row_type = self.progress["headers"]["rows"]
    col_type = self.progress["headers"]["columns"]
    if row_type == HeaderTypes.Locations:
        get_row = lambda (ln, vn, mn): ln
    elif row_type == HeaderTypes.Varieties:
        get_row = lambda (ln, vn, mn): vn
    elif row_type == HeaderTypes.Measures:
        get_row = lambda (ln, vn, mn): mn
    else:  # HeaderTypes.Data
        get_row = lambda (ln, vn, mn): None
    if col_type == HeaderTypes.Locations:
        get_col = lambda (ln, vn, mn): ln
    elif col_type == HeaderTypes.Varieties:
        get_col = lambda (ln, vn, mn): vn
    elif col_type == HeaderTypes.Measures:
        get_col = lambda (ln, vn, mn): mn
    else:  # HeaderTypes.Data
        get_col = lambda (ln, vn, mn): None

    def get_cell(ln, vn, mn):
        # Map a (location, variety, measure) triple onto the sheet's
        # row/column orientation and return the matching (col, value) pair,
        # or (None, None) if absent.
        row = get_row((ln, vn, mn))
        col = get_col((ln, vn, mn))
        cell = (None, None)
        if row is None:
            # data-on-rows layout: swap so the lookup key is never None
            tmp = col
            col = row
            row = tmp
        try:
            rows = self.progress["cells"][row]
        except KeyError:
            rows = []
        for col_val in rows:
            if col_val[0] == col:
                cell = col_val
                break
        return cell

    # collect entries
    for lname in lnames:
        location = self.progress["locations"][lname]
        location_name = "{}-{}".format(location["name"],
                                       location["planting_methods_text"])
        locations.append(location_name)
        summary_entries[lname] = {}
        for vname in vnames:
            variety = self.progress["varieties"][vname]
            variety_name = variety["name"]
            varieties.append(variety_name)
            for mname in mnames:
                measure = self.progress["measures"][mname].copy()
                measure_name = measure["fieldname"]
                measures.append(measure_name)
                val = get_cell(lname, vname, mname)[1]
                # non-numeric cells are treated as missing
                if not isfloat(val):
                    val = None
                measure["value"] = val
                entry = {"location": location,
                         "variety": variety,
                         "measure": measure}
                trial_entries[(location_name, variety_name, measure_name)] = entry
                for sname in snames:
                    cells = self.progress["cells"][sname]
                    cell = (None, None)
                    # NOTE(review): ``header`` is only bound when a cell
                    # matches; if none does, the assignment below raises
                    # NameError (or silently reuses a stale header) — and a
                    # vname/mname header is not a key of summary_entries.
                    for col_val in cells:
                        if col_val[0] == lname:
                            cell = col_val
                            header = lname
                            break
                        elif col_val[0] == vname:
                            cell = col_val
                            header = vname
                            break
                        elif col_val[0] == mname:
                            cell = col_val
                            header = mname
                            break
                    sentry = {"value": cell[1]}
                    summary_entries[header][sname] = sentry
                # collect summary_entries
                # NOTE(review): dead code — this loop iterates an empty list
                # (and ``sval`` is undefined), so it can never execute.
                for name in []:
                    stats = {}
                    stat = {}
                    stat["value"] = sval
                    stats[name] = stat
                    summary_entries[location_name] = stats

    ## construct main table
    locations = sorted(list(set(locations)))
    varieties = sorted(list(set(varieties)))
    measures = sorted(list(set(measures)))
    table = []
    for v in varieties:
        row = []
        for l in locations:
            cell = []
            for m in measures:
                try:
                    item = trial_entries[(l, v, m)]
                except KeyError:
                    item = None
                cell.append(item)
            row.append(cell)
        table.append(row)

    ## construct summary table (statistic entries)
    summary = []
    prettystatnames = []
    statistics = self.progress["statistics"]
    havestats = True
    if None in statistics and not statistics[None]:
        havestats = False
    if havestats:
        statmap = {}  # pretty display name -> user-entered stat name
        statnames = []
        # make a mapping from pretty names to user-input names
        for name in statistics:
            pretty = "{} ({})".format(statistics[name]["fieldname"],
                                      statistics[name]["alpha"])
            statmap[pretty] = name
        # sort pretty names
        prettystatnames = sorted(statmap.keys())
        # ensure we iterate using the ordering we just made over pretty names
        for name in prettystatnames:
            statnames.append(statmap[name])
        for name in statnames:
            row = []
            for l in lnames:
                # only show stats that compare across locations
                try:
                    val = summary_entries[l][name]["value"]
                except KeyError:
                    val = None
                row.append({"value": val,
                            "comparing": statistics[name]["comparing"]})
            summary.append(row)

    ## bundle it up and send it along
    year = self.progress["date"]
    self.progress["trial_entries"] = trial_entries.values()
    # self.progress['table'] = table
    # self.progress['summary'] = summary
    self.save()
    return year, table, summary, prettystatnames, locations, varieties
def swap_tracker_json(trmnt, user_id):
    """Build the swap-tracker JSON for one user in one tournament.

    Groups the user's outgoing swaps by recipient, computes the amounts owed
    each way for agreed swaps, and returns a dict with the tournament, the
    user's buy-in, one entry per recipient, and the net ``final_profit``.
    Returns an error dict instead when a swap exists without a buy-in.
    """
    my_buyin = Buy_ins.get_latest(user_id=user_id, tournament_id=trmnt.id)
    final_profit = 0
    swaps = Swaps.query.filter_by(sender_id=user_id, tournament_id=trmnt.id)

    # separate swaps by recipient id
    swaps_by_recipient = {}
    for swap in swaps:
        rec_id = str(swap.recipient_id)
        data = swaps_by_recipient.get(rec_id, [])
        swaps_by_recipient[rec_id] = [*data, swap]

    # Loop through swaps to create swap tracker json and append to 'swaps_buyins'
    swaps_buyins = []
    # NOTE: the loop variable ``swaps`` deliberately shadows the query above.
    for rec_id, swaps in swaps_by_recipient.items():
        recipient_buyin = Buy_ins.get_latest(user_id=rec_id,
                                             tournament_id=trmnt.id)

        # Catch ERRORS
        if recipient_buyin is None or my_buyin is None:
            swap_data_for_error_message = [{
                'recipient_name': f'{x.recipient_user.first_name} {x.recipient_user.last_name}',
                'sender_name': f'{x.sender_user.first_name} {x.sender_user.last_name}',
                'tournament_name': x.tournament.name
            } for x in swaps]
            if recipient_buyin is None:
                return {
                    'ERROR': 'Recipient has swaps with user in this tournament but no buy-in',
                    'recipient buyin': None,
                    'swaps with user': swap_data_for_error_message
                }
            if my_buyin is None:
                return {
                    'ERROR': 'User has swaps in this tournament but no buy-in',
                    'buyin': None,
                    'user swaps': swap_data_for_error_message
                }

        # Structure and fill most properties for swap tracker json
        recipient_user = Profiles.query.get(rec_id)
        data = {
            'recipient_user': recipient_user.serialize(),
            'recipient_buyin': recipient_buyin.serialize(),
            'their_place': recipient_buyin.place,
            'you_won': my_buyin.winnings if my_buyin.winnings else 0,
            'they_won': recipient_buyin.winnings if recipient_buyin.winnings else 0,
            'available_percentage': recipient_user.available_percentage(trmnt.id),
            'agreed_swaps': [],
            'other_swaps': []
        }

        # Fill in properties: 'agreed_swaps' and 'other_swaps' lists
        you_owe_total = 0
        they_owe_total = 0
        for swap in swaps:
            single_swap_data = {
                **swap.serialize(),
                'counter_percentage': swap.counter_swap.percentage,
                'they_paid': swap.counter_swap.paid,
                'they_confirmed': swap.counter_swap.confirmed,
            }
            if swap.status._value_ == 'agreed':
                # Owed amounts are a percentage of each side's winnings;
                # non-numeric winnings count as zero.
                you_owe = (float(my_buyin.winnings) * swap.percentage / 100) \
                    if isfloat(my_buyin.winnings) else 0
                they_owe = (float(recipient_buyin.winnings) * swap.counter_swap.percentage / 100) \
                    if isfloat(recipient_buyin.winnings) else 0
                you_owe_total += you_owe
                they_owe_total += they_owe
                data['agreed_swaps'].append({
                    **single_swap_data,
                    'you_owe': you_owe,
                    'they_owe': they_owe
                })
            else:
                data['other_swaps'].append(single_swap_data)

        # Fill in final properties
        data['you_owe_total'] = you_owe_total
        data['they_owe_total'] = they_owe_total
        final_profit -= you_owe_total
        final_profit += they_owe_total

        # IF user doesn't owe anything to other guy,
        # make all agreed swaps paid for themselves
        # if final_profit >= 0:
        #     for swap in data['agreed_swaps']:
        #         # swap['paid'] = True
        #         the_swap = Swaps.query.get( swap.id)
        #         print('the_swap', the_swap)
        #         the_swap.paid = True
        #         the_swap.paid_at = datetime.utcnow()
        #         the_swap.confirmed = True
        #         the_swap.confirmed_at = datetime.utcnow()

        # Append json
        swaps_buyins.append(data)

    return {
        'tournament': trmnt.serialize(),
        # my_buyin may legitimately be None when the user has no swaps at all
        'my_buyin': my_buyin and my_buyin.serialize(),
        'buyins': swaps_buyins,
        'final_profit': final_profit
    }
def test_isfloat(self): true = utils.isfloat("123.45") false = utils.isfloat("hey") self.assertTrue(true) self.assertFalse(false)
def get_tournaments(user_id, id): if id == 'all': now = datetime.utcnow() - timedelta(days=1) # Filter past tournaments if request.args.get('history') == 'true': trmnts = Tournaments.get_history() # Filter current and future tournaments else: trmnts = Tournaments.get_live_upcoming() # Filter by name name = request.args.get('name') if name is not None: trmnts = trmnts.filter(Tournaments.name.ilike(f'%{name}%')) # Order by zip code zip = request.args.get('zip', '') if zip.isnumeric(): path = os.environ['APP_PATH'] with open(path + '/src/zip_codes.json') as zip_file: data = json.load(zip_file) zipcode = data.get(zip) if zipcode is None: raise APIException('Zipcode not in file', 500) lat = zipcode['latitude'] lon = zipcode['longitude'] # Order by user location else: lat = request.args.get('lat', '') lon = request.args.get('lon', '') if isfloat(lat) and isfloat(lon): trmnts = trmnts.order_by( (db.func.abs(float(lon) - Tournaments.longitude) + db.func.abs(float(lat) - Tournaments.latitude)).asc()) # Order by ascending date elif request.args.get('asc') == 'true': trmnts = trmnts.order_by(Tournaments.start_at.asc()) # Order by descending date elif request.args.get('desc') == 'true': trmnts = trmnts.order_by(Tournaments.start_at.desc()) # Pagination offset, limit = utils.resolve_pagination(request.args) trmnts = trmnts.offset(offset).limit(limit) return jsonify([ actions.swap_tracker_json(trmnt, user_id) for trmnt in trmnts ]), 200 # Single tournament by id elif id.isnumeric(): trmnt = Tournaments.query.get(int(id)) if trmnt is None: raise APIException('Tournament not found', 404) return jsonify(actions.swap_tracker_json(trmnt, user_id)), 200 raise APIException('Invalid id', 400)
def get_tournaments(user_id, id):
    """List flights (``id == 'all'``) or return one tournament by id.

    Flights can be filtered by history/name, ordered by date, and — when a
    zip code or lat/lon is supplied — ordered by distance to the caller.
    Each serialized flight is augmented with its tournament's address fields
    and a ``buy_in`` flag for the requesting user.
    """
    # List Flights
    if id == 'all':
        # Order by date: ascending or descending
        order_method = None
        if request.args.get('asc') == 'true':
            order_method = Flights.start_at.asc()
        elif request.args.get('desc') == 'true':
            order_method = Flights.start_at.desc()
        # Filter past flights and order by default asc
        if request.args.get('history') == 'true':
            flights = Flights.get(history=True)
            flights = flights.order_by(Flights.start_at.desc(
            ) if order_method is None else order_method)
        # Filter current and future flights and order by default desc
        else:
            flights = Flights.get(history=False)
            flights = flights.order_by(Flights.start_at.asc(
            ) if order_method is None else order_method)
        # Filter by name
        name = request.args.get('name')
        if name is not None:
            flights = flights.filter(
                Flights.tournament.has(Tournaments.name.ilike(f'%{name}%')))
        # Get zip code LAT LON
        zip = request.args.get('zip', '')
        if zip.isnumeric():
            path = os.environ['APP_PATH']
            with open(path + '/src/zip_codes.json') as zip_file:
                data = json.load(zip_file)
                zipcode = data.get(zip)
                if zipcode is None:
                    raise APIException('Zipcode not in file', 500)
                lat = zipcode['latitude']
                lon = zipcode['longitude']
        # Get user LAT LON
        else:
            lat = request.args.get('lat', '')
            lon = request.args.get('lon', '')
        # Order flights by distance, whithin the day
        if isfloat(lat) and isfloat(lon):
            # Materialize into dicts so the computed distance can be sorted
            # on and returned alongside each flight.
            flights = [{
                'flight': f,
                'distance': utils.distance(origin=[float(lat), float(lon)],
                                           destination=[
                                               f.tournament.latitude,
                                               f.tournament.longitude
                                           ])
            } for f in flights]
            flights = sorted(flights, key=cmp_to_key(utils.sort_by_location))
            # Pagination
            offset, limit = utils.resolve_pagination(request.args)
            flights = flights[offset:offset + limit]
            return jsonify([{
                **x['flight'].serialize(),
                'casino': x['flight'].tournament.casino,
                'address': x['flight'].tournament.address,
                'city': x['flight'].tournament.city,
                'state': x['flight'].tournament.state,
                'zip_code': x['flight'].tournament.zip_code,
                'buy_in': Buy_ins.get_latest(
                    user_id, x['flight'].tournament_id) is not None,
                'distance': x['distance']
            } for x in flights]), 200
        else:
            # Pagination
            offset, limit = utils.resolve_pagination(request.args)
            flights = flights.offset(offset).limit(limit)
            return jsonify([{
                **f.serialize(),
                'casino': f.tournament.casino,
                'address': f.tournament.address,
                'city': f.tournament.city,
                'state': f.tournament.state,
                'zip_code': f.tournament.zip_code,
                'buy_in': Buy_ins.get_latest(user_id,
                                             f.tournament_id) is not None
            } for f in flights]), 200
    # Single tournament by id
    elif id.isnumeric():
        trmnt = Tournaments.query.get(int(id))
        if trmnt is None:
            raise APIException('Tournament not found', 404)
        return jsonify(actions.swap_tracker_json(trmnt, user_id)), 200
    raise APIException('Invalid id', 400)
def on_main_button_click(self, sender, sender_name):
    """Apply the dialog's millisecond shift(s) to the selected subtitles.

    Bails out (destroying the dialog) on Cancel or on invalid/missing
    inputs; otherwise shifts start and/or stop times per ``self.res``,
    recording each change in ``self.changeList`` for undo.
    """
    if sender_name != 'OK':
        self.destroy()
        return
    # All required inputs must be present
    if self.res['applyToSubs'] is None or self.res[
            'milliseconds1'] is None or self.res[
                'operation1'] is None or self.res['applyToTime1'] is None:
        self.destroy()
        return
    # Start-Time mode optionally shifts stop times too; if a second
    # operation was chosen its amount must be numeric.
    if self.res['applyToTime1'] == 'Start Time' and (
            self.res['operation2'] is not None
            and not isfloat(self.res['milliseconds2'])):
        self.destroy()
        return
    if not isfloat(self.res['milliseconds1']):
        self.destroy()
        return

    # Create list of subtitles to apply changes
    applyItems = []
    if self.res['applyToSubs'] == 'all lines':
        for item in self.subtitleModel.get_model():
            applyItems.append(item[0])
    else:
        self.get_tv_selection()
        if len(self.tvSelectionList) == 0:
            self.destroy()
            return
        applyItems = self.tvSelectionList[:]

    # Signed shift amounts ('add' is positive, anything else negative).
    # NOTE(review): values validated with isfloat are converted with int();
    # a fractional entry like "1.5" would raise ValueError here — confirm
    # isfloat's semantics in this project.
    if self.res['milliseconds1'] is not None:
        milliseconds1 = int(self.res['milliseconds1']) * (
            1 if self.res['operation1'] == 'add' else -1)
    else:
        milliseconds1 = 0
    if self.res['milliseconds2'] is not None:
        # NOTE(review): the sign uses operation1, not operation2 — looks
        # like a copy/paste slip; confirm intended behavior.
        milliseconds2 = int(self.res['milliseconds2']) * (
            1 if self.res['operation1'] == 'add' else -1)
    else:
        milliseconds2 = 0

    # NOTE(review): operation1Items is built (and reversed for positive
    # shifts) but the loop below iterates applyItems — confirm which list
    # was meant to be walked.
    operation1Items = applyItems[:]
    if milliseconds1 > 0:
        operation1Items.reverse()
    for item in applyItems:
        newBound = [None, None]
        if self.res['applyToTime1'] == 'Start Time':
            newBound[0] = int(item.startTime) + milliseconds1
            newBound[1] = int(item.stopTime) + milliseconds2
        elif self.res['applyToTime1'] == 'Stop Time':
            newBound[0] = int(item.startTime)
            newBound[1] = int(item.stopTime) + milliseconds1
        elif self.res['applyToTime1'] == 'Both Start and Stop Times':
            newBound[0] = int(item.startTime) + milliseconds1
            newBound[1] = int(item.stopTime) + milliseconds1
        else:
            continue
        nextSub = self.subtitleModel.get_next(item)
        prevSub = self.subtitleModel.get_prev(item)
        # never move a subtitle before time zero
        if newBound[0] < 0:
            return
        # keep a 120 ms gap from the neighbouring subtitles
        if nextSub is not None and newBound[1] > nextSub.startTime - 120:
            continue
        if prevSub is not None and newBound[0] < prevSub.stopTime + 120:
            continue
        # reject inverted or shorter-than-1s spans
        if newBound[0] > newBound[1] or newBound[1] - newBound[0] < 1000:
            continue
        # remember (item, old start, old stop, new start, new stop) for undo
        self.changeList.append(
            (item, int(item.startTime), int(item.stopTime), int(newBound[0]),
             int(newBound[1])))
        item.startTime = int(newBound[0])
        item.stopTime = int(newBound[1])
    self.destroy()
def plot_feature_contributions_surgery_class(X, y, feature_index, fcs,
                                             attributes, class_of_interest,
                                             title=None):
    """Scatter-plot per-instance contributions of one feature, split by
    surgery status (column 'Q44071_snCplexoAt').

    Marker shape encodes surgery status ('o' = surgery 'S'/'Y', 'x' = no
    surgery, 'd' = missing); edge color encodes whether the instance's label
    equals ``class_of_interest`` (blue) or not (red). Categorical features
    are plotted against the index of each distinct value, numeric features
    against the raw value with a synthetic NaN slot above the maximum.

    NOTE(review): when ``title`` is given, the text dump at the end reopens
    the same path in 'w' mode and overwrites the image written by
    ``plt.savefig(title)``; the handle is also never closed — confirm.
    """
    surgery_index = np.where(attributes == 'Q44071_snCplexoAt')[0][0]
    # Categorical branch: the first non-NaN value is neither int- nor
    # float-like.
    if (not utils.isint(X[utils.firstNotNan(
            X[:, feature_index])][feature_index]) and not utils.isfloat(
                X[utils.firstNotNan(X[:, feature_index])][feature_index])):
        values = [i for i in set(X[:, feature_index]) if not utils.isnan(i)
                  ] + [np.nan]
        x_surgery = []
        surgery_colors = []
        x_no_surgery = []
        no_surgery_colors = []
        x_nan = []
        nan_colors = []
        y_surgery = []
        y_no_surgery = []
        y_nan = []
        contributions = {}  # unused here; population code is commented out below
        for i in range(X.shape[0]):
            if (feature_index in fcs[i].keys()):
                if (X[i][surgery_index] == 'S' or X[i][surgery_index] == 'Y'):
                    x_surgery.append(fcs[i][feature_index][class_of_interest])
                    y_surgery.append(values.index(X[i][feature_index]))
                    if (y[i] == class_of_interest):
                        surgery_colors.append('blue')
                    else:
                        surgery_colors.append('red')
                elif (utils.isnan(X[i][surgery_index])):
                    x_nan.append(fcs[i][feature_index][class_of_interest])
                    #this is necessary because of weird behavior when X[i][feature_index] is nan
                    #and for some reason it says that nan is not values
                    y_nan.append(len(values) - 1)
                    if (y[i] == class_of_interest):
                        nan_colors.append('blue')
                    else:
                        nan_colors.append('red')
                else:
                    x_no_surgery.append(
                        fcs[i][feature_index][class_of_interest])
                    y_no_surgery.append(values.index(X[i][feature_index]))
                    if (y[i] == class_of_interest):
                        no_surgery_colors.append('blue')
                    else:
                        no_surgery_colors.append('red')
        # if(X[i][feature_index] not in contributions.keys()):
        #     contributions[X[i][feature_index]] = [fcs[i][feature_index][class_of_interest]]
        # else:
        #     contributions[X[i][feature_index]].append(fcs[i][feature_index][class_of_interest])
        coi = str(class_of_interest)
        ax = plt.subplot(111)
        ax.scatter(x_surgery,
                   y_surgery,
                   marker='o',
                   s=60,
                   edgecolors=surgery_colors,
                   facecolors='none')
        ax.scatter(x_no_surgery,
                   y_no_surgery,
                   marker='x',
                   s=60,
                   edgecolors=no_surgery_colors,
                   facecolors='none')
        ax.scatter(x_nan,
                   y_nan,
                   marker='d',
                   s=60,
                   edgecolors=nan_colors,
                   facecolors='none')
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        # pad the y axis with an empty tick at each end
        ax.set_yticks(np.array(range(len(values) + 2)) - 1)
        ax.set_yticklabels([str('')] + values + [str('')])
        red_patch = mpatches.Patch(color='red')
        blue_patch = mpatches.Patch(color='blue')
        xmarker = mlines.Line2D([], [],
                                color='black',
                                marker='x',
                                markersize=10,
                                linestyle='None')
        omarker = mlines.Line2D([], [],
                                color='black',
                                marker='o',
                                markersize=10,
                                linestyle='None',
                                markerfacecolor='None',
                                markeredgecolor='black')
        #plt.legend(handles=[red_patch,blue_patch])
        plt.legend([red_patch, blue_patch, xmarker, omarker], [
            'Classe da instância ≠ ' + coi, 'Classe da instância = ' + coi,
            'Não passou por cirurgia', 'Passou por cirurgia'
        ],
                   numpoints=1,
                   fontsize='small')
        plt.show()
    else:
        # Numeric branch: plot raw values; NaNs are drawn one "nan_index"
        # step above the maximum observed value.
        values = sorted([
            round(i, 4) for i in (set(X[:, feature_index]))
            if not utils.isnan(i)
        ])  # + [np.nan]
        print(values)
        nan_index = values[-1] - values[-2]
        x_surgery = []
        surgery_colors = []
        x_no_surgery = []
        no_surgery_colors = []
        x_nan = []
        nan_colors = []
        y_surgery = []
        y_no_surgery = []
        y_nan = []
        for i in range(X.shape[0]):
            if (feature_index in fcs[i].keys()):
                if (X[i][surgery_index] == 'S' or X[i][surgery_index] == 'Y'):
                    x_surgery.append(fcs[i][feature_index][class_of_interest])
                    y_surgery.append((X[i][feature_index]))
                    if (y[i] == class_of_interest):
                        surgery_colors.append('blue')
                    else:
                        surgery_colors.append('red')
                elif (utils.isnan(X[i][surgery_index])):
                    x_nan.append(fcs[i][feature_index][class_of_interest])
                    #this is necessary because of weird behavior when X[i][feature_index] is nan
                    #and for some reason it says that nan is not values
                    y_nan.append(values[-1] + nan_index)
                    if (y[i] == class_of_interest):
                        nan_colors.append('blue')
                    else:
                        nan_colors.append('red')
                else:
                    x_no_surgery.append(
                        fcs[i][feature_index][class_of_interest])
                    y_no_surgery.append((X[i][feature_index]))
                    if (y[i] == class_of_interest):
                        no_surgery_colors.append('blue')
                    else:
                        no_surgery_colors.append('red')
        coi = str(class_of_interest)
        fig, ax = plt.subplots()
        ax.scatter(x_surgery,
                   y_surgery,
                   marker='o',
                   s=60,
                   facecolors='none',
                   edgecolors=surgery_colors)
        ax.scatter(x_no_surgery,
                   y_no_surgery,
                   marker='x',
                   s=60,
                   edgecolors=no_surgery_colors)
        ax.scatter(x_nan,
                   y_nan,
                   marker='d',
                   s=60,
                   facecolors='none',
                   edgecolors=nan_colors)
        fig.canvas.draw()
        labels = [''] + [item.get_text()
                         for item in ax.get_yticklabels()] + ['']
        # extend the tick range so the synthetic NaN slot is visible
        if (values[-1] + nan_index < ax.get_yticks()[-1]):
            plt.yticks(
                [values[0] - nan_index] +
                sorted(list(ax.get_yticks()) + [values[-1] + nan_index]))
        else:
            plt.yticks([values[0] - nan_index] + sorted(
                list(ax.get_yticks()) +
                [values[-1] + nan_index, values[-1] + 2 * nan_index]))
        labels[-2] = 'nan'
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        ax.set_yticklabels(labels)
        red_patch = mpatches.Patch(color='red')
        blue_patch = mpatches.Patch(color='blue')
        xmarker = mlines.Line2D([], [],
                                color='black',
                                marker='x',
                                markersize=10,
                                label='Bla',
                                linestyle='None')
        omarker = mlines.Line2D([], [],
                                color='black',
                                marker='o',
                                markersize=10,
                                label='Bla',
                                linestyle='None',
                                markerfacecolor='None',
                                markeredgecolor='black')
        #plt.legend(handles=[red_patch,blue_patch])
        plt.legend([red_patch, blue_patch, xmarker, omarker], [
            'Classe da instância ≠ ' + coi, 'Classe da instância = ' + coi,
            'Não passou por cirurgia', 'Passou por cirurgia'
        ],
                   numpoints=1,
                   fontsize='small')
        plt.show()
    if (title is not None):
        plt.savefig(title)
        plt.close()
        # NOTE(review): this reopens ``title`` and overwrites the figure
        # saved above with a plain-text dump of the plotted data.
        f = open(title, 'w')
        f.write('X=' + str(X))
        f.write('\ny=' + str(y))
        f.write('\nfcs=' + str(fcs))
        f.write('\nfeatures=' + str(attributes))
        f.write('\nfeature_index=' + str(feature_index))
        f.write('\nvalues=' + str(values))
        f.write('\nx_surgery=' + str(x_surgery))
        f.write('\ny_surgery=' + str(y_surgery))
        f.write('\nsurgery_colors=' + str(surgery_colors))
        f.write('\nx_no_surgery=' + str(x_no_surgery))
        f.write('\ny_no_surgery=' + str(y_no_surgery))
        f.write('\nno_surgery_colors=' + str(no_surgery_colors))
        f.write('\nx_nan=' + str(x_nan))
        f.write('\ny_nan=' + str(y_nan))
        f.write('\nnan_colors=' + str(nan_colors))
def plot_feature_contributions(X, feature_index, fcs, attributes,
                               class_of_interest, title=None):
    """Scatter-plot per-instance contributions of one feature toward
    ``class_of_interest``, colored by sign (blue > 0, red < 0, black == 0).

    Also prints the mean/spread of contributions per feature value and
    renders a boxplot of the same distributions. Categorical features are
    plotted against value indices, numeric ones against raw values with a
    synthetic NaN slot above the maximum observed value.
    """
    # Categorical branch: first non-NaN value is neither int- nor float-like.
    if (not utils.isint(X[utils.firstNotNan(
            X[:, feature_index])][feature_index]) and not utils.isfloat(
                X[utils.firstNotNan(X[:, feature_index])][feature_index])):
        values = [i for i in set(X[:, feature_index]) if not utils.isnan(i)
                  ] + [np.nan]
        pos_fcs = []
        neg_fcs = []
        pos_values = []
        neg_values = []
        zero_fcs = []
        zero_values = []
        contributions = {}  # feature value -> list of contributions
        for i in range(X.shape[0]):
            if (feature_index in fcs[i].keys()):
                if (fcs[i][feature_index][class_of_interest] > 0):
                    pos_fcs.append(fcs[i][feature_index][class_of_interest])
                    #this is necessary because of weird behavior when X[i][feature_index] is nan
                    #and for some reason it says that nan is not values
                    if (utils.isnan(X[i][feature_index])):
                        pos_values.append(len(values) - 1)
                    else:
                        pos_values.append(values.index(X[i][feature_index]))
                elif (fcs[i][feature_index][class_of_interest] == 0):
                    zero_fcs.append(0)
                    if (utils.isnan(X[i][feature_index])):
                        zero_values.append(len(values) - 1)
                    else:
                        zero_values.append(values.index(X[i][feature_index]))
                else:
                    neg_fcs.append(fcs[i][feature_index][class_of_interest])
                    if (utils.isnan(X[i][feature_index])):
                        neg_values.append(len(values) - 1)
                    else:
                        neg_values.append(values.index(X[i][feature_index]))
                if (X[i][feature_index] not in contributions.keys()):
                    contributions[X[i][feature_index]] = [
                        fcs[i][feature_index][class_of_interest]
                    ]
                else:
                    contributions[X[i][feature_index]].append(
                        fcs[i][feature_index][class_of_interest])
        print('Contributions:')
        for value in contributions.keys():
            print('Value %r' % value)
            print(
                '\nMean: %r Variance: %r' %
                (np.mean(contributions[value]), np.var(contributions[value])))
        c = (contributions.items())
        boxplot([a[1] for a in c], [a[0] for a in c], title=None)
        ax = plt.subplot(111)
        plt.plot(pos_fcs, pos_values, 'x', color='blue')
        plt.plot(neg_fcs, neg_values, 'x', color='red')
        plt.plot(zero_fcs, zero_values, 'x', color='black')
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        # pad the y axis with an empty tick at each end
        ax.set_yticks(np.array(range(len(values) + 2)) - 1)
        ax.set_yticklabels([str('')] + values + [str('')])
        plt.show()
    else:
        # Numeric branch: NaNs are drawn one "nan_index" step above the
        # maximum observed value.
        values = sorted([
            round(i, 4) for i in (set(X[:, feature_index]))
            if not utils.isnan(i)
        ])  # + [np.nan]
        nan_index = values[-1] - values[-2]
        pos_fcs = []
        neg_fcs = []
        pos_values = []
        neg_values = []
        zero_fcs = []
        zero_values = []
        contributions = {}  # feature value (or 'nan') -> list of contributions
        for i in range(X.shape[0]):
            if (feature_index in fcs[i].keys()):
                if (fcs[i][feature_index][class_of_interest] > 0):
                    pos_fcs.append(fcs[i][feature_index][class_of_interest])
                    #this is necessary because of weird behavior when X[i][feature_index] is nan
                    #and for some reason it says that nan is not values
                    if (utils.isnan(X[i][feature_index])):
                        pos_values.append(values[-1] + nan_index)
                    else:
                        pos_values.append(X[i][feature_index])
                elif (fcs[i][feature_index][class_of_interest] == 0):
                    zero_fcs.append(0)
                    if (utils.isnan(X[i][feature_index])):
                        zero_values.append(values[-1] + nan_index)
                    else:
                        zero_values.append(X[i][feature_index])
                else:
                    neg_fcs.append(fcs[i][feature_index][class_of_interest])
                    if (utils.isnan(X[i][feature_index])):
                        neg_values.append(values[-1] + nan_index)
                    else:
                        neg_values.append((X[i][feature_index]))
                if (utils.isnan(X[i][feature_index])):
                    if ('nan' in contributions.keys()):
                        contributions['nan'].append(
                            fcs[i][feature_index][class_of_interest])
                    else:
                        contributions['nan'] = [
                            fcs[i][feature_index][class_of_interest]
                        ]
                elif (X[i][feature_index] in contributions.keys()):
                    contributions[(X[i][feature_index])].append(
                        fcs[i][feature_index][class_of_interest])
                else:
                    contributions[(X[i][feature_index])] = [
                        fcs[i][feature_index][class_of_interest]
                    ]
        print('Contributions:')
        for value in contributions.keys():
            print('Value %r' % value)
            # NOTE(review): the label says "Variance" but np.std (standard
            # deviation) is printed here — confirm which was intended.
            print(
                'Mean: %r Variance: %r' %
                (np.mean(contributions[value]), np.std(contributions[value])))
        c = (contributions.items())
        boxplot([a[1] for a in c], [a[0] for a in c], title=None)
        fig, ax = plt.subplots()
        plt.plot(pos_fcs, pos_values, 'x', color='blue')
        plt.plot(neg_fcs, neg_values, 'x', color='red')
        plt.plot(zero_fcs, zero_values, 'x', color='black')
        fig.canvas.draw()
        labels = [''] + [item.get_text()
                         for item in ax.get_yticklabels()] + ['']
        # extend the tick range so the synthetic NaN slot is visible
        if (values[-1] + nan_index < ax.get_yticks()[-1]):
            plt.yticks(
                [values[0] - nan_index] +
                sorted(list(ax.get_yticks()) + [values[-1] + nan_index]))
        else:
            plt.yticks([values[0] - nan_index] + sorted(
                list(ax.get_yticks()) +
                [values[-1] + nan_index, values[-1] + 2 * nan_index]))
        labels[-2] = 'nan'
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        ax.set_yticklabels(labels)
        plt.show()
    if (title is not None):
        plt.savefig(title)
        plt.close()
dummy = False transform = False use_text = False print('Testing utils...') assert (utils.isint(10)) assert (utils.isint('50')) assert (utils.isint('-999')) assert (not utils.isint(1.0)) assert (not utils.isint('50.0')) assert (utils.isint(True)) assert (not utils.isint('aba')) assert (not utils.isint('a?a')) assert (not utils.isint('49.x')) assert (not utils.isfloat('0.x')) assert (utils.isfloat('0.0')) assert (utils.isfloat('12.984')) assert (utils.isfloat('-0.4')) assert (not utils.isfloat('9')) original_attributes = np.array( ['Outlook', 'Temp', 'Humidity', 'Windy?', 'Class']) data = pd.DataFrame( [['SUNNY', 75, 70, 'T', 'PLAY'], ['SUNNY', 80, 90, 'T', "DON'T PLAY"], ['SUNNY', 85, 85, 'F', "DON'T PLAY"], [ 'SUNNY', 72, 95, 'F', "DON'T PLAY" ], ['SUNNY', 69, 70, 'F', 'PLAY'], ['OVERCAST', 72, 90, 'T', 'PLAY'], ['OVERCAST', 83, 78, 'F', 'PLAY'], ['OVERCAST', 64, 65, 'T', 'PLAY'], ['OVERCAST', 81, 75, 'F', 'PLAY'], ['RAIN', 71, 80, 'T', "DON'T PLAY"], ['RAIN', 65, 70, 'T', "DON'T PLAY"], ['RAIN', 75, 80, 'F', 'PLAY'],
def write_database(self):
    """Persist the wizard's collected progress to the database.

    Creates SignificanceEntry rows from the statistics cells, get-or-creates
    the plant/harvest Date rows for the entered year, creates or updates one
    TrialEntry per (location, planting-method, variety, harvest-date) tuple,
    links significance entries to their trials, and records a summary of what
    was submitted back into ``self.progress``.
    """
    # create SignificanceEntry
    significance_entries = []
    name_sigentry = {}  # header string -> list of SignificanceEntry built for it
    statistics = self.progress['statistics']
    # TODO: BUG: if a location has no entries, but has stat entries,
    # TODO: it will not show up in the preview but will be written here
    # TODO: (kinda harmless though, since it won't have any trials mapped to it)
    for name in statistics:
        if not statistics[name]:
            continue
        fieldname = statistics[name]['fieldname']
        alpha = statistics[name]['alpha']
        if not alpha:
            # empty string / falsy alpha is stored as NULL
            alpha = None
        comparing = statistics[name]['comparing']
        try:
            cells = self.progress['cells'][name]
        except KeyError:
            cells = []
        for header, value in cells:
            # skip blank and non-numeric cell values
            if not value or not isfloat(value):
                continue
            sigentry = models.SignificanceEntry(
                comparing=comparing,
                method=fieldname,
                alpha=alpha,
                value=value,
            )
            significance_entries.append(sigentry)
            try:
                name_sigentry[header].append(sigentry)
            except KeyError:
                name_sigentry[header] = [sigentry]
    for sigentry in significance_entries:
        sigentry.save()

    # create Date
    # Plant date is fixed to May 1, harvest date to Aug 1 of the entered year.
    year = int(self.progress['date'])
    date_pd = datetime.date(year=year, month=5, day=1)
    date_hd = datetime.date(year=year, month=8, day=1)
    # NOTE(review): bare except swallows every error, not just
    # Date.DoesNotExist — consider catching the specific exception.
    try:
        plant_date = models.Date.objects.get(date=date_pd)
    except:
        plant_date = models.Date(date=date_pd)
        plant_date.save()
    try:
        harvest_date = models.Date.objects.get(date=date_hd)
    except:
        harvest_date = models.Date(date=date_hd)
        harvest_date.save()

    # create/update TrialEntry objects
    entries = []
    lpvh_entries = {}  # (location pk, planting-method pk, variety pk, harvest-date pk) -> TrialEntry
    location_names = []
    variety_names = []
    planting_methods = []
    for trial in models.TrialEntry.objects.filter(
            harvest_date=harvest_date):
        lpvh = (trial.location.pk, trial.planting_method_tags.pk,
                trial.variety.pk, trial.harvest_date.pk)
        lpvh_entries[lpvh] = trial
    for entry in self.progress['trial_entries']:
        location = entry['location']['pk']
        # NOTE(review): this reassignment shadows the ``planting_methods``
        # list initialized above with a scalar pk, so the ``.append`` further
        # down (and the ``submitted`` summary at the end) operate on the
        # wrong object — looks like a latent bug; confirm and rename one.
        planting_methods = entry['location']['planting_methods']
        variety = entry['variety']['pk']
        lpvh = (location, planting_methods, variety, harvest_date.pk)
        try:
            trial = lpvh_entries[lpvh]
        except KeyError:
            trial = models.TrialEntry(
                location_id=location,
                variety_id=variety,
                planting_method_tags_id=planting_methods,
                harvest_date=harvest_date,
                plant_date=plant_date,
            )
            lpvh_entries[lpvh] = trial
        measure = entry['measure']['fieldname']
        value = entry['measure']['value']
        if value is None:
            continue
        # measures are stored as direct attributes on TrialEntry
        setattr(trial, measure, value)
        entries.append(trial)
        trial.save()
        # grab info for a later requery of these trials
        location_names.append(trial.location.name)
        variety_names.append(trial.variety.name)
        planting_methods.append(trial.planting_method_tags.planting_method)

    # attach TrialEntry objects to SignificanceEntry objects
    for name, sigentries in name_sigentry.items():
        # name is a user-entered string, lookup in our mappings
        # TODO: assume name is a location for now
        location = self.progress['locations'][name]['pk']
        planting_methods = self.progress['locations'][name][
            'planting_methods']
        # TODO: avoid the following database call
        trials = list(
            models.TrialEntry.objects.filter(location_id=location).filter(
                planting_method_tags_id=planting_methods).filter(
                    harvest_date=harvest_date))
        for sigentry in sigentries:
            sigentry.trials.add(*trials)
            sigentry.save()

    # save TrialEntry query for later editing
    self.progress['submitted']['locations'] = list(set(location_names))
    self.progress['submitted']['varieties'] = list(set(variety_names))
    self.progress['submitted']['planting_methods'] = list(
        set(planting_methods))
    self.progress['submitted']['harvest_date'] = {
        'year': harvest_date.date.year,
        'month': harvest_date.date.month,
        'day': harvest_date.date.day,
    }
    self.save()
    self.model.submitted = True
    self.model.save()
def on_zoomEntry_change(self, sender): if isfloat(sender.get_text()): if float(sender.get_text()) == 0: return self.preferences['Zoom'] = float(sender.get_text())
def _fmt_pair(k,v): if isfloat(v): v = sfrounds(v,nsigfigs) else: v = repr(v) return '{0}={1}'.format(k,v)
def prepare_table(self):
    """Assemble the preview tables from the parsed upload in ``self.progress``.

    Builds a variety x location main table (each cell a list over measures
    of trial-entry dicts), plus a summary table of statistic rows, stores
    the flat entry list back into ``self.progress['trial_entries']``,
    saves, and returns
    ``(year, table, summary, prettystatnames, locations, varieties)``.
    """
    ## collect entered data
    trial_entries = {}    # (location_name, variety_name, measure_name) -> entry dict
    summary_entries = {}  # header name -> {stat name -> {'value': ...}}
    locations = []
    varieties = []
    measures = []
    # locations, varieties, measures
    lnames = self.progress['headers'][HeaderTypes.Locations]
    vnames = self.progress['headers'][HeaderTypes.Varieties]
    mnames = self.progress['headers'][HeaderTypes.Measures]
    # NOTE(review): the first four branches deliberately do nothing -- only
    # when at most one of the three header lists is empty is the missing one
    # padded with [None]; confirm the empty cases are meant to fall through.
    if not lnames and not vnames and not mnames:
        pass
    elif not lnames and not vnames:
        pass
    elif not vnames and not mnames:
        pass
    elif not lnames and not mnames:
        pass
    else:
        if not lnames:
            lnames = [None]
        if not vnames:
            vnames = [None]
        if not mnames:
            mnames = [None]
    # statistics
    snames = self.progress['headers'][HeaderTypes.Statistics]
    if not snames:
        snames = []
    # define get_cell
    row_type = self.progress['headers']['rows']
    col_type = self.progress['headers']['columns']
    # map the (lname, vname, mname) triple onto whichever of its components
    # serves as the row header / column header for this upload's layout
    # (Python 2 tuple-parameter lambdas)
    if row_type == HeaderTypes.Locations:
        get_row = lambda (ln, vn, mn): ln
    elif row_type == HeaderTypes.Varieties:
        get_row = lambda (ln, vn, mn): vn
    elif row_type == HeaderTypes.Measures:
        get_row = lambda (ln, vn, mn): mn
    else:  # HeaderTypes.Data
        get_row = lambda (ln, vn, mn): None
    if col_type == HeaderTypes.Locations:
        get_col = lambda (ln, vn, mn): ln
    elif col_type == HeaderTypes.Varieties:
        get_col = lambda (ln, vn, mn): vn
    elif col_type == HeaderTypes.Measures:
        get_col = lambda (ln, vn, mn): mn
    else:  # HeaderTypes.Data
        get_col = lambda (ln, vn, mn): None

    def get_cell(ln, vn, mn):
        # Look up the (col, value) cell for one triple; returns (None, None)
        # when nothing matches.
        row = get_row((ln, vn, mn))
        col = get_col((ln, vn, mn))
        cell = (None, None)
        # when the row axis is Data (row is None), swap so the lookup always
        # keys self.progress['cells'] by the header value
        if row is None:
            tmp = col
            col = row
            row = tmp
        try:
            rows = self.progress['cells'][row]
        except KeyError:
            rows = []
        for col_val in rows:
            if col_val[0] == col:
                cell = col_val
                break
        return cell

    # collect entries
    for lname in lnames:
        location = self.progress['locations'][lname]
        location_name = '{}-{}'.format(location['name'],
                                       location['planting_methods_text'])
        locations.append(location_name)
        summary_entries[lname] = {}
        for vname in vnames:
            variety = self.progress['varieties'][vname]
            variety_name = variety['name']
            varieties.append(variety_name)
            for mname in mnames:
                # copy so the stored measure dict can carry a per-cell value
                measure = self.progress['measures'][mname].copy()
                measure_name = measure['fieldname']
                measures.append(measure_name)
                val = get_cell(lname, vname, mname)[1]
                # non-numeric cells become None
                if not isfloat(val):
                    val = None
                measure['value'] = val
                entry = {
                    'location': location,
                    'variety': variety,
                    'measure': measure,
                }
                trial_entries[(location_name, variety_name, measure_name)] = entry
                for sname in snames:
                    cells = self.progress['cells'][sname]
                    cell = (None, None)
                    # find which of the three current names this statistic
                    # cell is keyed against
                    # NOTE(review): ``header`` stays unbound if no column
                    # matches -- this relies on a match (or a previous
                    # iteration's binding); verify against real uploads.
                    for col_val in cells:
                        if col_val[0] == lname:
                            cell = col_val
                            header = lname
                            break
                        elif col_val[0] == vname:
                            cell = col_val
                            header = vname
                            break
                        elif col_val[0] == mname:
                            cell = col_val
                            header = mname
                            break
                    sentry = {
                        'value': cell[1],
                    }
                    summary_entries[header][sname] = sentry
    # collect summary_entries
    # NOTE(review): dead code -- iterates an empty list and references an
    # undefined name ``sval``; the body never executes. Left as-is.
    for name in []:
        stats = {}
        stat = {}
        stat['value'] = sval
        stats[name] = stat
        summary_entries[location_name] = stats
    ## construct main table
    # de-duplicate and order the axes
    locations = sorted(list(set(locations)))
    varieties = sorted(list(set(varieties)))
    measures = sorted(list(set(measures)))
    table = []
    for v in varieties:
        row = []
        for l in locations:
            cell = []
            for m in measures:
                try:
                    item = trial_entries[(l, v, m)]
                except KeyError:
                    item = None
                cell.append(item)
            row.append(cell)
        table.append(row)
    ## construct summary table (statistic entries)
    summary = []
    prettystatnames = []
    statistics = self.progress['statistics']
    havestats = True
    # a single None key with a falsy value marks "no statistics entered"
    if None in statistics and not statistics[None]:
        havestats = False
    if havestats:
        statmap = {}
        statnames = []
        # make a mapping from pretty names to user-input names
        for name in statistics:
            pretty = '{} ({})'.format(statistics[name]['fieldname'],
                                      statistics[name]['alpha'])
            statmap[pretty] = name
        # sort pretty names
        prettystatnames = sorted(statmap.keys())
        # ensure we iterate using the ordering we just made over pretty names
        for name in prettystatnames:
            statnames.append(statmap[name])
        for name in statnames:
            row = []
            for l in lnames:
                # only show stats that compare across locations
                try:
                    val = summary_entries[l][name]['value']
                except KeyError:
                    val = None
                row.append({
                    'value': val,
                    'comparing': statistics[name]['comparing']
                })
            summary.append(row)
    ## bundle it up and send it along
    year = self.progress['date']
    self.progress['trial_entries'] = trial_entries.values()
    #self.progress['table'] = table
    #self.progress['summary'] = summary
    self.save()
    return year, table, summary, prettystatnames, locations, varieties