def incidentMetaData(df, dfPerKm, company, lang):
    """Summarize one company's incident records into a metadata dict.

    Args:
        df: Incident DataFrame. Columns read here: ``Company``,
            ``Incident Types``, ``Approximate Volume Released``,
            ``Substance``, ``What Happened`` and ``Why It Happened``.
        dfPerKm: Not used by this function; kept so callers need not change.
        company: Value of the ``Company`` column to filter on.
        lang: ``'en'`` for English data; any other value is treated as
            French.

    Returns:
        dict with ``companyName``, ``seriousEvents``, ``release`` and
        ``nonRelease``, plus whatever ``most_common`` stores under
        ``mostCommonWhat``, ``mostCommonWhy`` and ``mostCommonSubstance``.
    """
    # French incident-type labels mapped onto the English keys that are counted.
    french_to_english = {
        'Effets environnementaux négatifs': 'Adverse Environmental Effects',
        'Blessure grave (Régie ou BST)': 'Serious Injury (CER or TSB)',
        'Décès': 'Fatality',
    }

    def most_common_substance(df, meta, lang):
        """Record the most common substance, ignoring the placeholder rows."""
        if lang == 'en':
            df_substance = df[df['Substance'] != "Not Applicable"].copy()
        else:
            # NOTE(review): the usual French wording is "Sans objet"; confirm
            # this literal matches the source data before changing it.
            df_substance = df[df['Substance'] != "Sans object"].copy()
        meta = most_common(df_substance, meta, "Substance", "mostCommonSubstance")
        return meta

    def other_types(df, lang):
        """Count the serious incident types across all rows of ``df``."""
        serious = {"Adverse Environmental Effects": 0,
                   "Fatality": 0,
                   "Serious Injury (CER or TSB)": 0}
        translate = {} if lang == 'en' else french_to_english
        for type_list in df['Incident Types']:
            for t in type_list.split(","):
                t = t.strip()
                # Translate each individual type. A whole-cell replacement
                # would miss cells that list several comma-separated types.
                t = translate.get(t, t)
                if t in serious:
                    serious[t] += 1
        return serious

    def thisCompanyPct(df, df_c):
        """Return the company's share of all incidents as a percentage."""
        pct = {}
        countPct = (len(df_c.index)/len(df.index))*100
        if countPct >= 1:
            countPct = round(countPct, 0)
        else:
            countPct = round(countPct, 1)
        pct['count'] = countPct
        return pct

    # filter to specific company
    df_c = df[df['Company'] == company].copy()
    meta = {}
    # meta['relativePct'] = thisCompanyPct(df, df_c)
    meta['companyName'] = company
    meta['seriousEvents'] = other_types(df_c, lang)
    # Rows with a volume are counted as releases, rows without as non-releases.
    meta['release'] = int(df_c['Approximate Volume Released'].notnull().sum())
    meta['nonRelease'] = int(df_c['Approximate Volume Released'].isna().sum())
    meta = most_common(df_c, meta, "What Happened", "mostCommonWhat")
    meta = most_common(df_c, meta, "Why It Happened", "mostCommonWhy")
    meta = most_common_substance(df_c, meta, lang)
    return meta
def most_common_substance(df, meta, lang):
    """Store the most common substance in ``meta`` and return ``meta``.

    Rows whose ``Substance`` equals the language-specific placeholder are
    dropped before the data is handed to ``most_common``.
    """
    placeholder = "Not Applicable" if lang == 'en' else "Sans object"
    has_substance = df['Substance'] != placeholder
    return most_common(df[has_substance].copy(), meta, "Substance", "mostCommonSubstance")
def id3(data):
    """Recursively build a decision tree that predicts the ``'analito'`` label.

    Args:
        data: List of dicts sharing the same keys, one of which is
            ``'analito'``. The records are not modified; each recursion
            level works on copies with the chosen attribute removed.

    Returns:
        A ``Leaf`` when every record has the same label or no attribute is
        left to split on, otherwise a ``Node`` with one child per distinct
        value of the selected attribute.
    """
    if all(x['analito'] == 'glicose' for x in data):
        return Leaf('glicose')
    if all(x['analito'] == 'triglicerideo' for x in data):
        return Leaf('triglicerideo')

    attributes = [k for k in data[0] if k != 'analito']
    if not attributes:
        return Leaf(most_common([d['analito'] for d in data]))

    # Entropy of each attribute
    splited = {k: entropy([x[k] for x in data]) for k in attributes}

    # Splitting attribute: the one with the lowest entropy
    separator = min(splited.items(), key=lambda x: x[1])[0]
    node = Node(separator)

    # Continuous values are rounded so that nearby values share a branch.
    # The rounding is done on a separate list, never on the caller's records.
    if is_continuous(data[0][separator]):
        values = [round(x[separator], PRECISION) for x in data]
    else:
        values = [x[separator] for x in data]

    for v in set(values):
        # Copy each matching record without the separator instead of popping
        # the key from the dicts the caller still owns.
        aux = [
            {k: val for k, val in x.items() if k != separator}
            for x, x_value in zip(data, values)
            if x_value == v
        ]
        node.add_son({v: id3(aux)})
    return node
def predict_track(clf, track):
    """Predict a single track and store the results on the track dict.

    Args:
        clf: Classifier passed through to ``predict_tracks``.
        track: Dict describing the track; it receives the keys
            ``sample_predictions``, ``prediction`` and ``predictions``.

    Returns:
        The ``(prediction, predictions)`` pair returned by
        ``util.most_common`` for the per-sample predictions.
    """
    predicted = predict_tracks(clf, [track])
    track['sample_predictions'] = predicted
    prediction, predictions = util.most_common(predicted)
    track['prediction'] = prediction
    # Store the second element of the pair here, not the winner again.
    track['predictions'] = predictions
    return prediction, predictions
def cluster_n_label(cluster_count, seed_count, data, target):
    """Label every sample with the majority seed label of its k-means cluster.

    Args:
        cluster_count: Number of k-means clusters to fit.
        seed_count: Number of seed samples, forwarded to
            ``util.seed_numbers`` together with ``len(data)``.
        data: Samples to cluster; indexable by the seed numbers.
        target: Known labels, indexed like ``data``. Only the seed
            positions are read.

    Returns:
        Tuple ``(data, ssl_label)`` where ``ssl_label`` holds one propagated
        label per sample, in the order of ``data``.
    """
    kmeans = sklearn.cluster.KMeans(n_clusters=cluster_count)
    kmeans.fit(data)

    # One batched call gives the cluster index of every sample.
    assignments = kmeans.predict(data)

    seed_numbers = util.seed_numbers(len(data), seed_count)
    clusters_labels = [[] for _ in range(cluster_count)]
    seed_labels = []
    for seed_number in seed_numbers:
        seed_label = target[seed_number]
        # Index with a scalar cluster id; a 1-element array is not reliably
        # accepted as a list index.
        clusters_labels[assignments[seed_number]].append(seed_label)
        seed_labels.append(seed_label)

    # using a consensus from inside the same cluster; a cluster that received
    # no seed falls back to the most common label among all seeds
    major_labels = [
        util.most_common(labels)[0] if labels else util.most_common(seed_labels)[0]
        for labels in clusters_labels
    ]

    ssl_label = [major_labels[cluster] for cluster in assignments]
    return data, ssl_label
def get_complain_most_company():
    """Return a JSON response naming the most common complaint company."""
    companies = [complaint.companhia for complaint in Reclamacao.objects]
    return jsonify({'empresa': most_common(companies)})
def get_complain_most_locale():
    """Return a JSON response naming the most common complaint location."""
    places = [complaint.local for complaint in Reclamacao.objects]
    return jsonify({'local': most_common(places)})
def associate_measurements(self, z, C, H, threshold=None):
    """Associate each measurement by majority vote over the ensemble.

    The bare name ``associate_measurements`` called in the loop is looked up
    outside this function (presumably a module-level helper; confirm). It is
    run once per ensemble member, and for every measurement the most common
    result across members is returned.
    """
    R_inflation = [2.5, 2.5]  # not read anywhere in this method
    measurement_count = len(z)
    asso_mat = np.zeros((self.N, measurement_count))
    for member in range(self.N):
        # Only the first member passes debug=False explicitly; the others
        # rely on the helper's default.
        extra = {'debug': False} if member == 0 else {}
        asso_mat[member] = associate_measurements(
            z, self.ensemble[member], self.R, C, H,
            threshold=threshold, **extra)
    return [most_common(list(asso_mat[:, col])) for col in range(measurement_count)]
def multiPredict(self, X, Y, k):
    """Store a majority-vote prediction per sample in ``self.prediction``.

    For each of the ``len(Y)`` positions, entries ``0..k-1`` of the first
    column of ``self.dfResult`` are polled and ``util.most_common`` of those
    votes is kept. ``X`` is not read.
    """
    # The slice takes rows 0..k, although only labels 0..k-1 are read below.
    top_rows = self.dfResult.iloc[0:k + 1, 0]
    votes = [
        util.most_common([top_rows[j][i] for j in range(k)])
        for i in range(len(Y))
    ]
    self.prediction = np.asarray(votes)
def incidentMetaData(df, dfPerKm, company):
    """Build the incident metadata dict for a single company.

    ``df`` is filtered to rows whose ``Company`` equals ``company``. The
    result holds the company name, counts of serious incident types,
    release / non-release counts, and the values ``most_common`` stores for
    the "what", "why" and substance fields. ``dfPerKm`` is not read.
    """

    def most_common_substance(df, meta):
        # Rows without a real substance are excluded from the ranking.
        real_substance = df['Substance'] != "Not Applicable"
        return most_common(df[real_substance].copy(), meta, "Substance", "mostCommonSubstance")

    def other_types(df):
        # Tally how often each serious type appears in the comma-separated
        # "Incident Types" cells.
        serious = dict.fromkeys(
            ["Adverse Environmental Effects",
             "Fatality",
             "Serious Injury (CER or TSB)"],
            0)
        for cell in df['Incident Types']:
            for label in (part.strip() for part in cell.split(",")):
                if label in serious:
                    serious[label] += 1
        return serious

    def thisCompanyPct(df, df_c):
        # Share of all rows belonging to this company: whole percent when at
        # least 1, one decimal place otherwise. Not called in this function.
        share = (len(df_c.index)/len(df.index))*100
        digits = 0 if share >= 1 else 1
        return {'count': round(share, digits)}

    # filter to specific company
    df_c = df[df['Company'] == company].copy()
    volume = df_c['Approximate Volume Released']
    meta = {
        'companyName': company,
        'seriousEvents': other_types(df_c),
        'release': int(volume.notnull().sum()),
        'nonRelease': int(df_c['Approximate Volume Released'].isna().sum()),
    }
    meta = most_common(df_c, meta, "what common", "mostCommonWhat")
    meta = most_common(df_c, meta, "why common", "mostCommonWhy")
    # Values joined with " & " are split into a list of individual entries.
    for key in ("mostCommonWhat", "mostCommonWhy"):
        meta[key] = [x.strip() for x in meta[key].split(" & ")]
    meta = most_common_substance(df_c, meta)
    return meta
def grade_de_reconhecimento(self, lista_de_teste):
    """Build the recognition grid from a list of test entries.

    Each entry ``i`` is recognized with ``self.reconhece2(i[0])``. The values
    ``i[1]`` are grouped by the first element of the recognition result, and
    every group is reduced to its most common value. The resulting dict is
    stored in ``self.rec_grid``.
    """
    desenhar = {}
    # Recognize each input and group its value under the recognized key
    for i in lista_de_teste:
        z = self.reconhece2(i[0])
        # setdefault replaces the former bare ``except:``, which could mask
        # unrelated errors and overwrite an existing group.
        desenhar.setdefault(z[0], []).append(i[1])
    # Keep only the most frequent value of each group
    for key in desenhar:
        desenhar[key] = most_common(desenhar[key])
    self.rec_grid = desenhar
def strategy1(self, poll_from=0):
    """Return the most common answer among the polled answer methods.

    Args:
        poll_from: Index into ``TriviaQuestion.CHOSEN_METHODS`` from which
            methods are polled.

    Returns:
        The result of ``util.most_common`` over the answers that differ from
        ``UNCERTAIN_ANSWER``, or ``0`` if anything in the polling fails.
    """
    # find most likely answer
    try:
        answers = [
            method(self)
            for method in TriviaQuestion.CHOSEN_METHODS[poll_from:]
        ]
        print(answers)
        answers = [
            answer for answer in answers if answer != UNCERTAIN_ANSWER
        ]
        return util.most_common(answers)
    except Exception:
        # Deliberate best-effort fallback (e.g. every answer was uncertain):
        # guess 0. ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return 0
def kfold(tracks, feature_names, folds=5, shuffle=True, **kwargs):
    """Yield the annotated test tracks of each stratified fold.

    For every fold a classifier built from ``kwargs`` is trained on the
    training tracks. Each test track then receives ``sample_predictions``,
    ``prediction`` and ``predictions`` before the fold's test tracks are
    yielded.
    """
    labels = [track['label'] for track in tracks]
    splitter = cross_validation.StratifiedKFold(labels, n_folds=folds, shuffle=shuffle)
    for train_idx, test_idx in splitter:
        train_tracks = [tracks[i] for i in train_idx]
        test_tracks = [tracks[i] for i in test_idx]

        clf = machine_learning.train_tracks(
            machine_learning.Classifier(**kwargs), train_tracks, feature_names)

        # Fold-wide accumulators; they are filled but not read here.
        predicted_all, Y_test_all = [], []
        for track in test_tracks:
            X_test, Y_test = machine_learning.shape_features([track], feature_names)
            predicted = machine_learning.predict(X_test, clf)
            track['sample_predictions'] = predicted
            winner, tally = util.most_common(predicted)
            track['prediction'] = winner
            track['predictions'] = tally
            predicted_all.extend(predicted)
            Y_test_all.extend(Y_test)
        yield test_tracks
def kfold(tracks, feature_names, folds=5, shuffle=True, **kwargs):
    """Run stratified k-fold prediction, yielding each fold's test tracks.

    Every test track is annotated in place with ``sample_predictions`` and
    with the ``prediction`` / ``predictions`` pair returned by
    ``util.most_common``.
    """
    track_labels = [track['label'] for track in tracks]
    fold_iter = cross_validation.StratifiedKFold(
        track_labels, n_folds=folds, shuffle=shuffle)
    for train_ids, test_ids in fold_iter:
        train_tracks = [tracks[idx] for idx in train_ids]
        test_tracks = [tracks[idx] for idx in test_ids]

        model = machine_learning.Classifier(**kwargs)
        model = machine_learning.train_tracks(model, train_tracks, feature_names)

        # Collected across the fold; not consumed inside this function.
        predicted_all = []
        Y_test_all = []
        for track in test_tracks:
            X_test, Y_test = machine_learning.shape_features(
                [track], feature_names)
            predicted = machine_learning.predict(X_test, model)
            track['sample_predictions'] = predicted
            best, counts = util.most_common(predicted)
            track['prediction'] = best
            track['predictions'] = counts
            predicted_all.extend(predicted)
            Y_test_all.extend(Y_test)
        yield test_tracks
def testMostCommonTie(self):
    # With top=1, the expected value for row_3 joins both entries with " & ".
    result = most_common(self.df, {}, "row_3", "testTie", top=1)
    self.assertEqual(result["testTie"], "4 & 8")
def testMostCommonText2(self):
    # With top=2, the expected result is a dict of value -> count.
    result = most_common(self.df, {}, "row_2", "testTop2", top=2)
    expected = {'e': 3, 'c': 2}
    self.assertEqual(result["testTop2"], expected)
def testMostCommonNumber1(self):
    # The top value of the numeric column is expected back as the string "0".
    result = most_common(self.df, {}, "row_1", "testTop1", top=1)
    self.assertEqual(result["testTop1"], "0")
def testMostCommonText1(self):
    # With top=1, the single most common text value of row_2 is expected.
    result = most_common(self.df, {}, "row_2", "testTop1", top=1)
    self.assertEqual(result["testTop1"], "e")
def most_common_substance(df, meta):
    """Store the most common substance in ``meta`` and return ``meta``.

    Rows whose ``Substance`` is "Not Applicable" are left out of the ranking.
    """
    applicable = df['Substance'] != "Not Applicable"
    return most_common(df[applicable].copy(), meta, "Substance", "mostCommonSubstance")
def knn(k, training_data, test_element):
    """Classify ``test_element`` by majority vote of its ``k`` nearest samples.

    Each training sample is scored with ``distance`` against
    ``test_element``. The ``'analito'`` labels of the ``k`` closest samples
    are passed to ``most_common``.
    """
    scored = [(sample, distance(sample, test_element)) for sample in training_data]
    scored.sort(key=lambda pair: pair[1])
    neighbour_labels = [sample['analito'] for sample, _ in scored[:k]]
    return most_common(neighbour_labels)
def metadata(df, company, test):
    """Assemble summary metadata for one company's records.

    Args:
        df: DataFrame with the columns ``Event Number``, ``Dig Count``,
            ``Nearest Populated Centre``, ``Commencement Date``,
            ``Province/Territory`` and ``Species At Risk Present``. Its
            ``Nearest Populated Centre`` column is overwritten in place with
            the cleaned text before the first comma.
        company: Stored unchanged under ``"company"``.
        test: When truthy the reference year is fixed at 2020; otherwise it
            is the year before the current one.

    Returns:
        dict with ``totalEvents``, ``totalDigs``, ``nearby`` (``None`` when
        no row qualifies, otherwise filled by ``most_common``), ``atRisk``
        and ``company``. ``nearbyYear`` is added only when nearby rows exist.
    """

    def filter_near(city):
        # Labels of two characters or fewer are discarded.
        return len(city) > 2

    centre = 'Nearest Populated Centre'
    this_company_meta = {
        "totalEvents": len(set(df['Event Number'])),
        "totalDigs": int(df['Dig Count'].sum()),
    }

    # nearby in the last year
    df[centre] = [str(x).split(",")[0].strip() for x in df[centre]]

    filter_list = [
        'various',
        'various locations as per the attached documents.',
        'as per the attached documents',
        'as per the attached document',
        'business',
    ]
    df_near = df.copy()
    is_placeholder = df_near[centre].str.lower().isin(filter_list)
    df_near = df_near[~is_placeholder]

    last_full_year = 2020 if test else datetime.today().year - 1
    df_near = df_near[df_near['Commencement Date'].dt.year == last_full_year]

    if df_near.empty:
        this_company_meta["nearby"] = None
    else:
        # deal with mnp: keep only the text after the last "of" in the long
        # "The project is located ..." descriptions, then cut every name at
        # the first "," and the first "("
        cleaned = []
        for city_string in df_near[centre]:
            name = city_string
            if "The project is located" in city_string:
                name = city_string.split("of")[-1].strip()
            for split_char in (",", "("):
                name = name.split(split_char)[0].strip()
            cleaned.append(name)
        df_near[centre] = cleaned

        combined = df_near[centre] + " " + df_near['Province/Territory'].str.upper()
        near_list = [x.replace("Jasper BC", "Jasper AB") for x in list(combined)]
        df_near = pd.DataFrame(filter(filter_near, near_list))
        most_common(df_near, this_company_meta, 0, "nearby", 3, "list", False, False)
        this_company_meta["nearbyYear"] = last_full_year

    this_company_meta["atRisk"] = sum(
        1 for x in df['Species At Risk Present'] if x == "y")
    # RLG didnt want the land statistic, so the "landRequired" / "iceRinks"
    # values derived from 'New Land Area Needed' are intentionally not set.
    this_company_meta["company"] = company
    return this_company_meta