def main():
    """Load city AQI data, print summary statistics, and plot the 5 best cities.

    Reads ``city_aqi.csv`` from the working directory, prints head/summary
    statistics and best/worst rankings, then saves a bar chart of the five
    cities with the lowest AQI to ``top5_aqi.png``.
    """
    df = pd.read_csv('city_aqi.csv')
    print(df[['City', 'AQI']].sort_values('AQI').head(4))
    # DataFrame.info() prints its report itself and returns None; the original
    # print(csv.info()) emitted a stray "None" line after the report.
    df.info()
    print(df.head())
    print('AQI最大值:', df['AQI'].max())
    print('AQI最小值:', df['AQI'].min())
    print('AQI均值:', df['AQI'].mean())
    # Lower AQI means better air quality.
    top5_cities = df.sort_values(by=['AQI']).head(5)
    print('空气质量最好的5个城市:')
    print(top5_cities)
    top5_bottom_cities = df.sort_values(by=['AQI'], ascending=False).head(5)
    print('空气质量最差的5个城市:')
    print(top5_bottom_cities)
    print(df[df['AQI'] > 40])
    top5_cities.plot(kind='bar', x='City', y='AQI',
                     title='空气质量最好的5个城市', figsize=(10, 10))
    plt.savefig('top5_aqi.png')
    plt.show()
def __init__(self, file, mono=True, cap_train=None, sort=False):
    """Load the puzzle CSV, optionally ordered by blank-cell count and capped.

    :param file: path to the CSV with a 'puzzle' column
    :param mono: stored flag, not interpreted here
    :param cap_train: keep at most this many rows (None = all)
    :param sort: if True, order puzzles by number of empty cells first
    """
    if not sort:
        # Fast path: let pandas stop reading after cap_train rows
        # (nrows=None reads the whole file).
        self.data = pd.read_csv(file, nrows=cap_train)
    else:
        frame = pd.read_csv(file)
        # A '0' character in the puzzle string marks an empty cell.
        frame['num_empty'] = frame['puzzle'].apply(
            lambda puzzle: puzzle.count('0'))
        frame = frame.sort_values(by="num_empty")
        if cap_train is not None:
            frame = frame.head(cap_train)
        self.data = frame
    self.mono = mono
    self.cap_train = cap_train
    self.edges = sudoku_edges()
def read_tasks(self, mapper: DataMapper):
    """Read the tab-separated task file and derive scheduling columns.

    Stores *mapper* on the instance, then enriches each task row with
    time-remaining, priority, sprint assignment and confidence columns.
    Returns the enriched DataFrame sorted by adjusted priority.
    """
    self.data_mapper = mapper
    tasks = pd.read_csv(self.filename, delimiter="\t")
    # axis=1 hands each mapper a whole row rather than a column.
    tasks["Time Remaining"] = tasks.apply(mapper.map_submission, axis=1)
    tasks["Adjusted Priority"] = tasks.apply(
        mapper.map_adjusted_priority, axis=1)
    # Order by priority before assigning sprints so the most urgent work
    # is placed first.
    tasks = tasks.sort_values(by=["Adjusted Priority"])
    tasks["Sprint"] = tasks["Estimate"].apply(mapper.map_sprints)
    # Reset sprint state before the confidence pass — presumably sprint
    # capacity was consumed while mapping sprints; verify against DataMapper.
    mapper.sprint_manager.reset()
    tasks["Confidence"] = tasks.apply(mapper.map_sprint_confidence, axis=1)
    # map_on_track() is called: it returns the per-value callable applied here.
    tasks["On track"] = tasks["Confidence"].apply(mapper.map_on_track())
    tasks["Submitted"] = tasks.apply(mapper.map_human_time_submitted, axis=1)
    tasks["Due"] = tasks.apply(mapper.map_human_time_due, axis=1)
    return tasks
# In[21]: def infra_density(num_o_houses, mask): area = (np.count_nonzero(mask)) * 10 den = num_o_houses / area return den # In[ ]: # In[11]: csv = pd.read_csv(csv_path) csv.head(10) x = csv.sort_values(by=['Village Name', 'Census 2011 ID']) x.head(10) # In[12]: status = x.drop([23528], axis=0) status.head(10) status['Electrified'][0] # In[74]: index = 0 village_name = list() ndvi = list() evi_log = list()
def sortSeg(id):
    """Sort segment *id* by descending 'times' and write the result.

    Reads ../result/segg/seg<id>.csv and writes the rows, ordered from the
    highest 'times' value down, to ../result/seg/<id>.csv (no index column).
    """
    print("sorting %d" % id)
    segment = pd.read_csv('../result/segg/seg%d.csv' % id)
    segment = segment.sort_values('times', ascending=False)
    segment.to_csv("../result/seg/%d.csv" % id, index=False)
"slug": current["slug"], "name": current["name"], "creator_pseudo": current["creator"]["pseudo"], "categories": "|".join([e["name"] for e in current["categories"]]), "youtube_url": extract_url(current["links"], "youtube"), "twitter_url": extract_url(current["links"], "twitter"), "tip_amount": int(current["parameters"]["tipperAmount"]), "tip_number": int(current["parameters"]["tipperNumber"]), }) with open(FILENAME, "a") as f: writer = csv.DictWriter(f, data[0].keys(), lineterminator="\n") if f.tell() == 0: writer.writeheader() writer.writerows(data) csv = pd.read_csv(FILENAME, parse_dates=["date"]) csv.drop_duplicates(subset=["date", "slug"], keep="last", inplace=True) csv.sort_values(by=["date", "slug"], inplace=True) csv.to_csv(FILENAME, index=False)
def clustering(file_name, times_thr, path_thr, n_clusters):
    """
    Cluster vehicle plates by the paths they travel.

    Builds a (plate x path) frequency matrix from the input CSV, runs
    KMeans on it, prints a silhouette report and the plates grouped by
    cluster, and shows two plots.

    NOTE(review): the original docstring claimed a dictionary was returned,
    but there is no return statement — the cluster->plates mapping is only
    printed and the function returns None.

    - **parameters**, **types**, **return** and **return types**::

        :param file_name: path to the CSV (columns: targa, path, volte)
        :param times_thr: keep only rows whose 'volte' count is >= this value
        :param path_thr: keep only paths with more than this many
                         '-'-separated segments
        :param n_clusters: number of clusters for the KMeans algorithm
        :type file_name: string
        :type times_thr: int
        :type path_thr: int
        :type n_clusters: int
        :return: None (results are printed and plotted)
        :rtype: None
    """
    # NOTE: Python 2 syntax (print statements) — unlike the other snippets
    # in this file.
    csv = pd.read_csv(file_name, sep=',', index_col=None)
    # Keep only (plate, path) rows observed at least times_thr times.
    csv = csv.loc[csv['volte'] >= times_thr]
    sorted_csv = csv.sort_values('targa')
    # Raw value matrix: column 0 = targa (plate), 1 = path, 2 = volte (count),
    # as established by the indexing below.
    csv_np = sorted_csv.values
    plates = sorted_csv['targa'].unique()
    lista = list()
    total_paths = np.array([])
    # lista[count] becomes [plate, path1, volte1, path2, volte2, ...]
    for count, plate in enumerate(plates):
        paths = csv_np[np.where(csv_np[:, 0] == plate), 1][0]
        volte = csv_np[np.where(csv_np[:, 0] == plate), 2][0]
        lista.insert(count, [plate])
        i = 0
        while i < len(paths):
            # Discard short paths (at most path_thr '-'-separated segments).
            if len(paths[i].split('-')) <= path_thr:
                i += 1
                continue
            lista[count].append(paths[i])
            total_paths = np.append(total_paths, paths[i])
            lista[count].append(volte[i])
            i += 1
    total_paths = np.unique(total_paths)
    plates = np.array([], dtype=int)
    # Drop plates for which no path survived the threshold.
    lista = [x for x in lista if len(x) > 1]
    for x in lista:
        plates = np.append(plates, x[0])
    data = np.zeros(shape=(len(lista), len(total_paths)), dtype=np.int8)
    print "selected plates: " + str(plates) + "\nselected paths: " + str(total_paths) + "\nmatrix shape: " + str(data.shape)
    # Fill the plate x path matrix; element layout is
    # [plate, path, count, path, count, ...], hence the step of 2.
    for count, element in enumerate(lista):
        i = 1
        while i < len(element):
            index = total_paths.tolist().index(element[i])
            data[count, index] = element[i + 1]
            i += 2
    print data
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    kmeans = clusterer.fit(data)
    # NOTE(review): fit_predict re-fits the already-fitted estimator; the
    # labels could be read from kmeans.labels_ instead of fitting twice.
    cluster_labels = clusterer.fit_predict(data)
    silhouette_avg = silhouette_score(data, cluster_labels)
    print "For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg
    # Computed but never used afterwards — kept for parity with the original.
    sample_silhouette_values = silhouette_samples(data, cluster_labels)
    # Group plate ids by their assigned cluster label (printed below).
    clusters_map = {}
    for cluster in kmeans.labels_:
        clusters_map[cluster] = []
    for i, plate in enumerate(lista):
        clusters_map[kmeans.labels_[i]].append(plate[0])
    print "\ncluster\tplates"
    for k, v in clusters_map.items():
        print k, "\t", v
    # Left axis: cluster sizes; right axis: first two feature columns of the
    # clustered matrix with centroids overlaid.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")
    # NOTE(review): cm.spectral was removed in matplotlib >= 2.2; modern
    # versions require cm.nipy_spectral.
    colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(data[:, 0], data[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')
    centers = clusterer.cluster_centers_
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1, s=200, edgecolor='k')
    # Label each centroid with its cluster index.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50, edgecolor='k')
    plt.suptitle(("KMeans clustering on sample data " "with n_clusters = %d\nThe average silhouette_score is %0.4f" % (n_clusters, silhouette_avg)), fontsize=14, fontweight='bold')
    ax1.bar(clusters_map.keys(), [len(clusters_map[x]) for x in clusters_map.keys()], color='r')
    ax1.set_ylabel("number of cars")
    ax1.set_xlabel("clusters")
    ax1.set_title("K-Means clustering")
    plt.show()