# Module-level imports used by this method:
import glob
import numpy as np
import util  # project-local helper providing get_image()/get_label()

def create_train_data(self):
    i = 0
    imgdatas = []
    imglabels = []
    # Get the full paths of all images under data_path
    # (img_type includes the dot, e.g. ".png").
    imgs = glob.glob(self.data_path + "/*" + self.img_type)
    for imgname in imgs:
        # Strip the directory and the extension to get the bare image name;
        # len(self.img_type) handles extensions of any length.
        midname = imgname[imgname.rindex("/") + 1:-len(self.img_type)]
        # img = img_to_array(load_img(self.data_path + "/" + midname))
        img, img_h, img_w = util.get_image(self.data_path + "/" + midname + self.img_type)
        # label = img_to_array(load_img(self.label_path + "/" + midname))
        label, y_h, y_w = util.get_label(self.label_path + "/" + midname + self.annot_img_type)
        imgdatas.append(img)
        imglabels.append(label)
        if i % 100 == 0:
            print('Done: {0}/{1} images'.format(i, len(imgs)))
        i += 1
    imgdatas = np.array(imgdatas, dtype=np.uint8)
    imglabels = np.array(imglabels, dtype=np.uint8)
    print("loading done")
    np.save(self.npy_path + '/imgs_train.npy', imgdatas)
    np.save(self.npy_path + '/imgs_mask_train.npy', imglabels)
    print('Saving to npy files done.')
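# Usage sketch for create_train_data(). The enclosing class is not shown, so
# the class name `dataProcess` and its constructor arguments below are
# assumptions; only the saved .npy file names come from the method itself.
mydata = dataProcess(data_path="./train/image",
                     label_path="./train/label",
                     npy_path="./npydata",
                     img_type=".png",
                     annot_img_type=".png")
mydata.create_train_data()

# Reload the saved arrays for training:
imgs_train = np.load("./npydata/imgs_train.npy")
masks_train = np.load("./npydata/imgs_mask_train.npy")
print(imgs_train.shape, masks_train.shape)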
# Requires PyQt5 (`from PyQt5 import uic`) and a project-local get_label() helper.
def __init__(self, parent=None):
    super().__init__(parent)
    # Load the class-index-to-name mapping from the pickled label file.
    label_path = "./labels/imagenet_labels.pkl"
    self.labels = get_label(label_path)
    # Build the widget from the Qt Designer file, attaching it to this instance.
    self.ui = uic.loadUi("./ui/Class_View.ui", self)
    self.cls = -1  # no class selected yet
    self.initUI()
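# Usage sketch: launching the widget above as a standalone window. It assumes
# the __init__ belongs to a QWidget subclass named ClassView (the class name is
# an assumption; only the .ui and .pkl paths appear in the snippet).
import sys
from PyQt5.QtWidgets import QApplication

if __name__ == "__main__":
    app = QApplication(sys.argv)
    view = ClassView()
    view.show()
    sys.exit(app.exec_())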
# Requires json, pandas as pd, and the project-local util module.
def load_data(self, fname):
    """Load facts and labels from a local file with one JSON object per line."""
    facts = []
    accu_label = []
    article_label = []
    imprison_label = []
    with open(fname, 'r') as f:
        for line in f:
            # json.loads takes the already-decoded str; no encoding argument
            # is needed (and it is rejected on Python 3.9+).
            line_dict = json.loads(line)
            facts.append(line_dict["fact"])
            accu_label.append(util.get_label(line_dict, "accu"))
            article_label.append(util.get_label(line_dict, "law"))
            imprison_label.append(util.get_label(line_dict, "time"))
    if util.DEBUG:
        print("training file loaded.")
    facts = self.cut_all(facts)  # word segmentation
    if util.DEBUG:
        print("training data segmented.")
    accu_label = pd.Series(accu_label)
    article_label = pd.Series(article_label)
    imprison_label = pd.Series(imprison_label)
    if util.DUMP:
        self.dump_processed_data_to_file(facts, accu_label,
                                         article_label, imprison_label)
    return facts, accu_label, article_label, imprison_label
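# Usage sketch for load_data(). The enclosing class name is an assumption; the
# file format (one JSON object per line with a "fact" field plus whatever
# util.get_label() reads for "accu"/"law"/"time") follows from the code above.
loader = Preprocessor()  # hypothetical class defining load_data()/cut_all()
facts, accu_label, article_label, imprison_label = loader.load_data("data/train.json")
print(len(facts), "samples;", accu_label.nunique(), "distinct accusations")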
# scikit-learn imports used below:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

set1_df = get_df('data/set1.json')  # assumed counterpart of the set2 load below
set2_df = get_df('data/set2.json')
X_set1 = list()
Y_set1 = list()
X_set2 = list()
Y_set2 = list()

# Prune, lemmatize, and prepare text: each document is its source id followed
# by the preprocessed text and its denotations.
for i in range(len(set1_df)):
    denote = ' '.join(set1_df.loc[i]['denotations'])
    string = ' '.join(preprocess_text(set1_df.loc[i]['text']))
    X_set1.append(set1_df.loc[i]['sourceid'] + ' ' + string + ' ' + denote)

# Encode labels for cross-verification.
le = LabelEncoder()
Y_set1 = le.fit_transform(get_label()[1])
label_set1 = set1_df['sourceid']

# TF-IDF features.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(X_set1)

# Cluster into 6 groups: the data analysis shows an elbow at k = 6, which also
# matches the number of known labels.
kmeans = KMeans(n_clusters=6, init='k-means++', max_iter=1000).fit(tfidf_matrix)
clusters = kmeans.labels_  # cluster assignment per document

print("Generated diseases\n\n")
for group in set(clusters):
    # The original loop body is truncated; this minimal body, which reports the
    # source ids assigned to each cluster, is an assumption.
    members = label_set1[clusters == group]
    print("Cluster {}: {} documents".format(group, len(members)))
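# Sketch of the cross-verification hinted at above: comparing the k-means
# assignments against the encoded ground-truth labels Y_set1. The choice of
# metrics (adjusted Rand index and homogeneity, both from scikit-learn) is an
# assumption; the snippet itself does not name one.
from sklearn.metrics import adjusted_rand_score, homogeneity_score

print("Adjusted Rand index:", adjusted_rand_score(Y_set1, clusters))
print("Homogeneity:        ", homogeneity_score(Y_set1, clusters))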