from sklearn.naive_bayes import BernoulliNB


class Predicter(BernoulliNB):
    # accepts the training data set as a 2-column DataFrame
    def __init__(self):
        super().__init__()

    # give a dictionary to the labeler so it can be initialized
    def init_labeler(self, dictionary):
        self.labeler = Labeler(dictionary)

    # now that the labeler can vectorize our sentences
    # we are ready to train our model
    # df[0] -> sentences
    # df[1] -> scores
    def train(self, sentence_list, labels):
        feature_vector = self.labeler.label_sentence_list(sentence_list)
        super().fit(feature_vector, labels)

    def test(self, sentences, real_values):
        # sentences -> pandas Series of sentences
        test_vector = self.labeler.label_sentence_list(sentences)
        test_results = super().predict(test_vector)

        # analyze the results
        real_pred = zip(real_values, test_results)
        correct_lbl = sum(1 if real == pred else 0 for real, pred in real_pred)
        return correct_lbl * 1.0 / len(real_values)

    def predict_sentence(self, sentence):
        vector = self.labeler.label_sentence_list([sentence])
        return super().predict(vector)[0]
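# --- Hypothetical usage sketch (not part of the original snippet) ---
# Assumes this project's Labeler can be constructed from a word->index
# dictionary and that label_sentence_list() returns numeric feature vectors;
# the sentences, labels, and dictionary below are illustrative only.
train_sentences = ["great movie", "awful plot", "loved the acting", "terrible pacing"]
train_labels = [1, 0, 1, 0]

predicter = Predicter()
predicter.init_labeler({"great": 0, "awful": 1, "loved": 2, "terrible": 3,
                        "movie": 4, "plot": 5, "acting": 6, "pacing": 7})
predicter.train(train_sentences, train_labels)

print(predicter.test(train_sentences, train_labels))  # fraction of correct labels
print(predicter.predict_sentence("great acting"))     # predicted score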
def __init__(self, mode="rb"): if mode == "wb": self.__labeler = Labeler(mode) self.__model = None global graph if os.path.isfile(MODEL_LOCATION): self.__model = load_model(MODEL_LOCATION) graph = tf.get_default_graph() else: print("Could not init TLClassifier!")
def load_and_label_training():
    for directory in os.listdir(train_dir):
        current_dir = os.path.join(train_dir, directory)
        if os.path.isdir(current_dir):
            for filename in os.listdir(current_dir):
                f = os.path.join(current_dir, filename)
                if os.path.isfile(f):
                    for label in pa.answers:
                        if label in directory:
                            training_images.append(
                                Labeler(filename, load_image(f), label))
class Snapshot:
    def __init__(self, cp, snaptype):
        self.cp = cp.getSection('snapshot.' + snaptype)
        self.labeler = Labeler(cp)
        self.cmprs = []
        for col, cmpn in json.loads(self.cp("comparators")):
            cmpsn = "comparator.%s" % cmpn
            self.cmprs.append(
                Comparator(col - 1, cp.getSection(cmpsn), self.labeler))
        self.pubDate = time.strftime(self.cp('dateFormat'))
        self.typ = self.cp('type')
        self.stype = self.cp('stype')
        self.sformat = self.cp('sformat', True)

    def convertZeros(self, r):
        for i in range(len(r)):
            if r[i] == '0':
                r[i] = ''
        return r

    # Compares two files, f1 and f2, and reports any differences.
    # Written as an iterator that yields a series of dicts.
    # Each one is a difference record.
    def diffs(self, f1, f2):
        # for each pair of records
        for idVal, r1, r2 in mergeIter(f1, f2, all=True):
            r1 = r1 and self.convertZeros(r1) or None
            r2 = r2 and self.convertZeros(r2) or None
            # try each comparison
            for cmpr in self.cmprs:
                try:
                    # report any diffs
                    for d in cmpr.diffs(r1, r2):
                        d['id'] = idVal
                        d['type'] = self.typ
                        if d.get('subject', None) is None:
                            d['subject'] = self.labeler.get(self.stype, idVal, self.sformat)
                        if d['subject'] is None:
                            logging.warn("No label found for: " + idVal)
                            d['subject'] = '???'
                        d['label'] = xmlEscape(d['subject']) + ' [' + idVal + ']'
                        d['updateMessage'] = xmlEscape(d['updateMessage'] % d)
                        d['pubDate'] = self.pubDate
                        yield d
                except:
                    print "ERROR!"
                    print "comparator=", str(cmpr)
                    print "r1=", r1
                    print "r2=", r2
                    raise
import logging
import subprocess
from statistics import mean, variance

import pandas as pd
from tqdm import tqdm
from PIL import Image

from shared import directory_contents
from labeler import Labeler

LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)

# PROJECT_ROOT is assumed to be defined/imported elsewhere in the original module
INPUTS = f"{PROJECT_ROOT}/outputs"
OUTPUTS = f"{PROJECT_ROOT}/outputs/patches"
LABELS = pd.read_csv(f"{INPUTS}/labels.csv")
LABELER = Labeler(PROJECT_ROOT)
KEEP_TOP_N = 10


def delete_patch(fname):
    subprocess.Popen(["rm", fname])


def delete_patches_using_labels():
    for subdir in tqdm(directory_contents(INPUTS)):
        for fname in tqdm(directory_contents(subdir)):
            label = LABELER.labels(fname, top_n_labels=1)[0]
            if label in LABELS.label.unique():
                if (LABELS.loc[LABELS.label == label, "action"] == "delete").any():
                    delete_patch(fname)
if __name__ == "__main__": posts_path = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\anon.contributions.csv" path_corpus = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\corpus.pkl" path_corpus_embeddings = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\corpus_embeddings.pkl" label_path = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\Labeler.pkl" data_loader = DataLoader() data_loader.load(posts_path) qs, followup_qs = data_loader.questions_in_folder("", index=True) as2, followup_as2 = data_loader.questions_in_folder("assignment2", index=True) bert_s_s = BertSemanticSearch().from_files(path_corpus, path_corpus_embeddings) # label dataset labeler = Labeler(label_path) for i in range(len(as2)): idx, text = as2[i] choices_idx = bert_s_s.single_semantic_search(text, 10) labeler.label( text=text, text_idx=idx, choices=[qs[int(choice_idx)][1] for choice_idx in choices_idx], choices_idx=[qs[int(choice_idx)][0] for choice_idx in choices_idx] ) print(labeler.labels) labeler.save()
class TLClassifier(object):
    def __init__(self, mode="rb"):
        if mode == "wb":
            self.__labeler = Labeler(mode)
        self.__model = None
        global graph
        if os.path.isfile(MODEL_LOCATION):
            self.__model = load_model(MODEL_LOCATION)
            graph = tf.get_default_graph()
        else:
            print("Could not init TLClassifier!")

    @property
    def model(self):
        assert self.__model is not None
        return self.__model

    def save_image(self, image, label):
        cv2.imwrite("test.png", image)
        return self.__labeler.label_image(image, label)

    def get_classification(self, image):
        """Determines the color of the traffic light in the image

        Args:
            image (cv::Mat): image containing the traffic light

        Returns:
            int: ID of traffic light color (specified in styx_msgs/TrafficLight)
                uint8 UNKNOWN=4
                uint8 GREEN=2
                uint8 YELLOW=1
                uint8 RED=0
        """
        # TODO implement light color prediction
        if self.__model is None:
            return TrafficLight.UNKNOWN

        image_array = np.array(resize_image(image))
        global graph
        with graph.as_default():
            traffic_light = int(
                self.model.predict(image_array[None, :, :, :], batch_size=1))
            prob = self.model.predict_proba(image_array[None, :, :, :], batch_size=1)
            if prob < 0.5:
                print("Using hough due to low probability: " + str(prob))
                return self.__hough_stop_light_detector(image_array)

        if traffic_light == 0:
            return TrafficLight.RED
        elif traffic_light == 1:
            return TrafficLight.YELLOW
        elif traffic_light == 2:
            return TrafficLight.GREEN
        return TrafficLight.UNKNOWN

    def __hough_stop_light_detector(self, img):
        gray = np.array(img)[:, :, 2]
        cv2.medianBlur(gray, 7)
        circles = cv2.HoughCircles(
            gray,
            cv2.HOUGH_GRADIENT,
            dp=1.0,
            minDist=5,
            param1=100,
            param2=15,
            minRadius=3,
            maxRadius=10,
        )
        if circles is not None:
            circles = np.uint16(np.around(circles))
            center_dots = []
            for i in circles[0, :]:
                # draw the outer circle
                cv2.circle(img, (i[0], i[1]), i[2], (0, 255, 0), 2)
                # draw the center of the circle
                cv2.circle(img, (i[0], i[1]), 2, (0, 0, 255), 3)
                center_dots.append(img[i[1], i[0]])
            median = np.median(center_dots, axis=0)
            is_red = median[0] < 10 and median[1] < 10 and median[2] > 200
            is_green = median[0] < 10 and median[1] > 200 and median[2] < 10
            if is_red:
                return TrafficLight.RED
            elif is_green:
                return TrafficLight.GREEN
            # TODO: orange case
        return TrafficLight.UNKNOWN
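# --- Hypothetical usage sketch (not from the original file) ---
# Assumes a trained Keras model exists at MODEL_LOCATION and that the
# surrounding ROS workspace provides styx_msgs/TrafficLight; the image
# path below is illustrative only.
import cv2

classifier = TLClassifier()
frame = cv2.imread("camera_frame.png")
state = classifier.get_classification(frame)  # TrafficLight.RED / YELLOW / GREEN / UNKNOWN
print("Detected traffic light state: {}".format(state))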
cols = 2
fig, axes = plt.subplots(rows, cols, figsize=figsize)
plt.subplots_adjust(bottom=0.07, top=0.95, left=0.12, right=0.98,
                    hspace=0.2, wspace=0.5)

# Turn axes off on upper left corner plots
for i in range(2):
    for j in range(2):
        plt.setp(axes[i, j], frame_on=False, xticks=[], yticks=[])

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.07, ypad=0.0, fontsize=10)

# Label upper left corner
#ax = axes[0,0]
ax = plt.subplot(451)
labeler.label_subplot(ax, 'A')
plt.setp(ax, frame_on=False, xticks=[], yticks=[])

ax = plt.subplot(453)
labeler.label_subplot(ax, 'B')
plt.setp(ax, frame_on=False, xticks=[], yticks=[])

ax = plt.subplot(455)
labeler.label_subplot(ax, 'C')
plt.setp(ax, frame_on=False, xticks=[], yticks=[])
def generator(samples, batch_sz=32):
    num_samples = len(samples)
    while 1:  # Loop forever so the generator never terminates
        shuffle(samples)
        for offset in range(0, num_samples, batch_sz):
            batch_samples = samples[offset:offset + batch_sz]
            tmp_features, tmp_labels = load_data(batch_samples)
            yield shuffle(tmp_features, tmp_labels)


# ======== MAIN ========
feature_shape = [150, 200, 3]

labeler = Labeler("rb")
data = labeler.load()
data["features"] = data["features"].reshape([-1] + feature_shape)
print("Labels: " + str(data["labels"].size))
print("Features: " + str(data["features"].size))
print("Shape: " + str(data["features"].shape))

model = None
if os.path.isfile("model.h5"):
    model = load_model("model.h5")
    print("+++ TRANSFER LEARNING +++")
elif feature_shape is not None:
    model = Sequential()
    model.add(Lambda(lambda x: x / 255.0 - 0.5, input_shape=feature_shape))
    # tmp_model.add(Cropping2D(cropping=((70, 25), (0, 0))))
#fig, axes = plt.subplots(rows,cols,figsize=figsize)
bottom = 0.15
top = 0.95
width = 0.7
height = top - bottom
pad = 0.1

fig, axes = plt.subplots(rows, cols, figsize=figsize)
plt.subplots_adjust(bottom=0.1, top=0.95, left=0.1, right=0.9,
                    wspace=0.6, hspace=0.4)
labeler = Labeler(xpad=.08, ypad=.01, fontsize=10)

ax = axes[0, 0]
labeler.label_subplot(ax, 'A')
lims = [-6, -2]
plot_fraction_compare(rep1, rep3, ax)
ax.plot(lims, lims, '--', c='k', zorder=10)
ax.set_ylabel('fraction of population\nreplicate 1', labelpad=2)
ax.set_xlabel('fraction of population\nreplicate 3', labelpad=2)
ticks = range(lims[0], lims[1] + 1)
tick_labels = [r'$10^{' + str(t) + '}$' for t in ticks]
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(tick_labels)
ax.set_yticklabels(tick_labels)
plt.subplots_adjust(
    #top=.55,
    #bottom=.05,
    top=0.98,
    bottom=0.38,
    left=.12,
    right=.95,
    hspace=0,
    wspace=.5)

# Tite-Seq vs Flow
# Panel C

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.07, ypad=-.01, fontsize=10)

# Position panel
#bottom=0.62
#top=0.98
bottom = 0.05
top = 0.30
left = 0.30
right = 0.75
height = top - bottom
width = right - left
ax = fig.add_axes([left, bottom, width, height])
labeler.label_subplot(ax, 'C', xpad_adjust=.05, ypad_adjust=0)
log_bounds = [-10, -4.5]
lims = log_bounds
posts_path = r"C:\Users\karlc\Documents\ut\_y4\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\anon.contributions.csv" dupe_check_path = r"C:\Users\karlc\Documents\ut\_y4\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\dupe_check.pkl" label_path = r"C:\Users\karlc\Documents\ut\_y4\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\dupe_check_labels.pkl" data_loader = DataLoader() data_loader.load(posts_path) # map question indices to their text qs, followup_qs = data_loader.questions_in_folder("", index=True) qs = {q[0]: q[1] for q in qs} # load piazza's pred dupe_check = load_pickle(dupe_check_path) # label dataset labeler = Labeler(label_path) # # randomly select 100 # indices = random.sample([i for i in range(len(dupe_check))], 100) # dupe_check = [dupe_check[i] for i in indices] for curr in dupe_check: idx = curr[0] text = qs[idx] labeler.label( text=text, text_idx=idx, choices=[qs[qidx] for qidx in curr[1:]], choices_idx=curr[1:] )
plt.close('all')

# Create figure with subplots and specified spacing
figsize = (6, 7)
rows = 10
cols = 4
fig, axes = plt.subplots(rows, cols, figsize=figsize)
plt.subplots_adjust(top=.98, bottom=.05, left=.05, right=.95,
                    hspace=0, wspace=.5)

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.03, ypad=-.01, fontsize=10)

# fluorescein grid
summary = get_clone_data()
clones = summary.keys()
inds = np.argsort(
    [np.nanmean(np.log10(np.array(summary[k]['KD']))) for k in clones])

# Panel B
labeler.label_subplot(axes[0, 0], 'A')
fl = np.array([
    0, 10**-9.5, 10**-9, 10**-8.5, 10**-8, 10**-7.5, 10**-7, 10**-6.5,
    10**-6, 10**-5.5, 10**-5
])
from downloader import Downloader
from labeler import Labeler

if __name__ == "__main__":
    downloader = Downloader("apple", "data", img_count=5)
    downloader.download()

    labeler = Labeler("./data", ["apple", "not apple"], dataset_dir="def_not_data")
    labeler.label()
import logging

from flask import Flask
from flask_ask import Ask, question, session, context, version

from labeler import Labeler

app = Flask(__name__)
ask = Ask(app, "/")
labeler = Labeler()
# log = logging.getLogger("flask_ask").setLevel(logging.DEBUG)
logging.basicConfig(level=logging.INFO)


@ask.launch
def new_game():
    logging.info("Session New?: {}".format(session.new))
    logging.info("User ID: {}".format(session.user.userId))
    logging.info("Alexa Version: {}".format(version))
    logging.info("Device ID: {}".format(context.System.device.deviceId))
    # logging.info("Device: {}".format(context.System.device.keys()))
    logging.info("System: {}".format(context.System))
    print("User: {}".format(context.System.user))
    return labeler.get_intro_statement()


@ask.intent("GlobalIntent", convert={"item": str})
n_trials = len(crp_ratios_nbr_mat)
nst_pval = stats.binom_test(n_success, n_trials)
print 'Nonparametric sign test for nbr > mat: P = %f' % nst_pval

#
# Make figure
#
width = 160
height = 175
bottom = 5
fig = plt.figure(figsize=(mm2inch(width), mm2inch(height + bottom)))
sns.set(font_scale=0.8)

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.07, ypad=0.02, fontsize=10)

left = width_mm2fig(15, fig)
stat_left = left
middle = width_mm2fig(70, fig)
right = width_mm2fig(125, fig)
level1 = height_mm2fig(140 + bottom, fig)
level2 = height_mm2fig(105 + bottom, fig)
level3 = height_mm2fig(60 + bottom, fig)
level4 = height_mm2fig(10 + bottom, fig)
hm_width = width_mm2fig(160, fig)
hm_height = height_mm2fig(20, fig)
stat_width = width_mm2fig(30, fig)
stat_height = height_mm2fig(30, fig)
# Create figure with subplots and specified spacing
figsize = (3.5, 5.6)
rows = 14
cols = 1
col = 1
fig, axes = plt.subplots(figsize=figsize)
gs = gridspec.GridSpec(28, 2)
plt.subplots_adjust(bottom=0.06, top=0.95, left=0.17, right=0.96,
                    wspace=0.6, hspace=0.0)

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.13, ypad=.01, fontsize=10)

# For CDR1H and CDR3H
conc_labels = ['$0$'] + \
    ['$10^{%1.1f}$' % x for x in np.arange(-9.5, -4.5, 0.5)]
file_labels = ['0M'] + \
    ['10^%1.1fM' % x for x in np.arange(-9.5, -4.5, 0.5)]

#csv_name = 'out.csv'
filenames = get_filenames(directory)
names = [re.search(r'Sort (\d+)', ii) for ii in filenames]
condition = [n.group(1) for n in names]

# Make plots
for [filename, well] in zip(filenames, condition):
labelsize = 8
panelsize = 12
param_lims = [-1, 1]
param_ticks = [-1, -.5, 0, .5, 1]

# Set colormaps
cmap = sns.cubehelix_palette(8, start=0.0, rot=0.0, reverse=True, as_cmap=True)
vmax = 100
vmin = 75
sns.set_style('white')

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.07, ypad=0.02, fontsize=10)

## RNAP heatmap
# Plot results for real RNAP data
ax = fig.add_axes([left, level1, hm_width, hm_height])
labeler.label_subplot(ax, 'A', xpad_adjust=0.03, ypad_adjust=0.04)
sns.heatmap(
    df_rnap_comparison.transpose(),
    annot=True,
    fmt="d",
    vmin=vmin,
    vmax=vmax,
    annot_kws={"size": 7},
    cmap=cmap,
    cbar_kws={"pad": .03})
gelx(ax, df_rnap_xannotation, annotation_spacing=0.8, fontsize=labelsize)
gely(ax, df_rnap_yannotation, annotation_spacing=0.8, fontsize=labelsize, rotation=0)

# Draw white lines
(num_cols, num_rows) = df_rnap_comparison.shape
for y in range(num_rows):
        shutil.copy(f, dst)


if __name__ == "__main__":
    wandb_logger = WandbLogger(project="nnsplit")

    parser = Network.get_parser()
    parser.set_defaults(logger=wandb_logger)
    hparams = parser.parse_args()

    if hparams.logger:
        store_code(wandb_logger.experiment)

    labeler = Labeler([
        SpacySentenceTokenizer("de_core_news_sm",
                               lower_start_prob=0.7,
                               remove_end_punct_prob=0.7),
        SpacyWordTokenizer("de_core_news_sm"),
    ])

    model = Network(
        MemoryMapDataset("../train_data/texts.txt", "../train_data/slices.pkl"),
        labeler,
        hparams,
    )
    n_params = np.sum([np.prod(x.shape) for x in model.parameters()])
    trainer = Trainer.from_argparse_args(hparams)

    print(f"Training model with {n_params} parameters.")
    trainer.fit(model)
class ClusterWorker():
    def __init__(self, args, dataset='', mode=''):
        self.args = args
        self.dataset = dataset
        self.mode = mode
        self.load_data()

    def build_cluster_adj(self, clean=False):
        """
        build an adjacency matrix which only records what kind of fake labels
        each node links to
        """
        adj = np.zeros((self.n_nodes, self.n_clusters), dtype=np.float64)

        for dst, src in self.edges.tolist():
            adj[src, self.fake_labels[dst]] += 1
            adj[dst, self.fake_labels[src]] += 1

        if self.mode == 'clusteradj' and not clean:
            adj += get_noise(self.args.noise_type, self.n_nodes, self.n_clusters,
                             self.args.noise_seed,
                             eps=self.args.epsilon, delta=self.args.delta)
            adj = np.clip(adj, a_min=0, a_max=None)

        adj = normalize(adj)
        return torch.FloatTensor(adj)

        # NOTE: unreachable alternative (sparse construction), kept from the original
        adj = sp.coo_matrix(adj)
        adj = normalize(adj)
        return sparse_mx_to_torch_sparse_tensor(adj)

    def build_cluster_prj(self):
        """
        :return: a projection matrix; each column has one non-zero element,
                 which is the inverse of the size of the cluster the node belongs to.
        """
        unique, count = np.unique(self.fake_labels, return_counts=True)
        prj = np.zeros((self.n_clusters, self.n_nodes))

        for i, label in enumerate(self.fake_labels):
            prj[label, i] = 1 / count[label]

        return torch.FloatTensor(prj)

    def break_down(self):
        """
        generate broken-down fake labels
        """
        indice = [[] for i in range(self.n_classes)]
        for i, label in enumerate(self.fake_labels):
            indice[label].append(i)

        unique, count = torch.unique(self.fake_labels, return_counts=True)
        # print('unique', unique)
        # print('count', count)

        min_size = int(torch.min(count).item() * self.args.break_ratio + 0.5)
        if min_size == 0:
            min_size = 1
        # print('min_size', min_size)

        split = [self.labeler.get_equal_size(val, min_size) for val in count]
        # print('split', [elem[0] for elem in split], sum([elem[0] for elem in split]))

        t0 = time.time()
        start = 0
        # hierarchical clustering
        for i in range(self.n_classes):
            idx = indice[i]  # all the indices in fake_labels whose label is i
            if not idx:
                continue
            n_clusters, quota = split[(unique == i).nonzero().item()]
            self.fake_labels[idx] = self.labeler.get_cluster_labels(
                self.features[idx], n_clusters, quota=quota,
                start=start, same_size=True)
            start += n_clusters

        self.n_clusters = start  # the number of classes after clustering
        print('generating broken down fake labels done using {} secs!'.format(
            time.time() - t0))
        # torch.save(self.fake_labels, 'flabels_{}.pt'.format(self.n_clusters))

    def build_adj_vanilla(self):
        adj = np.zeros((self.n_nodes, self.n_nodes), dtype=np.float64)
        for dst, src in self.edges:
            adj[src][dst] = adj[dst][src] = 1

        t0 = time.time()
        adj += get_noise(self.args.noise_type, self.n_nodes, self.n_nodes,
                         self.args.noise_seed,
                         eps=self.args.epsilon, delta=self.args.delta)
        adj = np.clip(adj, a_min=0, a_max=None)
        print('adding noise done using {} secs!'.format(time.time() - t0))

        return adj

    def build_adj_original(self):
        adj = sp.coo_matrix((np.ones(self.edges.shape[0]),
                             (self.edges[:, 0], self.edges[:, 1])),
                            shape=(self.n_nodes, self.n_nodes),
                            dtype=np.float32)
        # build symmetric adjacency matrix
        adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
        return adj

    def build_adj_mat(self, mode='vanilla-clean'):
        if mode == 'vanilla-clean':
            adj = self.build_adj_original()
        elif mode == 'vanilla':
            adj = self.build_adj_vanilla()
        else:
            raise NotImplementedError('mode = {} not implemented!'.format(mode))

        adj = normalize(adj + sp.eye(adj.shape[0]))
        adj = sparse_mx_to_torch_sparse_tensor(adj) \
            if mode == 'vanilla-clean' else torch.FloatTensor(adj)
        return adj

    def load_data(self):
        self.features, self.labels, self.idx_train, self.idx_val, self.idx_test \
            = feature_reader(dataset=self.dataset, scale=self.args.scale,
                             train_ratio=self.args.train_ratio,
                             feature_size=self.args.feature_size)
        # print('feature_size', self.features.shape)

        self.n_nodes = len(self.labels)
        self.n_features = self.features.shape[1]
        self.n_classes = self.labels.max().item() + 1
        self.edges = graph_reader(dataset=self.dataset)
        self.labeler = Labeler(self.features, self.labels, self.n_classes,
                               self.idx_train, self.idx_val, self.idx_test)

        if self.mode in ('clusteradj', 'clusteradj-clean'):
            self.generate_fake_labels()

            if self.args.break_down:
                self.break_down()

            self.adj = self.build_cluster_adj()
            self.prj = self.build_cluster_prj()
        else:
            self.adj = self.build_adj_mat(mode=self.mode)

        # self.calculate_connectivity()

        if torch.cuda.is_available():
            self.features = self.features.cuda()
            self.adj = self.adj.cuda()
            self.labels = self.labels.cuda()
            if hasattr(self, 'prj'):
                self.prj = self.prj.cuda()

    def generate_fake_labels(self):
        cluster_method = self.args.cluster_method
        t0 = time.time()

        if cluster_method == 'random':
            self.n_clusters = self.args.n_clusters
            self.fake_labels = self.labeler.get_random_labels(
                self.n_clusters, self.args.cluster_seed)

        elif cluster_method == 'hierarchical':
            init_method = self.args.init_method
            self.n_clusters = self.n_classes

            if init_method == 'naive':
                self.fake_labels = self.labeler.get_naive_labels(self.args.assign_seed)
            elif init_method == 'voting':
                self.fake_labels = self.labeler.get_majority_labels(
                    self.edges, self.args.assign_seed)
            elif init_method == 'knn':
                self.fake_labels = self.labeler.get_knn_labels(self.args.knn)
            elif init_method == 'gt':
                self.fake_labels = self.labels.clone()
            else:
                raise NotImplementedError(
                    'init_method={} in cluster_method=label not implemented!'.format(init_method))

        elif cluster_method in ('kmeans', 'sskmeans'):
            self.n_clusters = self.args.n_clusters
            self.fake_labels = self.labeler.get_kmeans_labels(
                self.n_clusters, self.args.knn, cluster_method,
                same_size=self.args.same_size)

        else:
            raise NotImplementedError(
                'cluster_method={} not implemented!'.format(cluster_method))

        print('generating fake labels done using {} secs!'.format(time.time() - t0))
        # torch.save(self.fake_labels, 'flabels_{}.pt'.format(self.n_clusters))

    def calculate_connectivity(self):
        n_edges = len(self.edges)
        kappa = n_edges / (0.5 * self.n_nodes * (self.n_nodes - 1))

        labels = self.fake_labels
        edge_adj = np.zeros((self.n_clusters, self.n_clusters))
        for edge in self.edges:
            u, v = labels[edge[0]], labels[edge[1]]
            edge_adj[u][v] += 1
            edge_adj[v][u] += 1

        unique, count = np.unique(labels, return_counts=True)

        kappa_intra = 0
        for i in range(self.n_clusters):
            kappa_intra += edge_adj[i][i] / (0.5 * count[i] * (count[i] - 1))
        kappa_intra /= self.n_clusters

        kappa_inter = 0
        for i in range(self.n_clusters):
            for j in range(i + 1, self.n_clusters):
                kappa_inter += edge_adj[i][j] / (count[i] * count[j])
        kappa_inter /= (0.5 * self.n_clusters * (self.n_clusters - 1))

        print('k_inter = {:4f}, k = {:4f}, k_intra = {:4f}'.format(
            kappa_inter, kappa, kappa_intra))
        logging.info('k_inter = {:4f}, k = {:4f}, k_intra = {:4f}'.format(
            kappa_inter, kappa, kappa_intra))

    def calculate_degree(self):
        degrees = np.zeros(self.n_nodes)
        for edge in self.edges:
            u, v = edge
            degrees[u] += 1
            degrees[v] += 1
        return degrees

    def update_adj(self):
        if self.mode == 'clusteradj':
            self.adj = self.build_cluster_adj(clean=True)
        elif self.mode == 'vanilla':
            self.adj = self.build_adj_mat(mode='vanilla-clean')

        if torch.cuda.is_available():
            self.adj = self.adj.cuda()
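# --- Hypothetical instantiation sketch (not from the original repo) ---
# The attribute names on `args` mirror those the class reads above
# (scale, train_ratio, feature_size, cluster_method, noise_type, ...);
# the concrete values and the dataset name are illustrative only.
from argparse import Namespace

args = Namespace(
    scale=True, train_ratio=0.5, feature_size=-1,
    break_down=False, break_ratio=0.5,
    cluster_method='kmeans', n_clusters=16, knn=5, same_size=False,
    init_method='knn', cluster_seed=0, assign_seed=0,
    noise_type='laplace', noise_seed=42, epsilon=1.0, delta=1e-5,
)
worker = ClusterWorker(args, dataset='cora', mode='clusteradj')
print(worker.n_nodes, worker.n_clusters)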
black = [0., 0., 0.]

# Create figure with subplots and specified spacing
figsize = (3.42, 4.5)
rows = 2
cols = 2
fig, axes = plt.subplots(rows, cols, figsize=figsize)
plt.subplots_adjust(bottom=0.07, top=0.95, left=0.07, right=0.88,
                    hspace=0.4, wspace=0.6)

# Make a labeler to add labels to subplots
labeler = Labeler(xpad=.035, ypad=.015, fontsize=10)

wtseq1 = 'TFSDYWMNWV'
seq1pos = np.arange(28, 38)
optseq1_dict = {30: 'G', 31: 'H'}

wtseq2 = 'GSYYGMDYWG'
seq2pos = np.arange(100, 110)
optseq2_dict = {101: 'A', 102: 'S', 106: 'E', 108: 'L'}

# Get affinity zero
A_heatmaps = []
A_wts = []
for rep in all_reps:
    temp_hm, wt_temp = c_matrix(rep, aff_fun)
    A_heatmaps.append(temp_hm)
    A_wts.append(wt_temp)
    )
    parser.add_argument(
        "--model_path",
        help="Directory to store the model at.",
    )
    hparams = parser.parse_args()

    if hparams.logger:
        store_code(wandb_logger.experiment)

    labeler = Labeler([
        SpacySentenceTokenizer(hparams.spacy_model,
                               lower_start_prob=0.7,
                               remove_end_punct_prob=0.7,
                               punctuation=".?!"),
        SpacyWordTokenizer(hparams.spacy_model),
        WhitespaceTokenizer(),
        # SECOSCompoundTokenizer("../../../Experiments/SECOS/"),  # used for German
    ])

    model = Network(
        MemoryMapDataset(hparams.text_path, hparams.slice_path),
        labeler,
        hparams,
    )
    n_params = np.sum([np.prod(x.shape) for x in model.parameters()])
    trainer = Trainer.from_argparse_args(hparams)

    print(f"Training model with {n_params} parameters.")
    trainer.fit(model)