def melanoma_metastatic_oncogenes(source_files):
    cli, seq, graph, onco_genes = source_files[:4]
    print("----- reading raw datasets")
    cli = read_files(cli, index_col=0)
    seq = read_files(seq, index_col=0)
    graph = read_files(graph)
    onco_genes = read_files(onco_genes)
    print("----- preprocessing raw datasets")
    # get primary(0) vs metastatic(1)
    cli = cli.loc[:, ["sample_type.samples"]]
    cli = cli.loc[cli.iloc[:, 0].isin(["Primary Tumor", "Metastatic"]), :]
    cli = cli.replace({"Primary Tumor": 0, "Metastatic": 1})
    # seq
    common_genes = list(set(seq.index).intersection(set(onco_genes.iloc[:, 0])))
    seq = seq.loc[common_genes, :].T
    # remove duplicated, choose the largest var
    seq = remove_duplicated_columns_byVar(seq).loc[:, common_genes]
    # graph
    graph_gene_mask = graph.iloc[:, -2:].isin(common_genes).all(axis=1)
    graph = graph[graph_gene_mask]
    # gene names --> number
    graph = trans_graph_toNumber(common_genes, graph, [1, 2])
    # same samples
    cli, seq = samples_intersection(cli, seq)
    # to ndarray, return
    return cli.values, seq.values, graph.values
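# A minimal usage sketch of the function above. The file names are hypothetical
# placeholders, not part of the original project; the real paths depend on how
# the clinical, expression, graph, and oncogene tables are laid out on disk.
if __name__ == "__main__":
    source_files = ["clinical.tsv", "expression.tsv", "graph_edges.csv", "oncogenes.csv"]
    labels, expression, edges = melanoma_metastatic_oncogenes(source_files)
    # labels: 0/1 primary-vs-metastatic per sample, expression: samples x genes,
    # edges: numeric gene-pair array
    print(labels.shape, expression.shape, edges.shape)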
def run_test(model, train, test):
    train = utils.read_files(train)
    test = utils.read_files(test)
    distr = model.distribution(train)
    print("-Train:-")
    model.neg_log_prob(train, distr)
    model.predict(train, distr)
    print("\n")
    print("-Test:-")
    model.neg_log_prob(test, distr)
    model.predict(test, distr)
    print("\n")
def run_test(model, train, test):
    train_pitch, train_rhythm = utils.read_files(train, True)
    test_pitch, test_rhythm = utils.read_files(test, True)
    p_distr = model.distribution(train_pitch)
    r_distr = model.distribution(train_rhythm)
    print("-Train:-")
    model.neg_log_prob(train_pitch, train_rhythm, p_distr, r_distr)
    model.predict(train_pitch, train_rhythm, p_distr, r_distr)
    print("\n")
    print("-Test:-")
    model.neg_log_prob(test_pitch, test_rhythm, p_distr, r_distr)
    model.predict(test_pitch, test_rhythm, p_distr, r_distr)
    print("\n")
def test_model(obs_file=OBS_FILE, length_file=LENGTH_FILE):
    obs, length = utils.read_files(obs_file, length_file)
    obs = utils.preprocess_data(obs)
    # model = joblib.load(MODEL_NAME + str(0) + ".pkl")
    model = joblib.load("../interval_model/model_mbs.pkl")
    model._print_info()
    validate_model(model, obs[:, 0].reshape(-1, 1), length)
def discover(self, coverage, input_dir, mode='full'):
    tracer = pathfinder.Tracer(self, self.verbose)

    def run(inputs):
        cov = False
        if coverage == "full":
            cov = True
        traces = self.generate_traces(inputs)    # dynamorio traces
        etraces = self.encode_traces(traces)     # edge traces
        args = self.fix_args(self.args, inputs)  # replace placeholders
        self.msg("[+] Tracing with: " + str(args))
        concrete_traces = tracer.trace_concrete(args, traces, etraces, inputs)
        if len(concrete_traces) == 0:
            return
        length = tracer.trace_symbolic(args, concrete_traces, inputs)
        self.msg("[+] Done Tracing, discovered " + str(len(self.new)) + " new edges")
        self.__discover(length, cov)

    all_inputs = utils.read_files(input_dir)  # inputs
    # we do this to create a way to get results quicker, sacrificing a complete
    # coverage map for quick results by not tracing everything and then running
    # discover but tracing single files and discovering on them
    if mode == 'full':
        run(all_inputs)
    else:
        # print all_inputs
        k, v = all_inputs.items()[random.randint(0, len(all_inputs) - 1)]
        one_input = dict()
        one_input[k] = v
        run(one_input)
def main(): test, emission, transition, output = read_files() emission, transition = get_nested_dictionaries(emission, transition) initial = transition["START"] prediction = [] print(emission) trellis = {} # initialize the trellis for tag in initial: trellis[tag] = trellis_cell(tag, initial[tag] * emission[tag][test[0][0]], None) last_cell = None i = 1 while i < len(test[0]): # Initialize next layer next_layer = {} # Loop through layer for next_tag in trellis: max_prob = 0 prev_cell = None # Loop through the current node layer for tag in trellis: # Calculate the edge probability prob = trellis[tag].val*transition[tag][next_tag] * \ emission[next_tag][test[0][i]] # Keep track of max probability if prob > max_prob: max_prob = prob prev_cell = trellis[tag] # Update node layer with the maximum probability next_layer[next_tag] = trellis_cell(next_tag, max_prob, prev_cell) # Find the max node value in the layer max_val = 0 tag_choice = "" cell_choice = None for tag in next_layer: if next_layer[tag].val > max_val: max_val = next_layer[tag].val tag_choice = tag cell_choice = next_layer[tag] trellis = next_layer last_cell = cell_choice i += 1 # Backtrace to find the optimal route i -= 1 while last_cell != None: prediction.insert(0, (test[0][i], last_cell.tag)) last_cell = last_cell.ptr i -= 1 print('Your Output is:', prediction, '\n Expected Output is:', output)
def main(): test, emission, transition, output = read_files() emission, transition = get_nested_dictionaries(emission, transition) initial = transition["START"] prediction = [] """WRITE YOUR VITERBI IMPLEMENTATION HERE""" print('Your Output is:', prediction, '\n Expected Output is:', output)
def get_stack_traces_for_signature(fnames, signature, traces_num=100):
    traces = download_stack_traces_for_signature(signature, traces_num)
    for line in utils.read_files(fnames):
        data = json.loads(line)
        if data['signature'] == signature:
            traces.add(data['proto_signature'])
    return list(traces)
def build_one_tree_dataset_from_xml(path, classe, max_depth):
    onlyfiles = read_files(path)
    data = []
    for f in onlyfiles:
        G = Graph()
        G.build_Xml_tree(path + '/' + f, max_depth)
        data.append((G, classe))
    return data
def get_patients_data():
    OBS_FILE, LENGTH_FILE = utils.set_filename(LABEL, TYPE)
    obses, lengths = utils.read_files(OBS_FILE, LENGTH_FILE)
    assert len(obses) == len(lengths)
    patients = dict()
    Patient.Patient.vol = len(TYPE)
    print("vol: ", Patient.Patient.vol)
    Patient.Patient.trend_types = TREND_TYPE
    return pair_X_and_y_for_patients(obses, lengths, patients)
def main(parameters_file, bird_params, predict_dir, modelfile=None):
    p = yaml.safe_load(open(parameters_file, 'r'))
    p.update(yaml.safe_load(open(bird_params, 'r')))
    sampled_dsets = read_files(predict_dir, load_events=False)
    spa = create_spectra(p)
    if modelfile is None:
        modelfile = default_model_filename(parameters_file, bird_params)
    model = keras.models.load_model(modelfile)
    print(model.summary())
    for sampled_dset in sampled_dsets:
        predict(model, sampled_dset, spa, p)
def train_and_test_models(obs_file, length_file, n_split=N_SPLIT):
    # Read data
    obses, lengths = utils.read_files(obs_file, length_file)
    print(obs_file, length_file)
    assert len(obses) == len(lengths)
    for i in range(len(obses)):
        obs = obses[i]
        length = lengths[i][:, 1:].flatten()
        print("There are ", obs.shape[0], " observations and ", length.shape, " patients.")
        length = np.array_split(length, n_split)
        train_and_test_one_model(obs, length, i)
def get_path(path, children):
    files = read_files(path)
    if path:
        children.append(html.P(f"files found : {len(files)}"))
        embedding = Embedding()
        embs = np.array(embedding.embeddings(files))
        matrix = similarity_matrix(embs, embs)
        index_pair = sort_matrix(matrix)
        np.save('index_pair.npy', index_pair)
        df = new_df(files)
        save_df(df, './files.csv')
        return children
    return []
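# The helpers similarity_matrix and sort_matrix are not shown in this snippet.
# A minimal sketch of what they might look like, assuming cosine similarity over
# row-vector embeddings (an assumption, not the project's actual implementation;
# the names below are deliberately different from the project's helpers).
import numpy as np

def cosine_similarity_matrix(a, b):
    # normalise each row to unit length, then a dot product gives cosine similarity
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return a @ b.T

def sorted_index_pairs(matrix):
    # return (row, col) index pairs ordered from most to least similar
    order = np.argsort(matrix, axis=None)[::-1]
    return np.column_stack(np.unravel_index(order, matrix.shape))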
def get_generator(trainingdir, spa, params):
    encoder = sampling_encoder(params)
    sampled_dsets, event_dsets = read_files(trainingdir, params)
    for sampled_dset, event_dset in zip(sampled_dsets, event_dsets):
        data_gen = data_generator(
            spa,
            sampled_dset.data,
            window_len=params['window_len'],
            labels=event_dset.data,
            encoder=encoder,
            batch_size=1,  # this is required for saving images
            amplitude_norm=params['amplitude_norm'],
            loop=False)
        yield from data_gen
def main():
    root = Path("corpus-20090418")
    files = list(root.iterdir())
    # files = files[:2]
    lsh = LSH(read_files(files))
    # plagiarism_table = lsh.rough_jaccard_test()
    # task_format_print(files, plagiarism_table)
    # plagiarism_table_min_hash = lsh.min_hashing()
    # task_format_print(files, plagiarism_table_min_hash)
    plagiarism_table_min_hash = lsh.min_hashing(shingles_mode=True)
    task_format_print(files, plagiarism_table_min_hash)
def main():
    if len(sys.argv) != 2:
        print("Usage: folder containing mel files")
        return 1
    mels, keys = utils.read_files(sys.argv[1], "key")
    print("_______Key_ID_______")
    for l in range(1, 25):
        cv_test(mels, keys, l)
    print("Done.")
    return 0
def get_genotype_phenotype(self):
    '''
    get the relevant genotype phenotype data
    '''
    data = utils.read_files(self.gene_file)
    # remove nc?
    if self.closeness:
        nc_variants = utils.extract_nc_from_vcf(self.vep_vcf_file, self.closeness)
        utils.remove_noncoding(data, nc_variants)
    utils.cleanse_variants(data)
    # remove batch effect?
    if self.binom_cutoff:
        batch_artefacts = self.get_batch_artefacts(data)
        data = self.remove_batch_artefacts(data, batch_artefacts)
    return data
def read_corpus(fnames):
    elems = []
    already_selected = set()
    for line in utils.read_files(fnames):
        data = json.loads(line)
        proto_signature = data['proto_signature']
        if should_skip(proto_signature):
            continue
        processed = preprocess(proto_signature)
        if frozenset(processed) not in already_selected:
            elems.append((processed, data['signature']))
            already_selected.add(frozenset(processed))
    return [gensim.models.doc2vec.TaggedDocument(trace, [i, signature])
            for i, (trace, signature) in enumerate(elems)]
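# The TaggedDocument corpus returned above is the usual input to gensim's Doc2Vec.
# A minimal training sketch; vector_size, min_count, and epochs are illustrative
# values, not the project's actual settings.
import gensim

def train_doc2vec(corpus):
    model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model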
def main(): test, emission, transition, output = read_files() emission, transition = get_nested_dictionaries(emission, transition) initial = transition["S"] prediction = [] """WRITE YOUR VITERBI IMPLEMENTATION HERE""" # print("emission: ",emission) # print() # print("transition: ",transition) # print() backtrack = {} probs = {} for tag in initial: probs[(0, tag)] = math.log(initial[tag]) + math.log( emission[tag][test[0][0]]) backtrack[(0, tag)] = 0 print(probs) for message in test: for i in range(1, len(message)): word = message[i] for tag in emission: rates = float("-inf") best = "" for prev in transition[tag]: likelihood = math.log(transition[prev][tag]) + probs[ (i - 1, prev)] + math.log(emission[tag][word]) if rates < likelihood: rates = likelihood best = prev backtrack[(i, tag)] = (i - 1, best) probs[(i, tag)] = rates best = "" rate = float("-inf") for tag in emission: if rate < probs[len(message) - 1, tag]: best = tag rate = probs[len(message) - 1, tag] i = len(message) - 1 tuple = (i, best) while i >= 0: prediction.insert(0, (message[i], tuple[1])) tuple = backtrack[tuple] i -= 1 print('Your Output is:', prediction, '\n Expected Output is:', output)
def get_stack_traces_for_signature(fnames, signature, traces_num=100):
    traces = set()

    # query stack traces online
    url = 'https://crash-stats.mozilla.com/api/SuperSearch'
    params = {
        'signature': '=' + signature,
        '_facets': ['proto_signature'],
        '_facets_size': traces_num,
        '_results_number': 0
    }
    res = utils.get_with_retries(url, params)
    records = res.json()['facets']['proto_signature']
    for record in records:
        traces.add(record['term'])

    # query stack traces from downloaded data
    for line in utils.read_files(fnames):
        data = json.loads(line)
        if data['signature'] == signature:
            traces.add(data['proto_signature'])

    return list(traces)
def dump_maybe(self, s, prefix="pathfinder"):
    '''
    Concretize inputs and write to file, given a state.
    This only dumps the outputs when they are a) different from an existing
    output/input and b) the path depended on the input (there have to be
    constraints on it).
    '''
    data = []
    if not self.stdin:
        try:
            cur_data = s.posix.dump_file_by_path(self.sym_file_name)
        except BaseException:
            print("Could not dump file")
            return False
    else:
        cur_data = s.posix.dumps(0)
    if len(cur_data) > 0:
        data.append(cur_data)
    if len(data) > 0:
        queue = utils.read_files(self.queue_dir)
        # compare to inputs to avoid doublettes
        data = self.filter_compare(queue, data)
        if len(data) > 0:
            # print("[+] Found " + str(len(data)) + " unique! (Writing to disk..)")
            self.write_outputs(data, prefix)
            return True
    return False
def main(): """ Main program for ADI data reduction, configured with a call to adiparam.GetConfig(), which brings up a GUI to set parameters. The pipeline is currently designed for SEEDS data taken without an occulting mask. You must have scipy, numpy, pyephem, multiprocessing, and matplotlib installed to use this pipeline. """ parser = optparse.OptionParser(usage=__doc__) parser.add_option("-p", "--prefix", dest="prefix", default="HICA", help="Specify raw file name prefix (default=%default)") opts, args = parser.parse_args() exec_path = os.path.dirname(os.path.realpath(__file__)) filesetup, adipar, locipar = GetConfig(prefix=opts.prefix) nframes = len(filesetup.framelist) ngroup = 1 + int((nframes - 1) / locipar.max_n) flat = pyf.open(filesetup.flat) if filesetup.pixmask is not None: hotpix = pyf.open(filesetup.pixmask) else: hotpix = None dimy, dimx = pyf.open(filesetup.framelist[0])[-1].data.shape mem, ncpus, storeall = utils.config(nframes, dimy * dimx) if filesetup.scale_phot: x, y = np.meshgrid(np.arange(7) - 3, np.arange(7) - 3) window = (x**2 + y**2 < 2.51**2) * 1.0 window /= np.sum(window) ref_phot, ref_psf = photometry.calc_phot(filesetup, adipar, flat, hotpix, mem, window) else: ref_psf = None ref_phot = None ################################################################ # WCS coordinates are not reliable in HiCIAO data with the image # rotator off. Compute parallactic angle. Otherwise, trust the # WCS coordinates. ################################################################ if 'HICA' in filesetup.framelist[0]: pa = np.asarray([transform.get_pa(frame) * -1 * np.pi / 180 for frame in filesetup.framelist]) else: pa = np.ones(len(filesetup.framelist)) for i in range(len(filesetup.framelist)): cd2_1 = pyf.open(filesetup.framelist[i])[0].header['cd2_1'] cd2_2 = pyf.open(filesetup.framelist[i])[0].header['cd2_2'] pa[i] = -np.arctan2(cd2_1, cd2_2) fullframe = re.sub("-C.*fits", ".fits", filesetup.framelist[0]) try: objname = pyf.open(fullframe)[0].header['OBJECT'] except: objname = "Unknown_Object" objname = re.sub(' ', '_', objname) np.savetxt(filesetup.output_dir + '/' + objname + '_palist.dat', pa) dr_rms = None #################################################################### # Default save/resume points: destriping, recentering, final files # Configuration gives the option to skip the destriping step (only # performing a flat-field), the dewarping, and the centering. #################################################################### if np.all(utils.check_files(filesetup, ext="_r")): print "\nResuming reduction from recentered files." 
if ngroup == 1: flux = utils.read_files(filesetup, ext="_r") else: flux = utils.read_files(filesetup, ext="_r") else: if storeall and np.all(utils.check_files(filesetup, ext="_ds")): flux = utils.read_files(filesetup, ext="_ds") elif not np.all(utils.check_files(filesetup, ext="_ds")): flux = parallel._destripe(filesetup, flat, hotpix, mem, adipar, write_files=True, storeall=storeall, full_destripe=adipar.full_destripe, do_horiz=adipar.full_destripe) else: flux = None if adipar.dewarp: flux = parallel._dewarp(filesetup, mem, flux=flux, storeall=storeall) if adipar.do_centroid: centers, dr_rms = centroid.fit_centroids(filesetup, flux, pa, storeall=storeall, objname=objname, method=adipar.center, psf_dir=exec_path+'/psfref', ref_psf=ref_psf) #centers = np.ndarray((nframes, 2)) #centers[:, 0] = 1026 - 128 #centers[:, 1] = 949 + 60 #dr_rms = 30 np.savetxt(filesetup.output_dir + '/' + objname + '_centers.dat', centers) #################################################################### # Recenter the data onto a square array of the largest dimension # such that the entire array has data #################################################################### mindim = min(dimy - centers[:, 0].max(), centers[:, 0].min(), dimx - centers[:, 1].max(), centers[:, 1].min()) mindim = int(mindim) * 2 - 1 flux = parallel._rotate_recenter(filesetup, flux, storeall=storeall, centers=centers, newdimen=mindim, write_files=True) nframes = len(filesetup.framelist) #################################################################### # Perform scaled PCA on the flux array; alternatively, read in an # array of principal components. Neither is currently used. #################################################################### if False: pcapath = '/scr/wakusei1/users/tbrandt' flux, pca_arr = pca.pca(flux, ncomp=20, nread=2, dosub=True, pcadir=pcapath + '/psfref') for i in range(nframes): out = pyf.HDUList(pyf.PrimaryHDU(flux[i].astype(np.float32), pyf.open(filesetup.framelist[i])[0].header)) rootfile = re.sub('.*/', '', filesetup.framelist[i]) out.writeto(filesetup.reduce_dir + '/' + re.sub('.fits', '_r.fits', rootfile), clobber=True) if dr_rms is None: dr_rms = 20 elif False: pca_dir = '.' npca = 40 pca_arr = np.zeros((npca, flux.shape[1], flux.shape[2]), np.float32) for i in range(npca): tmp = pyf.open(pca_dir + '/pcacomp_' + str(i) + '.fits')[0].data dy, dx = [tmp.shape[0] // 2, tmp.shape[1] // 2] pca_arr[i, yc - dy:yc + dy + 1, xc - dx:xc + dx + 1] = tmp else: pca_arr = None #################################################################### # Find the n closest matches to each frame. Not currently used. #################################################################### if False: corr = pca.allcorr(range(int(locipar.rmax)), flux, n=80) ngroup = 1 else: corr = None #################################################################### # Subtract a radial profile from each frame. Not currently used. 
#################################################################### if False: flux = parallel._radialsub(filesetup, flux, mode='median', center=None, rmax=None, smoothwidth=0) #################################################################### # Run LOCI if that ADI reduction method is chosen #################################################################### partial_sub = None full_pa = pa.copy() full_framelist = [frame for frame in filesetup.framelist] for igroup in range(ngroup): if ngroup > 1: filesetup.framelist = full_framelist[igroup::ngroup] if np.all(utils.check_files(filesetup, ext="_r")): flux = utils.read_files(filesetup, ext="_r") else: print "Unable to read recentered files for LOCI." sys.exit() pa = full_pa[igroup::ngroup] x = np.arange(flux.shape[1]) - flux.shape[1] // 2 x, y = np.meshgrid(x, x) r = np.sqrt(x**2 + y**2) if adipar.adi == 'LOCI': ################################################################ # Set the maximum radius at which to perform LOCI ################################################################ deltar = np.sqrt(np.pi * locipar.fwhm**2 / 4 * locipar.npsf) rmax = int(flux.shape[1] // 2 - deltar - 50) locipar.rmax = min(locipar.rmax, rmax) if dr_rms is None: nf, dy, dx = flux.shape fluxmed = np.median(flux, axis=0)[dy // 2 - 100:dy // 2 + 101, dx // 2 - 100:dx // 2 + 101] sat = fluxmed > 0.7 * fluxmed.max() r2 = r[dy//2 - 100:dy//2 + 101, dx//2 - 100:dx//2 + 101]**2 dr_rms = np.sqrt(np.sum(r2 * sat) / np.sum(sat)) ################################################################ # This is regular LOCI ################################################################ if locipar.feedback == 0: partial_sub = loci.loci(flux, pa, locipar, mem, mode='LOCI', pca_arr=None, r_ex=dr_rms, corr=corr, method='matrix', do_partial_sub=True, sub_dir=exec_path) ################################################################ # The next block runs LOCI once, de-rotates, takes the median, # and re-rotates to each frame's position angle. It then runs # LOCI again to over-correct the result. Not recommended for # SEEDS data with AO188. 
################################################################ else: fluxref = np.ndarray(flux.shape, np.float32) fluxref[:] = flux loci.loci(fluxref, pca_arr, pa, locipar, mem, mode='LOCI', r_ex=dr_rms, pca_arr=pca_arr, corr=corr, method='matrix', do_partial_sub=False) for i in range(flux.shape[0]): np.putmask(fluxref[i], r > locipar.rmax - 1, 0) np.putmask(fluxref[i], r < dr_rms + 1, 0) locipar.rmax -= 100 fluxref = parallel._rotate_recenter(filesetup, fluxref, theta=pa) for i in range(flux.shape[0]): np.putmask(fluxref[i], r > locipar.rmax - 1, 0) np.putmask(fluxref[i], r < dr_rms + 1, 0) locipar.rmax -= 100 fluxmed = np.median(fluxref, axis=0) for i in range(flux.shape[0]): fluxref[i] = fluxmed * locipar.feedback fluxref = parallel._rotate_recenter(filesetup, fluxref, theta=-pa) loci.loci(flux, pa, locipar, mem, mode='refine', fluxref=fluxref, pca_arr=pca_arr, rmin=dr_rms, r_ex=dr_rms) ################################################################ # Mask saturated areas (< dr_rms), do median subtraction at radii # beyond the limit of the LOCI reduction ################################################################ fluxmed = np.median(flux, axis=0) for i in range(flux.shape[0]): np.putmask(flux[i], r < dr_rms + 2, 0) np.putmask(flux[i], r > locipar.rmax - 1, flux[i] - fluxmed) #################################################################### # Alternative to LOCI: median PSF subtraction #################################################################### elif adipar.adi == 'median': medpsf = np.median(flux, axis=0) for i in range(flux.shape[0]): flux[i] -= medpsf else: print "Error: ADI reduction method " + adipar.adi + " not recognized." #sys.exit(1) #################################################################### # Derotate, combine flux array using mean/median hybrid (see # Brandt+ 2012), measure standard deviation at each radius #################################################################### if igroup == 0: newhead = utils.makeheader(flux[0], pyf.open(fullframe)[0].header, full_framelist, adipar, locipar) flux = parallel._rotate_recenter(filesetup, flux, theta=pa) fluxtmp, noise = combine.meanmed(flux) fluxbest = fluxtmp / ngroup if partial_sub is not None: partial_sub_tot = partial_sub / ngroup else: flux = parallel._rotate_recenter(filesetup, flux, theta=pa) fluxtmp, noise = combine.meanmed(flux) fluxbest += fluxtmp / ngroup if partial_sub is not None: partial_sub_tot += partial_sub / ngroup filesetup.framelist = full_framelist if partial_sub is not None: partial_sub = partial_sub_tot #################################################################### # Rescale all arrays to 2001x2001 so that the center is pixel number # (1000, 1000) indexed from 0. Use NaN to pad arrays. 
#################################################################### fluxbest = utils.arr_resize(fluxbest) if partial_sub is not None: partial_sub = utils.arr_resize(partial_sub, newdim=fluxbest.shape[0]).astype(np.float32) fluxbest /= partial_sub out = pyf.HDUList(pyf.PrimaryHDU(partial_sub)) out.writeto('partial_sub2.fits', clobber=True) x, y = np.meshgrid(np.arange(7) - 3, np.arange(7) - 3) window = (x**2 + y**2 < 2.51**2) * 1.0 window /= np.sum(window) fluxbest = signal.convolve2d(fluxbest, window, mode='same') noise = combine.radprof(fluxbest, mode='std', smoothwidth=2, sigrej=4.5)[0] r = utils.arr_resize(r) if dr_rms is not None: np.putmask(fluxbest, r < dr_rms + 3, np.nan) np.putmask(fluxbest, r > locipar.rmax - 2, np.nan) fluxsnr = (fluxbest / noise).astype(np.float32) #################################################################### # 5-sigma sensitivity maps--just multiply by the scaled aperture # photometry of the central star #################################################################### if partial_sub is not None: sensitivity = noise * 5 / partial_sub #################################################################### # Photometry of the central star #################################################################### if filesetup.scale_phot: #ref_phot = photometry.calc_phot(filesetup, adipar, flat, # hotpix, mem, window)[0] sensitivity /= ref_phot fluxbest /= ref_phot noise /= ref_phot sig_sens = combine.radprof(sensitivity, mode='std', smoothwidth=0)[0] outfile = open(filesetup.output_dir + '/' + objname + '_5sigma_sensitivity.dat', 'w') for i in range(sig_sens.shape[0] // 2, sig_sens.shape[0]): iy = sig_sens.shape[0] // 2 if np.isfinite(sensitivity[iy, i]): outfile.write('%8d %12.5e %12.5e %12e\n' % (i - iy, sensitivity[iy, i], sig_sens[iy, i], partial_sub[iy, i])) outfile.close() else: np.savetxt(filesetup.output_dir + '/' + objname + '_noiseprofile.dat', noise[noise.shape[0] // 2, noise.shape[1] // 2:].T) #################################################################### # Write the output fits files. #################################################################### snr = pyf.HDUList(pyf.PrimaryHDU(fluxsnr.astype(np.float32), newhead)) final = pyf.HDUList(pyf.PrimaryHDU(fluxbest.astype(np.float32), newhead)) if partial_sub is not None: contrast = pyf.HDUList(pyf.PrimaryHDU(sensitivity.astype(np.float32), newhead)) name_base = filesetup.output_dir + '/' + objname snr.writeto(name_base + '_snr.fits', clobber=True) final.writeto(name_base + '_final.fits', clobber=True) if partial_sub is not None: contrast.writeto(name_base + '_5sigma_sensitivity.fits', clobber=True)
def generalGreedy_node_parallel(filename, G, budget, h_l, gamma1, gamma2,
                                beta1=1.0, beta2=1.0, type_algo=1):
    '''
    Finds initial seed set S using general greedy heuristic
    Input: G -- networkx Graph object
    k -- number of initial nodes needed
    p -- propagation probability
    Output: S -- initial set of k nodes to propagate
    '''
    # import time
    # start = time.time()
    # R = 200  # number of times to run Random Cascade

    S = []  # set of selected nodes
    influenced = []
    influenced_a = []
    influenced_b = []
    influenced_c = []
    seeds_a = []
    seeds_b = []
    seeds_c = []
    seed_range = []

    if type_algo == 1:
        filename = filename + '_greedy_'
    elif type_algo == 2:
        filename = filename + f'_log_gamma_{gamma1,gamma2}_'
    elif type_algo == 3:
        filename = filename + f'_root_gamma_{gamma1}_beta_{beta1,beta2}_'
    elif type_algo == 4:
        filename = filename + f'_root_majority_gamma_{gamma1}_beta_{beta1,beta2}_'

    stats = ut.graph_stats(G, print_stats=False)

    try:
        influenced, influenced_a, influenced_b, influenced_c, seeds_a, seeds_b, seeds_c = \
            ut.read_files(filename)
        S = seeds_a[-1] + seeds_b[-1] + seeds_c[-1]
        if len(S) >= budget:
            # ut.write_files(filename, influenced, influenced_a, influenced_b, seeds_a, seeds_b)
            print(influenced_a)
            print("\n\n")
            print(influenced_b)
            print("\n\n")
            print(influenced_c)
            print(" Seed length ", len(S))
            ut.plot_influence(influenced_a, influenced_b, influenced_c, len(S), filename,
                              stats['group_a'], stats['group_b'], stats['group_c'],
                              [len(S_a) for S_a in seeds_a],
                              [len(S_b) for S_b in seeds_b],
                              [len(S_c) for S_c in seeds_c])
            return (influenced, influenced_a, influenced_b, influenced_c,
                    seeds_a, seeds_b, seeds_c)
        else:
            seed_range = range(budget - len(S))
    except FileNotFoundError:
        print(f'{filename} not Found ')
        seed_range = range(budget)

    # add node to S if achieves maximum propagation for current chosen + this node
    for i in seed_range:  # cannot parallellize
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        # results = None
        if type_algo == 1:
            results = pool.starmap(map_select_next_seed_set_cover,
                                   zip(repeat(G), repeat(S), list(G.nodes()), repeat(h_l)))
            # results = pool.map(map_select_next_seed_greedy, ((G, S, v, h_l) for v in G.nodes()))
        elif type_algo == 2:
            results = pool.map(map_select_next_seed_log_greedy,
                               ((G, S, v, gamma1, gamma2) for v in G.nodes()))
        elif type_algo == 3:
            results = pool.map(map_select_next_seed_root_greedy,
                               ((G, S, v, gamma1, beta1, beta2) for v in G.nodes()))
        elif type_algo == 4:
            results = pool.map(map_select_next_seed_root_majority_greedy,
                               ((G, S, v, gamma1) for v in G.nodes()))
        pool.close()
        pool.join()

        s = PQ()  # priority queue
        # if results == None:
        for v, priority, p_a, p_b, p_c in results:
            # run R times Random Cascade. The gain of parallelizing isn't a lot as
            # the one runIC is not very complex, maybe for huge graphs
            s.add_task(v, -priority)

        node, priority = s.pop_item()
        S.append(node)
        I, I_a, I_b, I_c = map_fair_IC((G, S, h_l))
        influenced.append(I)
        influenced_a.append(I_a)
        influenced_b.append(I_b)
        influenced_c.append(I_c)
        S_red = []
        S_blue = []
        S_purple = []
        group = G.nodes[node]['color']

        print(str(i + 1) + ' Selected Node is ' + str(node) + ' group ' + str(group) +
              ' Ia = ' + str(I_a) + ' Ib = ' + str(I_b) + ' Ic = ' + str(I_c))

        for n in S:
            if G.nodes[n]['color'] == 'red':
                S_red.append(n)
            elif G.nodes[n]['color'] == 'blue':  # elif, so red nodes are not also counted as purple
                S_blue.append(n)
            else:
                S_purple.append(n)

        seeds_a.append(S_red)  # id's of the seeds so the influence can be recreated
        seeds_b.append(S_blue)
        seeds_c.append(S_purple)
        # print(i, k, time.time() - start)

    # print("\n \n I shouldn't be here. ********* \n \n ")

    ut.plot_influence(influenced_a, influenced_b, influenced_c, len(S), filename,
                      stats['group_r'], stats['group_b'], stats['group_n'],
                      [len(S_a) for S_a in seeds_a],
                      [len(S_b) for S_b in seeds_b],
                      [len(S_c) for S_c in seeds_c])
    ut.write_files(filename, influenced, influenced_a, influenced_b, influenced_c,
                   seeds_a, seeds_b, seeds_c)

    return (influenced, influenced_a, influenced_b, influenced_c,
            seeds_a, seeds_b, seeds_c)
def run_classifier():
    u.create_required_directories()

    """ Get the data """
    x_train, train_paths = u.read_files(train_data_path, training_subjects, folder_name)
    x_test, test_paths = u.read_files(test_data_path, testing_subjects, folder_name)
    x_validation, val_paths = u.read_files(val_data_path, testing_subjects, folder_name)

    x_train = np.array(x_train)
    x_test = np.array(x_test)
    x_validation = np.array(x_validation)
    print(x_train.shape, x_test.shape, x_validation.shape)

    np.random.shuffle(x_train)

    x_train, y_train, _ = u.split_x_y(x_train)
    x_test, y_test, y_test_labeled = u.split_x_y(x_test)
    x_validation, y_validation, _ = u.split_x_y(x_validation)

    # x_train = u.pad_rows(x_train, 60, True)
    # x_test = u.pad_rows(x_test, 60, True)
    # x_validation = u.pad_rows(x_validation, 60, True)

    print(x_train.shape, x_test.shape)
    print(y_train.shape, y_test.shape)

    """ Get the model """
    # model = m.simple_rnn_model(len(labels), x_train.shape)
    model = m.simple_rnn_model(len(labels), len(x_train[0]))
    print(model.summary())

    adam = Adam(lr=learning_rate)
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    # Fit the model
    model = u.train_model(model, x_train, y_train, x_validation, y_validation,
                          time_stamp, epochs, batch_size)

    """ Evaluate the model """
    prediction_y = model.predict(x_test)
    pred = np.array(prediction_y)
    pred = pred.argmax(1)
    pred = np.array(pred).astype(int)
    y_test_labeled = np.array(y_test_labeled).astype(int)

    cf = confusion_matrix(y_test_labeled, pred)
    print(cf)
    with open("confusion_matrix/{}".format(time_stamp), "w") as file:
        sentences = []
        for line in cf:
            s = ""
            for item in line:
                s += "{},".format(item)
            sentences.append(s + "\n")
        file.writelines(sentences)

    print(labels)
    print(classification_report(y_test_labeled, pred))

    scores = model.evaluate(x_test, y_test)
    print(scores)
    print("Accuracy: %.2f%%" % (scores[1] * 100))

    if load_checked and model_path != "":
        print("Initiating Model: " + model_path)

    print(train_paths)
    print(test_paths)
    print(folder_name, epochs, batch_size, learning_rate, time_stamp)
def __init__(self, args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    self.input_path = args.input_path
    self.lr = args.lr
    self.l2 = args.l2
    self.dr = args.dr
    self.momentum = args.momentum
    self.train_portion = args.train_portion
    self.data_type = args.data_type
    self.n_epoch = args.n_epoch
    self.batch_size = args.batch_size
    self.thr = args.thr
    self.gpu = args.gpu
    self.mode = args.mode
    self.net_type = args.net
    self.output = Output(args)
    self.sprint = self.output.sprint
    self.dprint = self.output.dprint
    self.best_loss = np.inf

    files = ut.read_files(self.input_path, self.data_type)
    if self.mode == 'debug':
        files = files[:min(100, len(files))]
    np.random.shuffle(files)

    self.input_path = self.input_path
    self.n_file = len(files)
    self.n_train = int(self.train_portion * self.n_file)
    self.n_test = self.n_file - self.n_train
    self.train_files = files[:self.n_train]
    self.test_files = files[self.n_train:]
    self.train_data = Data(args.input_path, self.train_files, args.batch_size, args.thr)
    self.test_data = Data(args.input_path, self.test_files, args.batch_size, args.thr)
    print('Training set: %i photos\nTest set: %i photos' % (self.n_train, self.n_test))

    if args.act_fn == 'relu':
        self.act_fn = F.relu
    elif args.act_fn == 'sigmoid':
        self.act_fn = F.sigmoid
    else:  # none
        self.act_fn = lambda x: x

    if self.net_type == 'cnn':
        self.net_ = CNN_Net(self.act_fn, self.dr)
    else:  # feature
        self.net_ = Feature_Net(self.thr)
    self.criterion_ = nn.MSELoss()

    if self.gpu > -1:
        self.net = self.net_.cuda(self.gpu)
        self.criterion = self.criterion_.cuda(self.gpu)
    else:
        self.net = self.net_
        self.criterion = self.criterion_

    if args.optim == 'adagrad':
        self.optim = torch.optim.Adagrad(self.net.parameters(), lr=self.lr,
                                         weight_decay=self.l2,
                                         initial_accumulator_value=args.x)
    elif args.optim == 'adam':
        self.optim = torch.optim.Adam(self.net.parameters(), lr=self.lr,
                                      weight_decay=self.l2)
    else:  # sgd
        self.optim = torch.optim.SGD(self.net.parameters(), lr=self.lr,
                                     weight_decay=self.l2, momentum=self.momentum)
    print(self.net)
def main(): """ Main program for ADI data reduction, configured with a call to adiparam.GetConfig(), which brings up a GUI to set parameters. The pipeline is currently designed for SEEDS data taken without an occulting mask. You must have scipy, numpy, pyephem, multiprocessing, and matplotlib installed to use this pipeline. """ parser = optparse.OptionParser(usage=__doc__) parser.add_option("-p", "--prefix", dest="prefix", default="HICA", help="Specify raw file name prefix (default=%default)") opts, args = parser.parse_args() exec_path = os.path.dirname(os.path.realpath(__file__)) filesetup, adipar, locipar = GetConfig(prefix=opts.prefix) nframes = len(filesetup.framelist) ngroup = 1 + int((nframes - 1) / locipar.max_n) flat = pyf.open(filesetup.flat) if filesetup.pixmask is not None: hotpix = pyf.open(filesetup.pixmask) else: hotpix = None dimy, dimx = pyf.open(filesetup.framelist[0])[-1].data.shape mem, ncpus, storeall = utils.config(nframes, dimy * dimx) if filesetup.scale_phot: x, y = np.meshgrid(np.arange(7) - 3, np.arange(7) - 3) window = (x**2 + y**2 < 2.51**2) * 1.0 window /= np.sum(window) ref_phot, ref_psf = photometry.calc_phot(filesetup, adipar, flat, hotpix, mem, window) else: ref_psf = None ref_phot = None ################################################################ # WCS coordinates are not reliable in HiCIAO data with the image # rotator off. Compute parallactic angle. Otherwise, trust the # WCS coordinates. ################################################################ if 'HICA' in filesetup.framelist[0]: pa = np.asarray([ transform.get_pa(frame) * -1 * np.pi / 180 for frame in filesetup.framelist ]) else: pa = np.ones(len(filesetup.framelist)) for i in range(len(filesetup.framelist)): cd2_1 = pyf.open(filesetup.framelist[i])[0].header['cd2_1'] cd2_2 = pyf.open(filesetup.framelist[i])[0].header['cd2_2'] pa[i] = -np.arctan2(cd2_1, cd2_2) fullframe = re.sub("-C.*fits", ".fits", filesetup.framelist[0]) try: objname = pyf.open(fullframe)[0].header['OBJECT'] except: objname = "Unknown_Object" objname = re.sub(' ', '_', objname) np.savetxt(filesetup.output_dir + '/' + objname + '_palist.dat', pa) dr_rms = None #################################################################### # Default save/resume points: destriping, recentering, final files # Configuration gives the option to skip the destriping step (only # performing a flat-field), the dewarping, and the centering. #################################################################### if np.all(utils.check_files(filesetup, ext="_r")): print "\nResuming reduction from recentered files." 
if ngroup == 1: flux = utils.read_files(filesetup, ext="_r") else: flux = utils.read_files(filesetup, ext="_r") else: if storeall and np.all(utils.check_files(filesetup, ext="_ds")): flux = utils.read_files(filesetup, ext="_ds") elif not np.all(utils.check_files(filesetup, ext="_ds")): flux = parallel._destripe(filesetup, flat, hotpix, mem, adipar, write_files=True, storeall=storeall, full_destripe=adipar.full_destripe, do_horiz=adipar.full_destripe) else: flux = None if adipar.dewarp: flux = parallel._dewarp(filesetup, mem, flux=flux, storeall=storeall) if adipar.do_centroid: centers, dr_rms = centroid.fit_centroids(filesetup, flux, pa, storeall=storeall, objname=objname, method=adipar.center, psf_dir=exec_path + '/psfref', ref_psf=ref_psf) #centers = np.ndarray((nframes, 2)) #centers[:, 0] = 1026 - 128 #centers[:, 1] = 949 + 60 #dr_rms = 30 np.savetxt(filesetup.output_dir + '/' + objname + '_centers.dat', centers) #################################################################### # Recenter the data onto a square array of the largest dimension # such that the entire array has data #################################################################### mindim = min(dimy - centers[:, 0].max(), centers[:, 0].min(), dimx - centers[:, 1].max(), centers[:, 1].min()) mindim = int(mindim) * 2 - 1 flux = parallel._rotate_recenter(filesetup, flux, storeall=storeall, centers=centers, newdimen=mindim, write_files=True) nframes = len(filesetup.framelist) #################################################################### # Perform scaled PCA on the flux array; alternatively, read in an # array of principal components. Neither is currently used. #################################################################### if False: pcapath = '/scr/wakusei1/users/tbrandt' flux, pca_arr = pca.pca(flux, ncomp=20, nread=2, dosub=True, pcadir=pcapath + '/psfref') for i in range(nframes): out = pyf.HDUList( pyf.PrimaryHDU(flux[i].astype(np.float32), pyf.open(filesetup.framelist[i])[0].header)) rootfile = re.sub('.*/', '', filesetup.framelist[i]) out.writeto(filesetup.reduce_dir + '/' + re.sub('.fits', '_r.fits', rootfile), clobber=True) if dr_rms is None: dr_rms = 20 elif False: pca_dir = '.' npca = 40 pca_arr = np.zeros((npca, flux.shape[1], flux.shape[2]), np.float32) for i in range(npca): tmp = pyf.open(pca_dir + '/pcacomp_' + str(i) + '.fits')[0].data dy, dx = [tmp.shape[0] // 2, tmp.shape[1] // 2] pca_arr[i, yc - dy:yc + dy + 1, xc - dx:xc + dx + 1] = tmp else: pca_arr = None #################################################################### # Find the n closest matches to each frame. Not currently used. #################################################################### if False: corr = pca.allcorr(range(int(locipar.rmax)), flux, n=80) ngroup = 1 else: corr = None #################################################################### # Subtract a radial profile from each frame. Not currently used. 
#################################################################### if False: flux = parallel._radialsub(filesetup, flux, mode='median', center=None, rmax=None, smoothwidth=0) #################################################################### # Run LOCI if that ADI reduction method is chosen #################################################################### partial_sub = None full_pa = pa.copy() full_framelist = [frame for frame in filesetup.framelist] for igroup in range(ngroup): if ngroup > 1: filesetup.framelist = full_framelist[igroup::ngroup] if np.all(utils.check_files(filesetup, ext="_r")): flux = utils.read_files(filesetup, ext="_r") else: print "Unable to read recentered files for LOCI." sys.exit() pa = full_pa[igroup::ngroup] x = np.arange(flux.shape[1]) - flux.shape[1] // 2 x, y = np.meshgrid(x, x) r = np.sqrt(x**2 + y**2) if adipar.adi == 'LOCI': ################################################################ # Set the maximum radius at which to perform LOCI ################################################################ deltar = np.sqrt(np.pi * locipar.fwhm**2 / 4 * locipar.npsf) rmax = int(flux.shape[1] // 2 - deltar - 50) locipar.rmax = min(locipar.rmax, rmax) if dr_rms is None: nf, dy, dx = flux.shape fluxmed = np.median(flux, axis=0)[dy // 2 - 100:dy // 2 + 101, dx // 2 - 100:dx // 2 + 101] sat = fluxmed > 0.7 * fluxmed.max() r2 = r[dy // 2 - 100:dy // 2 + 101, dx // 2 - 100:dx // 2 + 101]**2 dr_rms = np.sqrt(np.sum(r2 * sat) / np.sum(sat)) ################################################################ # This is regular LOCI ################################################################ if locipar.feedback == 0: partial_sub = loci.loci(flux, pa, locipar, mem, mode='LOCI', pca_arr=None, r_ex=dr_rms, corr=corr, method='matrix', do_partial_sub=True, sub_dir=exec_path) ################################################################ # The next block runs LOCI once, de-rotates, takes the median, # and re-rotates to each frame's position angle. It then runs # LOCI again to over-correct the result. Not recommended for # SEEDS data with AO188. 
################################################################ else: fluxref = np.ndarray(flux.shape, np.float32) fluxref[:] = flux loci.loci(fluxref, pca_arr, pa, locipar, mem, mode='LOCI', r_ex=dr_rms, pca_arr=pca_arr, corr=corr, method='matrix', do_partial_sub=False) for i in range(flux.shape[0]): np.putmask(fluxref[i], r > locipar.rmax - 1, 0) np.putmask(fluxref[i], r < dr_rms + 1, 0) locipar.rmax -= 100 fluxref = parallel._rotate_recenter(filesetup, fluxref, theta=pa) for i in range(flux.shape[0]): np.putmask(fluxref[i], r > locipar.rmax - 1, 0) np.putmask(fluxref[i], r < dr_rms + 1, 0) locipar.rmax -= 100 fluxmed = np.median(fluxref, axis=0) for i in range(flux.shape[0]): fluxref[i] = fluxmed * locipar.feedback fluxref = parallel._rotate_recenter(filesetup, fluxref, theta=-pa) loci.loci(flux, pa, locipar, mem, mode='refine', fluxref=fluxref, pca_arr=pca_arr, rmin=dr_rms, r_ex=dr_rms) ################################################################ # Mask saturated areas (< dr_rms), do median subtraction at radii # beyond the limit of the LOCI reduction ################################################################ fluxmed = np.median(flux, axis=0) for i in range(flux.shape[0]): np.putmask(flux[i], r < dr_rms + 2, 0) np.putmask(flux[i], r > locipar.rmax - 1, flux[i] - fluxmed) #################################################################### # Alternative to LOCI: median PSF subtraction #################################################################### elif adipar.adi == 'median': medpsf = np.median(flux, axis=0) for i in range(flux.shape[0]): flux[i] -= medpsf else: print "Error: ADI reduction method " + adipar.adi + " not recognized." #sys.exit(1) #################################################################### # Derotate, combine flux array using mean/median hybrid (see # Brandt+ 2012), measure standard deviation at each radius #################################################################### if igroup == 0: newhead = utils.makeheader(flux[0], pyf.open(fullframe)[0].header, full_framelist, adipar, locipar) flux = parallel._rotate_recenter(filesetup, flux, theta=pa) fluxtmp, noise = combine.meanmed(flux) fluxbest = fluxtmp / ngroup if partial_sub is not None: partial_sub_tot = partial_sub / ngroup else: flux = parallel._rotate_recenter(filesetup, flux, theta=pa) fluxtmp, noise = combine.meanmed(flux) fluxbest += fluxtmp / ngroup if partial_sub is not None: partial_sub_tot += partial_sub / ngroup filesetup.framelist = full_framelist if partial_sub is not None: partial_sub = partial_sub_tot #################################################################### # Rescale all arrays to 2001x2001 so that the center is pixel number # (1000, 1000) indexed from 0. Use NaN to pad arrays. 
#################################################################### fluxbest = utils.arr_resize(fluxbest) if partial_sub is not None: partial_sub = utils.arr_resize( partial_sub, newdim=fluxbest.shape[0]).astype(np.float32) fluxbest /= partial_sub out = pyf.HDUList(pyf.PrimaryHDU(partial_sub)) out.writeto('partial_sub2.fits', clobber=True) x, y = np.meshgrid(np.arange(7) - 3, np.arange(7) - 3) window = (x**2 + y**2 < 2.51**2) * 1.0 window /= np.sum(window) fluxbest = signal.convolve2d(fluxbest, window, mode='same') noise = combine.radprof(fluxbest, mode='std', smoothwidth=2, sigrej=4.5)[0] r = utils.arr_resize(r) if dr_rms is not None: np.putmask(fluxbest, r < dr_rms + 3, np.nan) np.putmask(fluxbest, r > locipar.rmax - 2, np.nan) fluxsnr = (fluxbest / noise).astype(np.float32) #################################################################### # 5-sigma sensitivity maps--just multiply by the scaled aperture # photometry of the central star #################################################################### if partial_sub is not None: sensitivity = noise * 5 / partial_sub #################################################################### # Photometry of the central star #################################################################### if filesetup.scale_phot: #ref_phot = photometry.calc_phot(filesetup, adipar, flat, # hotpix, mem, window)[0] sensitivity /= ref_phot fluxbest /= ref_phot noise /= ref_phot sig_sens = combine.radprof(sensitivity, mode='std', smoothwidth=0)[0] outfile = open( filesetup.output_dir + '/' + objname + '_5sigma_sensitivity.dat', 'w') for i in range(sig_sens.shape[0] // 2, sig_sens.shape[0]): iy = sig_sens.shape[0] // 2 if np.isfinite(sensitivity[iy, i]): outfile.write('%8d %12.5e %12.5e %12e\n' % (i - iy, sensitivity[iy, i], sig_sens[iy, i], partial_sub[iy, i])) outfile.close() else: np.savetxt(filesetup.output_dir + '/' + objname + '_noiseprofile.dat', noise[noise.shape[0] // 2, noise.shape[1] // 2:].T) #################################################################### # Write the output fits files. #################################################################### snr = pyf.HDUList(pyf.PrimaryHDU(fluxsnr.astype(np.float32), newhead)) final = pyf.HDUList(pyf.PrimaryHDU(fluxbest.astype(np.float32), newhead)) if partial_sub is not None: contrast = pyf.HDUList( pyf.PrimaryHDU(sensitivity.astype(np.float32), newhead)) name_base = filesetup.output_dir + '/' + objname snr.writeto(name_base + '_snr.fits', clobber=True) final.writeto(name_base + '_final.fits', clobber=True) if partial_sub is not None: contrast.writeto(name_base + '_5sigma_sensitivity.fits', clobber=True)
    qque.extend(re.findall(r"\b(q)\b", clean_tweet, re.IGNORECASE))   # q = que
    xpor.extend(re.findall(r"\b(x)\b", clean_tweet, re.IGNORECASE))   # x = por
    dde.extend(re.findall(r"\b(d)\b", clean_tweet, re.IGNORECASE))    # d = de
    xqs.extend(re.findall(r"\b(xq)\b", clean_tweet, re.IGNORECASE))   # xq = porque
    pqs.extend(re.findall(r"\b(pq)\b", clean_tweet, re.IGNORECASE))   # pq = porque
    # clean_tweet = clean_tweet.translate(str.maketrans('', '', string.punctuation + '¡'))  # PUNCTUATION
    return hashtags, urls, usernames, letReps, laughters, numbers, emojis, xpor, qque, dde, xqs, pqs


sc = {'¡', '!', '?', '¿'}
punctuation = ''.join([c for c in string.punctuation if c not in sc])

train_data, dev_data, test_data, valid_data = utils.read_files('all')

hashtags, urls, usernames, letReps, laughters, numbers, emojis, xpor, qque, dde, xqs, pqs = print_preprocess(train_data['content'])
hashtags_d, urls_d, usernames_d, letReps_d, laughters_d, numbers_d, emojis_d, xpor_d, qque_d, dde_d, xqs_d, pqs_d = print_preprocess(dev_data['content'])
hashtags_t, urls_t, usernames_t, letReps_t, laughters_t, numbers_t, emojis_t, xpor_t, qque_t, dde_t, xqs_t, pqs_t = print_preprocess(test_data['content'])

print('Intercesión de hashtags')  # Spanish: "hashtag intersection"
counter = 0
train_hash = dict.fromkeys(hashtags)
train_hash.update(dict.fromkeys(hashtags_d))
print('ht:{} hd:{} htest:{}'.format(len(hashtags), len(hashtags_d), len(hashtags_t)))
for hash in hashtags_t:
    if hash not in train_hash:
        counter += 1
print(counter)
print()
import logging
import random
import sys

import numpy as np
import scipy as scp
import scipy.misc
import tensorflow as tf

import fcn16_vgg
import loss
import utils

RESOURCE = '../dataset'
MODEL_PATH = "./models/model.ckpt"

logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    level=logging.INFO,
                    stream=sys.stdout)

dataset = utils.read_files(RESOURCE)
random.shuffle(dataset)
input_set, output_set = utils.split_dataset(dataset)
np.save("input_set.npy", input_set)
np.save("output_set.npy", output_set)
# input_set = np.load("input_set.npy")
# output_set = np.load("output_set.npy")

train_input_set, train_output_set, test_input_set, test_output_set \
    = utils.train_test_split(input_set, output_set, 0.1)
train_input_set, train_output_set, valid_input_set, valid_output_set \
    = utils.train_test_split(train_input_set, train_output_set, 0.1)

height = input_set.shape[1]
def generalGreedy_node_set_cover(filename, G, budget, gamma_a=1e-2, gamma_b=0, type_algo=1):
    '''
    Finds initial seed set S using general greedy heuristic
    Input: G -- networkx Graph object
    k -- fraction of population needs to be influenced in both groups
    p -- propagation probability
    Output: S -- initial set of k nodes to propagate
    '''
    # import time
    # start = time.time()
    # R = 200  # number of times to run Random Cascade

    stats = ut.graph_stats(G, print_stats=False)

    if type_algo == 1:
        filename = filename + f'_set_cover_reach_{budget}_'
    elif type_algo == 2:
        filename = filename + f'_set_cover_timings_reach_{budget}_gamma_a_{gamma_a}_gamma_b_{gamma_b}_'
    elif type_algo == 3:
        filename = filename + f'_set_cover_timings_reach_{budget}_gamma_a_{gamma_a}_gamma_b_{gamma_a}_'

    reach = 0.0
    S = []  # set of selected nodes
    # add node to S if achieves maximum propagation for current chosen + this node
    influenced = []
    influenced_a = []
    influenced_b = []
    seeds_a = []
    seeds_b = []

    try:
        influenced, influenced_a, influenced_b, seeds_a, seeds_b = ut.read_files(filename)
        reach = min(influenced_a[-1] / stats['group_a'], budget) + \
            min(influenced_b[-1] / stats['group_b'], budget)
        S = seeds_a[-1] + seeds_b[-1]
        if reach >= budget:
            # ut.write_files(filename, influenced, influenced_a, influenced_b, seeds_a, seeds_b)
            print(influenced_a)
            print("\n\n")
            print(influenced_b)
            print(f" reach: {reach}")
            ut.plot_influence(influenced_a, influenced_b, len(S), filename,
                              stats['group_a'], stats['group_b'],
                              [len(S_a) for S_a in seeds_a],
                              [len(S_b) for S_b in seeds_b])
            return (influenced, influenced_a, influenced_b, seeds_a, seeds_b)
    except FileNotFoundError:
        print(f'{filename} not Found ')

    i = 0
    while reach < 2 * budget:  # cannot parallellize
        pool = multiprocessing.Pool(multiprocessing.cpu_count() - 1)
        if type_algo == 1:
            results = pool.map(map_select_next_seed_set_cover,
                               ((G, S, v) for v in G.nodes()))
        elif type_algo == 2:
            results = pool.map(map_IC_timing,
                               ((G, S, v, gamma_a, gamma_b) for v in G.nodes()))
        elif type_algo == 3:
            results = pool.map(map_IC_timing,
                               ((G, S, v, gamma_a, gamma_a) for v in G.nodes()))
        pool.close()
        pool.join()

        s = PQ()  # priority queue
        for v, p, p_a, p_b in results:
            s.add_task(v, -(min(p_a / stats['group_a'], budget) +
                            min(p_b / stats['group_b'], budget)))

        node, priority = s.pop_item()
        # priority = -priority  # as the current priority is negative fraction
        S.append(node)
        I, I_a, I_b = map_fair_IC((G, S))
        influenced.append(I)
        influenced_a.append(I_a)
        influenced_b.append(I_b)
        S_red = []
        S_blue = []
        group = G.nodes[node]['color']
        for n in S:
            if G.nodes[n]['color'] == 'red':
                S_red.append(n)
            else:
                S_blue.append(n)
        seeds_a.append(S_red)  # id's of the seeds so the influence can be recreated
        seeds_b.append(S_blue)

        # reach += -priority  # both are fine
        reach_a = I_a / stats['group_a']
        reach_b = I_b / stats['group_b']
        reach = (min(reach_a, budget) + min(reach_b, budget))

        print(f'{i+1} Node ID {node} group {group} Ia = {I_a} Ib {I_b} '
              f'reach: {reach} reach_a {reach_a} reach_b {reach_b}')
        # print(i, k, time.time() - start)
        i += 1

    ut.plot_influence(influenced_a, influenced_b, len(S), filename,
                      stats['group_a'], stats['group_b'],
                      [len(S_a) for S_a in seeds_a],
                      [len(S_b) for S_b in seeds_b])
    ut.write_files(filename, influenced, influenced_a, influenced_b, seeds_a, seeds_b)

    return (influenced, influenced_a, influenced_b, seeds_a, seeds_b)
def run_classifier(): print("Using Model: {}".format(training_model)) u.create_required_directories() """ Step 1: Get the data """ x_train, train_paths = u.read_files(train_data_path, training_subjects, folder_name) x_test, test_paths = u.read_files(test_data_path, testing_subjects, folder_name) x_validation, val_paths = u.read_files(val_data_path, testing_subjects, folder_name) x_train = np.array(x_train) x_test = np.array(x_test) x_validation = np.array(x_validation) print(x_train.shape, x_test.shape, x_validation.shape) """ Step 1.1: Shuffle the training data """ np.random.shuffle(x_train) """ Step 1.2: Split x and y values """ x_train, y_train, _ = u.split_x_y(x_train) x_test, y_test, y_test_labeled = u.split_x_y(x_test) x_validation, y_validation, _ = u.split_x_y(x_validation) """ Step 2: Normalize the data """ mean_array = np.mean(x_train, axis=0) x_train -= mean_array x_test -= mean_array x_validation -= mean_array max_value = np.max(x_train) x_train /= float(max_value) x_test /= float(max_value) x_validation /= float(max_value) normalizing_values = { "mean_array": mean_array, "max_value": max_value } """ Step 2.2: Save the normalizing values for analysis """ u.save_object(normalizing_values, "normalizers/{}".format(time_stamp)) print(x_train.shape, x_test.shape) print(y_train.shape, y_test.shape) if training_model == "NN" or training_model == "LR": """ For Fully Connected Neural Nets and Logistic Regression, the dimension is required to be changed """ dimension = x_train.shape[1] * x_train.shape[2] * x_train.shape[3] x_train = x_train.reshape(x_train.shape[0], dimension).astype('float32') x_test = x_test.reshape(x_test.shape[0], dimension).astype('float32') x_validation = x_validation.reshape(x_validation.shape[0], dimension).astype('float32') """ Step 3: Get the model. """ model = u.get_model(x_train.shape, load_checked, model_path, len(labels), training_model) """ Step 3.1: Compile model """ adam = Adam(lr=learning_rate) model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=u.get_metrics()) print(model.summary()) """ Step 4: Train the model. 
""" if not predict_only: model = u.train_model(model, x_train, y_train, x_validation, y_validation, time_stamp, epochs, batch_size) """ Step 4.1: After training reload the model depending on the best performance on the validation dataset """ if reload_model: model = u.load_checked_model(model_path, time_stamp) """ Step 5: Evaluate the model """ # Get the predictions on test set prediction_y = model.predict(x_test) pred = np.array(prediction_y) pred = pred.argmax(1) pred = np.array(pred).astype(int) y_test_labeled = np.array(y_test_labeled).astype(int) # Get and save the confusion matrix cf = confusion_matrix(y_test_labeled, pred) with open("confusion_matrix/{}".format(time_stamp), "w") as file: sentences = [] for line in cf: s = "" for item in line: s += "{},".format(item) sentences.append(s + "\n") file.writelines(sentences) print(labels) # Show the precision, recall and F1 measure for all the classes print(classification_report(y_test_labeled, pred)) # Get and save the top-5 predictions for each word in test p_array = [] for i in range(len(prediction_y)): p = prediction_y[i] top5 = np.argpartition(p, -5)[-5:] out = [labels[y_test_labeled[i]]] for j in top5: probability = p[j] prediction_label = labels[j] out.append((prediction_label, probability)) p_array.append(out) with open("model_plots/output_{}.csv".format(time_stamp), 'w') as resultFile: wr = csv.writer(resultFile) wr.writerows(p_array) # Calculate the accuracy on the test dataset scores = model.evaluate(x_test, y_test) print(scores) print("Accuracy: %.2f%%" % (scores[1] * 100)) if load_checked and model_path != "": print("Initiating Model: " + model_path) # Print some of the parameters related to the process print(train_paths) print(test_paths) print(folder_name, epochs, batch_size, learning_rate, time_stamp)
def test_read_files(self):
    paths = ['tests/test_utils.json']
    for line in utils.read_files(paths):
        assert 'proto_signature' in line
        assert 'signature' in line
        assert 'uuid' in line
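# The test above implies that tests/test_utils.json holds one JSON record per line
# containing at least the keys it asserts on. An illustrative (made-up) record,
# purely to show the expected shape, would look like:
#
#   {"signature": "example::signature",
#    "proto_signature": "frame_a | frame_b | frame_c",
#    "uuid": "00000000-0000-0000-0000-000000000000"}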