def evaluate_single_pred(url, url2, decoder_length=8):
    cr = Crawling()
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    lt = data.shape[0] * data.shape[1]
    data = np.reshape(data, (lt, 25))
    dtl = len(data)
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    loss_mae = 0.0
    loss_rmse = 0.0
    r2_total = 0.0
    for i, d in enumerate(data):
        pred_t = np.asarray(d).flatten()
        lb_i = i * pr.strides + 24
        lbt = labels[lb_i:(lb_i + decoder_length), :, 0]
        lbg = lbt[decoder_length - 1, :].flatten()
        mae, mse, r2 = get_evaluation(pred_t, lbg)
        loss_mae += mae
        loss_rmse += mse
        r2_total += r2
        utils.update_progress((i + 1.0) / dtl)
    loss_mae = loss_mae / lt * 300
    loss_rmse = sqrt(loss_rmse / lt) * 300
    r2_total = r2_total / lt
    print("MAE: %.6f %.6f" % (loss_mae, cr.ConcPM25(loss_mae)))
    print("RMSE: %.6f %.6f" % (loss_rmse, cr.ConcPM25(loss_rmse)))
    print("R2 score: %.6f" % r2_total)
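# NOTE (illustrative sketch, not part of the original sources): every snippet in this
# collection reports progress through some update_progress / utils.update_progress
# helper whose definition is not shown, and the call signatures differ between
# projects (a bare fraction, a message plus index/total, a plain status string, ...).
# A minimal fraction-based variant compatible with calls such as
# utils.update_progress((i + 1.0) / dtl) could look like the following:
import sys

def update_progress(progress, message="", bar_length=40):
    # progress is assumed to be a float in [0, 1]
    clipped = min(max(progress, 0.0), 1.0)
    filled = int(round(bar_length * clipped))
    bar = "#" * filled + "-" * (bar_length - filled)
    sys.stdout.write("\r%s [%s] %5.1f%%" % (message, bar, clipped * 100))
    if clipped >= 1:
        sys.stdout.write("\n")
    sys.stdout.flush()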
def main(limit=50):  # def main(limit=len(ALL)):
    d = [a for a in (ALL.items())[:limit]]
    total = float(limit)
    out = []
    i = 0
    record_count = 0
    print 'Exporting patrons...'
    ft = open(export_dir + 'PATRONS.marc.txt', 'w')
    fb = open(export_dir + 'PATRONS.marc.dat', 'wb')
    for (recid, record) in d:
        i = i + 1
        if 'EXPIR' in record and record['EXPIR'] == 'None':
            expiration_date = datetime.strptime('9999-1-1', '%Y-%m-%d')
        elif 'EXPIR' in record:
            try:
                expiration_date = datetime.strptime(format_date(record['EXPIR'], '%Y-%m-%d %H:%M:%S'), '%Y-%m-%d')
            except:
                expiration_date = datetime.strptime(format_date(record['EXPIR'], '%m/%d/%Y'), '%Y-%m-%d')
        if not 'EXPIR' in record or ('EXPIR' in record and expiration_date > datetime.now()):
            if 'PA' not in record or ('PA' in record and len(record['PA']) != 3):
                rec_binary = format_record(recid, record)
                fb.write(rec_binary.as_marc())
                ft.write(str(rec_binary) + '\n==================\n')
                record_count = record_count + 1
        if i > limit:
            break
        update_progress(i * 100 / total)
    fb.close()
    ft.close()
    print "\nPatrons exported: %d/%d" % (record_count, limit)
def main(limit=len(ALL)):
    keep_history = False
    if len(sys.argv) > 1:
        keep_history = sys.argv[1] == "keep_history"
    d = dict([a for a in ALL.items()[:limit]])
    total = float(limit)
    out = []
    i = 0
    record_count = 0
    # erase the log file
    with codecs.open(log_dir + "log_checkouts.txt", "w", encoding="utf8") as f:
        f.close()
    print "Exporting checkouts..."
    ft = open(export_dir + "CHECKOUTS.txt", "w")
    # fb = open(export_dir+'CHECKOUTS.marc.dat', 'wb')
    for (recid, record) in d.items():
        i = i + 1
        active_loan = not ("IDATE" in record)
        if keep_history or active_loan:
            rec_binary = format_record(recid)
            # fb.write(rec_binary.as_marc())
            ft.write(str(rec_binary))
            record_count = record_count + 1
        if i > limit:
            break
        update_progress(i * 100 / total)
    # fb.close()
    ft.close()
    print "\nCheckouts exported: %d/%d %s" % (
        record_count,
        limit,
        "(the rest are old loans)" if not keep_history else "",
    )
def create_manifest(data_path, tag, ordered=True):
    manifest_path = '%s_manifest.csv' % tag
    file_paths = []
    wav_files = [
        os.path.join(dirpath, f)
        for dirpath, dirnames, files in os.walk(data_path)
        for f in fnmatch.filter(files, '*.wav')
    ]
    size = len(wav_files)
    counter = 0
    for file_path in wav_files:
        file_paths.append(file_path.strip())
        counter += 1
        update_progress(counter / float(size))
    print('\n')
    if ordered:
        _order_files(file_paths)
    counter = 0
    with io.FileIO(manifest_path, "w") as file:
        for wav_path in file_paths:
            # Modified from utils.py to remove a replace step and add "_16k"
            transcript_path = wav_path.replace('_16k.wav', '.txt')
            sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n'
            file.write(sample.encode('utf-8'))
            counter += 1
            update_progress(counter / float(size))
    print('\n')
def get_articles(folder, sitemap):
    last_index = get_last_index('crawling/%s' % folder)
    total = len(sitemap)
    for index, a in enumerate(sitemap):
        key = hash(a['link'])
        if not key in loaded:
            loaded[key] = 1
            article = get_article_name(index + last_index)
            base = 'crawling/%s/%s' % (folder, article)
            try:
                r = requests.get(a['link'], timeout=p.url_timeout)
                # r = urllib2.urlopen(a['link'])
                html = Soup(r.text)
                title = html.find('h1')
                if title:
                    title = getText(title)
                else:
                    title = ''
                content = content_extractor.analyze(r.content)
                if len(content.split(' ')) >= p.min_length:
                    content = title.encode('utf-8') + '\n' + a['link'].encode('utf-8') + '\n' + content
                    utils.save_file(base + '.txt', content, False)
                    # get images
                    get_images(base, a['images'])
            except requests.exceptions.Timeout:
                utils.save_file('cached.pkl', loaded)
                print("Timeout url: %s" % a['link'])
            except Exception as e:
                utils.save_file('cached.pkl', loaded)
                print("Error occurred", e)
        utils.update_progress((index + 1) * 1.0 / total)
def upload_processed_files_to_s3(_context, local_directory_path, pruned_filekey):
    directory_key = config.S3_UPLOAD_PATH + "/" + pruned_filekey + "/"
    s3 = get_boto_client()
    # Create directory
    s3.put_object(
        ACL='public-read',
        Bucket=config.AWS_S3_BUCKET,
        Key=directory_key
    )
    files = glob.glob(local_directory_path + "/*")
    for _idx, _file in enumerate(files):
        filename = _file.split("/")[-1]
        file_key = directory_key + filename
        resp = s3.put_object(
            ACL='public-read',
            Bucket=config.AWS_S3_BUCKET,
            Key=file_key,
            Body=open(_file).read()
        )
        progress_step_offset = 33 + 25 + 12
        progress_step_weight = (100 - (33 + 25 + 12)) / 100.0
        percent_complete = (_idx * 1.0 / len(files)) * 100
        update_progress(
            _context,
            (progress_step_offset + (progress_step_weight * percent_complete))
        )
def __index_corpus(self):
    num_tokens = 0
    num_sentences = 0
    vocabs = {}
    sentence_offset = []
    with open(self.path, 'r') as f:
        progress = 0
        sentence_offset.append(f.tell())
        line = f.readline()
        while line:
            num_sentences += 1
            if not (num_sentences % self.block_size):
                sentence_offset.append(f.tell())
            if not (num_sentences % 10000):
                progress = f.tell() * 1.0 / self.size
                utils.update_progress(progress, "Counting vocabs", 40)
            tokens = line.strip().split()
            num_tokens += len(tokens)
            list(map(lambda x: utils.inc_dict_value(vocabs, x), tokens))
            line = f.readline()
    if progress < 1:
        utils.update_progress(1, "Counting vocabs", 40)
    return vocabs, sentence_offset, num_sentences
def main(limit=len(ALL)):
    d = dict([a for a in ALL.items()[:limit]])
    total = float(limit)
    out = []
    i = 0
    record_count = 0
    with codecs.open(log_dir + 'log_serials.txt', 'w', encoding='utf8') as f:
        f.close()
    with codecs.open(log_dir + 'unmatched_serials.txt', 'w', encoding='utf8') as f:
        f.close()
    print 'Exporting serials...'
    ft = open(export_dir + 'SERIALS.marc.txt', 'w')
    fb = open(export_dir + 'SERIALS.marc.dat', 'wb')
    for (recid, record) in d.items():
        i = i + 1
        # if 'ACTIV' in record:
        rec_binary = format_record(recid)
        fb.write(rec_binary.as_marc())
        ft.write(str(rec_binary) + '\n==================\n')
        record_count = record_count + 1
        if i > limit:
            break
        update_progress(i * 100 / total)
    fb.close()
    ft.close()
    print "\nSerials exported: %d/%d" % (record_count, limit)
def convert_midi_files_to_json(_context, filelist, pruned_filekey):
    converted_files = []
    for _idx, _file in enumerate(filelist):
        f = open(_file, "r")
        data = f.read()
        f.close()
        encoded_data = base64.b64encode(data)
        _d = {}
        _d['dataUri'] = "data:audio/midi;base64," + encoded_data
        _d['_idx'] = _idx
        _d['key'] = pruned_filekey
        new_filepath = _file.replace(".midi", ".json")
        f = open(new_filepath, "w")
        f.write(json.dumps(_d))
        f.close()
        os.remove(_file)
        if not new_filepath.endswith("submission.json"):
            converted_files.append(
                new_filepath.replace(config.TEMP_STORAGE_DIRECTORY_PATH, ""))
        progress_step_offset = 33 + 25
        progress_step_weight = 0.12
        percent_complete = (_idx * 1.0 / len(filelist)) * 100
        update_progress(
            _context,
            (progress_step_offset + (progress_step_weight * percent_complete))
        )
    """
    These filekeys are relative to the `S3_UPLOAD_PATH` in the bucket
    and the first item of the returned array is the main submission file.
    """
    return [pruned_filekey + '/submission.json'] + converted_files
def main():
    global d
    if not d:
        d = get_items('BARCD')
    total = float(len(d))
    i = 1
    fb = open(export_dir + 'ITEMS.marc.dat', 'wb')
    ft = open(export_dir + 'ITEMS.marc.txt', 'w')
    print 'Exporting items...'
    item_count = 0
    for (recid, copies) in d.items():
        if not is_staff_paper(recid):
            record = Record()
            id_field = Field(tag='999', indicators=[' ', ' '],
                             subfields=['a', recid, 'b', ALL[recid].get('ID', '')])
            record.add_ordered_field(id_field)
            for c in copies.items():
                aux = [(e[0], items_fix[e[0]](e[1])) for e in c[1].items() if e[0] in items_fix]
                item_field = Field(tag='945', indicators=[' ', ' '],
                                   subfields=['b', c[0]] + flatten_list(aux))
                record.add_ordered_field(item_field)
                item_count = item_count + 1
            fb.write(record.as_marc())
            ft.write(str(record) + '\n==================\n')
        update_progress(i * 100 / total)
        i = i + 1
    print "\nRecords:\t" + str(int(total))
    print "Items: \t" + str(item_count)
    fb.close()
    ft.close()
def prepare_dir(ted_dir):
    converted_dir = os.path.join(ted_dir, "converted")
    # directories to store converted wav files and their transcriptions
    wav_dir = os.path.join(converted_dir, "wav")
    if not os.path.exists(wav_dir):
        os.makedirs(wav_dir)
    txt_dir = os.path.join(converted_dir, "txt")
    if not os.path.exists(txt_dir):
        os.makedirs(txt_dir)
    counter = 0
    entries = os.listdir(os.path.join(ted_dir, "sph"))
    for sph_file in entries:
        speaker_name = sph_file.split('.sph')[0]
        sph_file_full = os.path.join(ted_dir, "sph", sph_file)
        stm_file_full = os.path.join(ted_dir, "stm", "{}.stm".format(speaker_name))
        assert os.path.exists(sph_file_full) and os.path.exists(stm_file_full)
        all_utterances = get_utterances_from_stm(stm_file_full)
        all_utterances = filter(filter_short_utterances, all_utterances)
        for utterance_id, utterance in enumerate(all_utterances):
            target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(utterance["filename"], str(utterance_id)))
            target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(utterance["filename"], str(utterance_id)))
            cut_utterance(sph_file_full, target_wav_file, utterance["start_time"], utterance["end_time"],
                          sample_rate=args.sample_rate)
            with io.FileIO(target_txt_file, "w") as f:
                f.write(_preprocess_transcript(utterance["transcript"]).encode('utf-8'))
        counter += 1
        update_progress(counter / float(len(entries)))
def compute_fleiss_kappa(num_categories, model_names, device, dataloader):
    num_subjects = len(dataloader.dataset)
    fleiss_input = np.zeros((num_subjects, num_categories))
    for model_name in model_names:
        model = torch.load(model_name)
        model = model.to(device)
        model.eval()
        start = time.time()
        for i, (inputs, _) in enumerate(dataloader):
            inputs = inputs.to(device)
            with torch.no_grad():
                outputs = model(inputs)
            for j, output in enumerate(outputs):
                fleiss_input[i * len(outputs) + j][output] += 1
            if i % 100 == 0:
                update_progress(i / len(dataloader.dataset))
        # accuracy is not computed in this function, so only report elapsed time
        print(f'Time Elapsed: {time.time() - start:.0f}s')
    return fleiss_kappa(fleiss_input)
def _evaluate(self):
    evalYHat = np.zeros([self.gen.nb_samples, self.gen.nb_classes])
    Y = np.zeros([self.gen.nb_samples, self.gen.nb_classes])
    iterGen = self.gen.begin()
    s_idx = 0
    if self.model is not None:
        for i in range(self.gen.nb_batches):
            utils.update_progress(i / self.gen.nb_batches)
            batch, y = next(iterGen)
            f_idx = s_idx + y.shape[-2]
            y_hat = self.model.predict_on_batch(x=batch)
            evalYHat[s_idx:f_idx, :] = y_hat
            Y[s_idx:f_idx, :] = y
            s_idx = f_idx
    else:
        evalYHat = self.yhat
        Y = self.y
    utils.update_progress(self.gen.nb_batches)
    print()
    accs, self.mP, self.mR, self.F1 = computeMultiLabelLoss(Y, evalYHat)
    self.mAP = computemAPLoss(Y, evalYHat)
    self.tp = accs[:, 1]
    self.fp = accs[:, 2]
    self.fn = accs[:, 3]
    self.precision = accs[:, 4]
    self.recall = accs[:, 5]
    self.nb_zeros = np.count_nonzero(accs[:, 1] == 0)
    self.Y_hat = evalYHat
    self.Y = Y
def update_list(self, url_list):
    total_items = len(url_list)
    if total_items:
        start_progress_dialog(True)
        ###############################################################################
        # TODO: 02 CANCEL AND RETURN IF PROGRESS BAR CANCEL BUTTON IS PRESSED
        ###############################################################################
        utils.update_progress(preset.message["loading_bookmarks"], -1, total_items)
        for index, url in enumerate(url_list):
            self.update_element(index, url)
            url_object = preset.Header()
            url_object.set_data(url)
            self.url_objects.append(url_object)
            ###############################################################################
            # TODO: 03 LET USER CHANGE COLOR IN POPUP MENU https://wiki.wxpython.org/PopupMenuOnRightClick
            # TODO: 03 CHANGE COLOR IN POPUP MENU http://revxatlarge.blogspot.com/2011/06/wxpython-listbox-popupmenu.html
            # TODO: 03 CHANGE COLOR IN POPUP MENU https://www.daniweb.com/programming/software-development/threads/352474/wxpython-wx-listctrl-and-wx-menu
            ###############################################################################
            if index % 2:
                self.list_ctrl.SetItemBackgroundColour(index, "#FFFFFF")
            else:
                self.list_ctrl.SetItemBackgroundColour(index, "#EEEEEE")
            utils.update_progress(preset.message["loading_bookmarks"], index, total_items)
        self.update_column_width()
        start_progress_dialog(False)
    else:
        set_status_message(self.parent, preset.message["user_has_no_bookmarks"])
def evaluate_sp(url, url2, decoder_length=24, is_grid=True, grid_eval=True):
    cr = Crawling()
    map_ = heatmap.build_map()
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    if len(data.shape) == 4:
        lt = data.shape[0] * data.shape[1]
    else:
        lt = data.shape[0]
    if is_grid:
        data = np.reshape(data, (lt, data.shape[-2], 25, 25))
    else:
        data = np.reshape(data, (lt, data.shape[-2], 25))
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    loss_mae = 0.0
    loss_rmse = 0.0
    r2_total = 0.0
    for i, d in enumerate(data):
        d = d[:decoder_length, :, :]
        pred_t = []
        if is_grid:
            for d_ in d:
                d_t = heatmap.clear_interpolate_bound(np.asarray(d_), map_)
                pred_t.append(d_t)
        else:
            if grid_eval:
                for d_ in d:
                    d_t = heatmap.fill_map(d_, map_)
                    pred_t.append(d_t)
            else:
                pred_t = d
        lb_i = i * pr.strides + 24
        lbt = labels[lb_i:(lb_i + decoder_length), :, 0]
        if grid_eval:
            lbg = []
            for x in lbt:
                x_l = heatmap.fill_map(x, map_)
                lbg.append(x_l)
            lbg = np.asarray(lbg)
            lbg = lbg.flatten()
        else:
            lbg = lbt.flatten()
        pred_t = np.asarray(pred_t)
        pred_t = pred_t.flatten()
        mae, mse, r2 = get_evaluation(pred_t, lbg)
        loss_mae += mae
        loss_rmse += mse
        r2_total += r2
        utils.update_progress((i + 1.0) / lt)
    loss_mae = loss_mae / lt * 300
    loss_rmse = sqrt(loss_rmse / lt) * 300
    r2_total = r2_total / lt
    print("MAE: %.6f %.6f" % (loss_mae, cr.ConcPM25(loss_mae)))
    print("RMSE: %.6f %.6f" % (loss_rmse, cr.ConcPM25(loss_rmse)))
    print("R2 Score: %.6f" % r2_total)
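# NOTE (illustrative sketch, not part of the original sources): evaluate_single_pred
# and evaluate_sp both rely on a get_evaluation(pred, label) helper that returns
# (mae, mse, r2) for one sample; its real implementation is not included here.
# One plausible version built on scikit-learn:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def get_evaluation(pred, label):
    # both inputs are flat numpy arrays of the same length
    mae = mean_absolute_error(label, pred)
    mse = mean_squared_error(label, pred)
    r2 = r2_score(label, pred)
    return mae, mse, r2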
def mser_detect(img, x_len, y_len):
    utils.update_progress('Detecting Regions')
    min_t = int(math.floor((y_len * x_len) * 0.0009))
    max_t = int(math.floor((y_len * x_len) * 0.05))
    # MSER(5, 60, 14400, 0.25, 0.2, 200, 1.01, 0.003, 5) <- Default Values
    c_mser = cv2.MSER(5, min_t, max_t, 0.166, 0.153, 90, 1.001, 0.003, 5)
    c_regions = c_mser.detect(img, None)
    return [cv2.convexHull(p.reshape(-1, 1, 2)) for p in c_regions]
def evaluate_lstm(url, url2, decoder_length=24, forecast_factor=0, is_classify=False):
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    lt = data.shape[0] * data.shape[1]
    data = np.reshape(data, (lt, data.shape[-1]))
    if decoder_length > data.shape[-1]:
        decoder_length = data.shape[-1]
    dtl = len(data)
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    if not is_classify:
        loss_mae = [0.0] * decoder_length
        loss_rmse = [0.0] * decoder_length
    else:
        acc = 0.
    # r2_total = 0.0
    cr = Crawling()
    for i, d in enumerate(data):
        if decoder_length < data.shape[-1]:
            pred_t = d[:decoder_length]
        else:
            pred_t = d
        lb_i = i * pr.strides + 24
        lbt = np.mean(labels[lb_i:(lb_i + decoder_length), :, forecast_factor], axis=1)
        a = 0.
        for t_i, (p, l) in enumerate(zip(pred_t, lbt)):
            if not is_classify:
                # mae, mse, _ = get_evaluation(p, l)
                mae = abs(cr.ConcPM10(p * 300) - cr.ConcPM10(l * 300))
                loss_mae[t_i] += mae
                # loss_rmse[t_i] += mse
            else:
                a += classify_data(pred_t, lbt, forecast_factor)
        if is_classify:
            a = a / decoder_length
            acc += a
        # r2_total += r2
        utils.update_progress((i + 1.0) / dtl)
    if not is_classify:
        loss_mae = np.array(loss_mae) / lt
        # loss_rmse = [sqrt(x / lt) * 300 for x in loss_rmse]
        # print("R2 score: %.6f" % r2_total)
        print_accumulate_error(loss_mae, loss_rmse, decoder_length, forecast_factor=forecast_factor)
    else:
        acc = acc / lt * 100
        print("accuracy %.4f" % acc)
def parse_time_series(config: BaseConfig, well_manager: WellManager, data_directory: str):
    filepath = join(data_directory, config.filename)

    # Parse header
    # Keeping this generic, in case column orders change in subsequent data files
    infile = open(filepath)
    header_line = infile.readline()
    try:
        header_to_column = _get_column_indices(
            header_line, config.headers_to_read + [TimeSeriesHeaderTypes.api])
    except KeyError:
        raise LookupError("Header value specified in config was not found in data file!")

    current_api = -1
    temp_lists = _get_temporary_lists(config.headers_to_read)
    all_lines = infile.readlines()
    total_lines = len(all_lines)
    parsed_lines = 0

    # Parse data
    for l in all_lines:
        sp = l.replace("\"", "").split(',')
        api = sp[header_to_column[TimeSeriesHeaderTypes.api]]
        if api != current_api:
            # Write in large batches to minimize resizing of underlying numpy array
            _write_to_well(well_manager, api, temp_lists)
            temp_lists = _get_temporary_lists(config.headers_to_read)
            current_api = api
        try:
            # Convert to seconds since linux epoch. Ignoring timezones
            date = datetime.strptime(
                sp[header_to_column[TimeSeriesHeaderTypes.date]],
                '%Y-%m-%d %H:%M:%S').timestamp()
            oil_barrels = float(sp[header_to_column[TimeSeriesHeaderTypes.oil_barrels]])
            water_barrels = float(sp[header_to_column[TimeSeriesHeaderTypes.water_barrels]])
            gas_mcf = float(sp[header_to_column[TimeSeriesHeaderTypes.gas_mcf]])
        except ValueError:
            continue
        temp_lists[TimeSeriesHeaderTypes.water_barrels].append(water_barrels)
        temp_lists[TimeSeriesHeaderTypes.date].append(date)
        temp_lists[TimeSeriesHeaderTypes.oil_barrels].append(oil_barrels)
        temp_lists[TimeSeriesHeaderTypes.gas_mcf].append(gas_mcf)
        parsed_lines += 1
        if parsed_lines % 100 == 0:
            update_progress(parsed_lines / total_lines)

    _write_to_well(well_manager, api, temp_lists)
def monitor(files):
    global done
    a = 0
    while True:
        incr = int(ceil((40 / files) * done))
        while a < incr:
            update_progress(None, 1)
            a = a + 1
        if a == 40 or BREAK or JOB_DONE:
            return
        continue
def ES(cost_func, lb, ub, num_parents, num_children, num_generations, mutation, run_name='runs'):
    mu = num_parents
    lam = num_children
    all_params = np.zeros([num_generations + 1, num_children, len(lb)])
    all_costs = np.full([num_generations + 1, num_children], math.inf)
    P = np.zeros([mu, len(lb)])
    Pcost = np.zeros([mu, 1])
    print("Generating Parents")
    for i in range(mu):
        update_progress(i / (mu - 1))
        P[i, :], Pcost[i] = generate_parent(cost_func, lb, ub)
    all_params[0, 0:num_parents, :] = P
    all_costs[0, 0:num_parents] = Pcost[:, 0]
    best_costs = np.zeros([num_generations, 1])
    for g in range(num_generations):
        starttime = time.clock()
        print("Creating generation " + str(g))
        G = np.zeros([lam, len(lb)])
        Gcost = np.zeros([1, lam])
        print("Starting generation " + str(g))
        # generate the children for generation g
        for i in range(lam):
            G[i, :], Gcost[:, i] = generate_child_ES(cost_func, P, lb, ub, mutation)
        # assign parents for next generation
        # to do this, I must first sort the children of this generation
        idx = np.argsort(Gcost)
        Gcost = Gcost[:, idx]
        G = G[idx]
        all_params[g + 1, :, :] = G
        all_costs[g + 1, :] = Gcost
        Pcost = Gcost[0, :, 0:mu].T
        P = G[0, 0:mu, :]
        best_costs[g] = Pcost[np.argmin(Pcost)]
        endtime = time.clock()
        print("Generation {} runtime: {}".format(g, endtime - starttime))
        print("Best of generation " + str(g) + " has cost " + str(best_costs[g]))
    np.savetxt(run_name + '/best_costs.csv', best_costs)
    return P, Pcost
def getLocalsForAllRegions(data, calc, history_lengths=None, delays=None, parameters=None,
                           print_max_idx=True, compute_p=False):
    """
    Calculates the local AIS for all regions, by calling getLocalsForRegion

    Arguments:
        data            -- Numpy array of shape (region, time). Preprocessing should have already been performed
        calc            -- The JIDT calculator
        history_lengths -- Range of possible history length values, or None
        delays          -- Range of possible delay values, or None
        print_max_idx   -- If True, prints the maximum average AIS value and the corresponding indices for the
                           parameters. The first value gives the maximum index in the range of possible history
                           lengths, and the second value gives the maximum index in the range of possible delays
        parameters      -- A DataFrame or numpy array containing a column of history lengths and a column of delays.
                           Each row should correspond to a particular region
        compute_p       -- If True, computes the p value of the returned AIS

    Returns:
        results         -- A numpy array of shape (regions, timepoints), containing the local AIS values for each region
        all_parameters  -- A numpy array with three columns, containing the (history_length, delay, DCE) of each region
        p_values        -- A numpy array of all returned p values (or Nones if compute_p is False).
                           Each row corresponds to a region
    """
    regions, timepoints = data.shape

    # Initialise
    results = np.zeros((regions, timepoints))
    all_parameters = np.zeros((regions, 3), dtype=int)
    p_values = np.zeros(regions)

    for region in range(regions):
        # Either parameters are provided, or the range of possible history lengths and delays should be provided
        if parameters is None:
            assert history_lengths is not None and delays is not None
            params = None
        else:
            if isinstance(parameters, pd.DataFrame):
                params = parameters.loc[region].values
            else:  # Numpy array or list, etc
                params = parameters[region]
        results[region], _, params, p_values[region] = getLocalsForRegion(
            data, calc, region, history_lengths, delays, params, print_max_idx, compute_p)
        all_parameters[region] = np.array(params)
        utils.update_progress(region / regions)  # Print progress bar

    return results, all_parameters, p_values
def check_answer(username='', assignment='', ipd=-1, answer=''):
    if not assignment in Global.progress[username]:
        Global.progress[username][assignment] = False
        utils.update_progress(username, Global.progress[username])
    ipd_idx = Global.data_dict[assignment]['ipd'].index(ipd)
    x_answer = np.array(json.loads(answer))
    x_true = Global.data_dict[assignment]['outputs'][ipd_idx]
    answer_is_correct = np.allclose(x_true, x_answer, atol=1e-5)
    if not Global.progress[username][assignment] and answer_is_correct:
        Global.progress[username][assignment] = answer_is_correct
        utils.update_progress(username, Global.progress[username])
    return jsonify({'success': answer_is_correct})
def fem_movie(foldername='images/', T=20.0):
    y = np.array([2., 8.1, 2.45, 9])
    dt = 1.0 / 30
    vertices = np.array([[0., 0], [4, 0], [4, 10], [0, 10]])
    mesh = finis.triangulate(vertices=vertices, max_area=0.001)
    fe_u = finis.fe_space(mesh, order=2, order_int=2)
    fe_p = finis.fe_space(mesh, order=1, order_int=2)
    u1_h, u2_h, p_h, dy = fem_solve(fe_u, fe_p, y, eps=1e-6)
    fig = plt.figure(figsize=(9, 5))
    ax = fig.add_subplot(1, 1, 1)
    tricol = plt.tripcolor(fe_u['dof'][:, 0], fe_u['dof'][:, 1], u2_h,
                           shading='flat', vmin=-2, vmax=2)
    c1 = plt.Circle((y[0], y[1]), .25, color='w')
    ax.add_artist(c1)
    c2 = plt.Circle((y[2], y[3]), .25, color='w')
    ax.add_artist(c2)
    ax.set_title("$u_y$")
    ax.set_xlabel("$x$")
    plt.colorbar()
    ax.set_aspect('equal')
    _i_max = int(T / dt) + 2
    starttime = time.time()
    for _i in range(_i_max):
        if _i > 0:
            u1_h, u2_h, p_h, dy = fem_solve(fe_u, fe_p, y, eps=1e-6)
        ax.clear()
        c1 = plt.Circle((y[0], y[1]), .25, color='w')
        ax.add_artist(c1)
        c2 = plt.Circle((y[2], y[3]), .25, color='w')
        ax.add_artist(c2)
        tricol = plt.tripcolor(fe_u['dof'][:, 0], fe_u['dof'][:, 1], u2_h,
                               shading='flat', vmin=-2, vmax=2)
        plt.savefig(foldername + 'movie{}.png'.format(_i))
        np.save(foldername + 'y{}.npy'.format(_i), y)
        update_progress(_i / _i_max, starttime=starttime)
        y = y + dt * dy
def convert_data_to_grid(url, out_url, url_att="", out_url_att="", part=1):
    grid = heatmap.build_map(pr.map_size)
    data = utils.load_file(url)
    lt = len(data)
    attention_data = None
    att_part = None
    print(url_att)
    if url_att:
        attention_data = utils.load_file(url_att)
        alt = len(attention_data)
        if lt != alt:
            raise ValueError("Attention & Main Data need same length while %s and %s" % (lt, alt))
        data = zip(data, attention_data)
        att_part = []
    res = []
    if part != 1:
        bound = int(math.ceil(float(lt) / part))
    else:
        bound = lt
    for i, row in enumerate(data):
        if url_att:
            t, a = row
        else:
            t = row
        if i and (i % bound) == 0:
            p_i = i / bound
            out_url_name = out_url + "_" + str(p_i)
            utils.save_file(out_url_name, res)
            if url_att:
                att_out_url_name = out_url_att + "_" + str(p_i)
                utils.save_file(att_out_url_name, att_part)
            res = []
            att_part = []
        g = heatmap.fill_map(t, grid)
        res.append(g)
        if url_att:
            att_part.append(a)
        utils.update_progress(float(i) / lt)
    if part == 1:
        out_url_name = out_url
    else:
        out_url_name = out_url + "_" + str(part)
    utils.save_file(out_url_name, res)
    if url_att:
        att_out_url_name = out_url_att + "_" + str(part)
        utils.save_file(att_out_url_name, att_part)
def execute(self, args): print("start crawling aws") save_interval = args.save_interval start = datetime.strptime(args.start, pr.fm) start_point = utils.get_datetime_now() # output = "timestamp,PM10_VAL,PM2.5_VAL,O3(ppm),NO2(ppm),CO(ppm),SO2(ppm),PM10_AQI,PM2.5_AQI\n" output = "" counter = 0 last_save = 0 crawler_range = 86400 if not args.forward: if args.end: end = datetime.strptime(args.end, pr.fm) else: end = utils.get_datetime_now() length = (end - start).total_seconds() / crawler_range else: end = datetime.strptime("2050-12-31 00:00:00", pr.fm) while start <= end: now = utils.get_datetime_now() # at first, crawling by daily # if up to the moment, crawling by hourly # how long from last crawled date to now? if (now - start).total_seconds() > crawler_range: tmp = start st = "00" ed = "24" if crawler_range != 86400: st = self.format10(tmp.hour) ed = self.format10(tmp.hour + 1) output, counter, last_save = self.craw_data_controller( output, counter, last_save, save_interval, tmp, st, ed) # move pointer for timestep if not args.forward: utils.update_progress(counter * 1.0 / length) else: self.write_log(output) output = "" if crawler_range == 86400: start = start + timedelta(days=1) else: start = start + timedelta(hours=1) print("AWS done") else: # Approach boundary (reach end) then reduce range to hourly crawling crawler_range = 3600 self.write_log(output)
def word_segment(root):
    directory = 'seg/%s' % get_container_folder(root)
    ut.create_folder(directory)
    files = [f for f in os.listdir(root) if os.path.isfile('%s/%s' % (root, f))]
    total = len(files)
    for index, f in enumerate(files):
        path = '%s/%s' % (root, f)
        content = ut.load_file(path)
        if len(content) >= 3:
            title = content[0].replace('\n', '')
            par = content[2].replace('\n', '')
            title = ViTokenizer.tokenize(unicode(title, 'UTF-8'))
            par = ViTokenizer.tokenize(unicode(par, 'UTF-8'))
            ut.save_file_utf8('%s/%s' % (directory, f), title + '\n' + par)
        ut.update_progress((index + 1) * 1.0 / total)
def accuracy_generalization_matrix(model_names, datasets, device):
    num_models = len(model_names)
    num_datasets = len(datasets)
    result = np.zeros((num_models, num_datasets))
    dataloaders = [
        create_dataset(dataset[1], dataset[2], dataset[3], train=False)
        for dataset in datasets
    ]
    for i, model_name in enumerate(model_names):
        model = torch.load(model_name)
        model = model.to(device)
        model.eval()
        for j, (dataset, dataloader) in enumerate(zip(datasets, dataloaders)):
            start = time.time()
            print(f'Evaluating {model_name} on data {dataset[0]}...')
            corrects = 0  # accumulate over all batches, not per batch
            for k, (inputs, labels, _) in enumerate(dataloader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                with torch.no_grad():
                    outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                corrects += torch.sum(preds == labels)
                if k % 100 == 0:
                    update_progress(k / len(dataloader.dataset))
            accuracy = corrects / len(dataloader.dataset)
            result[i][j] = accuracy
            print(f'Time Elapsed: {time.time() - start:.0f}s, Accuracy: {accuracy:.2f}%')
    print(f'---- Cross-Dataset Generalization ----')
    print(result)
    return result
def create_classifier(iterations=100):
    """
    Return the classifier that did the best at classifying a subset of the data
    after training for the given number of iterations

    :param iterations: number of iterations to test on
    :return: tuple: (classifier, accuracy of classifier)
    """
    negids = reddit_politics.fileids("neg")
    posids = reddit_politics.fileids("pos")
    negfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), "neg") for f in negids]
    posfeats = [(word_feats(reddit_politics.words(fileids=[f]), FILTER_LIST), "pos") for f in posids]

    # track the most accurate classifier so far
    best_classifier = None
    highest_accuracy = 0
    for iter_num in range(iterations):
        # randomly shuffle the feature sets to get new subsets to test and train on
        random.shuffle(negfeats)
        random.shuffle(posfeats)
        negcutoff = int(len(negfeats) * 3 / 4)
        poscutoff = int(len(posfeats) * 3 / 4)
        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
        if DEBUG:
            print("Train on %d instances, test on %d instances.\n" % (len(trainfeats), len(testfeats)))
        # train the classifier on the training features and determine its accuracy
        classifier = NaiveBayesClassifier.train(trainfeats)
        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        if DEBUG:
            print("\nAccuracy:", accuracy)
        # if this classifier outperformed all before it, track it and its accuracy
        if accuracy > highest_accuracy:
            highest_accuracy = accuracy
            best_classifier = classifier
        utils.update_progress(iter_num / iterations, message="Testing Classifiers")
    sys.stdout.write("\n\n")
    # return the best classifier seen, not the last one trained
    return (best_classifier, highest_accuracy)
def get_results():
    # FLAG and JOB_DONE are assumed to be module-level flags read by the worker
    # and monitor threads; without the global statement the assignments below
    # would only create locals.
    global FLAG, JOB_DONE
    update_progress('Fetching Object Recognition Results')
    # Initialize up to concurrent number of threads
    for i in range(concurrent):
        t = Thread(target=cloudsight_fetch)
        threads.append(t)
        t.start()
    # Initialize our monitoring thread for progress reporting
    t = Thread(target=monitor, args=(len(glob('/tmp/*.jpg')), ))
    threads.append(t)
    t.start()
    # Feed filenames into queue to be consumed by threads
    for filename in glob('/tmp/*.jpg'):
        resultdict[filename] = {
            "cloudsight": [],
            "msft": [],
            "ibm": [],
            "google": []
        }
        fail_check()
        time.sleep(3.1)  # Cloudsight Limit 1 req/3 sec
        cs_q.put(filename)
        break  # !!! REMOVE FOR FULL PROCESSING. RIGHT NOW ONLY 1 IMAGE WILL BE SENT. !!!
    # We now announce to all threads that queue feeding has stopped.
    # If they see both FLAG = true and the queue is empty, thread will quit
    FLAG = True
    # Keep checking until the queue is empty, and check for any errors
    while True:
        if cs_q.empty() == False:
            fail_check()
            time.sleep(1)
        else:
            break
    # Join all remaining threads from queue if not done so by now.
    cs_q.join()
    # Declare job is done to monitor thread so it may quit now
    JOB_DONE = True
    return resultdict
def run(self):
    open(self.case_db_path, 'w').close()
    forms_per_user = float(settings.CASES_PER_USER * settings.FORMS_PER_CASE)
    for i, user_id in enumerate(self.user_ids):
        print('\n\n## Loading data for user {} of {}'.format(i, self.num_users))
        synclog_id = self.get_synclog_id(user_id)
        num_cases_user = 0
        num_forms_user = 0
        with self.form_loader as loader:
            while num_forms_user < forms_per_user:
                create_case = num_cases_user < settings.CASES_PER_USER
                form = self.get_form(user_id, synclog_id, create_case)
                loader.put_doc(form)
                num_forms_user += 1
                if create_case:
                    num_cases_user += 1
                if num_forms_user % 50 == 0:
                    update_progress('Forms:', num_forms_user / forms_per_user)
        self.num_forms += num_forms_user
        self.num_cases += num_cases_user
        print('')
        with self.case_loader as loader:
            case_ids = self.case_forms.keys()
            num_cases = float(len(case_ids))
            for j, case_id in enumerate(case_ids):
                is_child_case = random.random() < settings.CHILD_CASE_RATIO
                forms = self.case_forms[case_id]
                case = self.get_case(user_id, case_id, forms, is_child_case)
                loader.put_doc(case)
                cases_created = j + 1
                if cases_created % 50 == 0:
                    update_progress('Cases:', cases_created / num_cases)
        self.save_database_and_clear()
    self.print_actual()
def main(self, args):
    # filename = "craw_weather_%s_%s_%s.txt" % (args.city, utils.clear_datetime(args.start), utils.clear_datetime(args.end))
    start = datetime.strptime(args.start, pr.fm)
    if args.end:
        end = datetime.strptime(args.end, pr.fm)
    else:
        end = utils.get_datetime_now()
    start_point = utils.now_milliseconds()
    # output = "timestamp,PM10_VAL,PM2.5_VAL,O3(ppm),NO2(ppm),CO(ppm),SO2(ppm),PM10_AQI,PM2.5_AQI\n"
    output = ""
    length = (end - start).total_seconds() / 86400.0
    save_interval = args.save_interval
    counter = 0
    last_save = 0
    if "," in args.city:
        cities = args.city.split(",")
    else:
        cities = [args.city]
    while start <= end:
        now = utils.now_milliseconds()
        diff = now - start_point
        # print(elapsed_time)
        if diff >= 100:
            # try:
            counter += 1
            date = "%s-%s-%s" % (start.year, self.format10(start.month), self.format10(start.day))
            for c in cities:
                html = self.craw_data(c, date)
                data = self.mine_data(date, html, c)
                if data:
                    output += "\n".join(data) + "\n"
            if (counter - last_save) == save_interval:
                last_save = counter
                self.write_log(output)
                output = ""
            # except Exception as e:
            #     print(start.strftime(pr.fm), e)
            start = start + timedelta(days=1)
            start_point = now
            utils.update_progress(counter * 1.0 / length)
    self.write_log(output)
def __get_vocabs(self):
    num_tokens = 0
    num_sentences = 0
    vocabs = {}  # was `vocabs = dict`, which binds the type instead of an empty dict
    with open(self.path, 'r') as f:
        progress = 0
        line = f.readline()
        while line:
            num_sentences += 1
            if not (num_sentences % 10000):
                progress = f.tell() * 1.0 / self.size
                utils.update_progress(progress, "Counting vocabs", 40)
            tokens = line.rstrip().split()
            num_tokens += len(tokens)
            list(map(lambda x: utils.inc_dict_value(vocabs, x), tokens))
            line = f.readline()
    if progress < 1:
        utils.update_progress(1, "Counting vocabs", 40)
    return vocabs, num_tokens, num_sentences
def main(limit=len(ALL)):
    d = dict([a for a in ALL.items()[:limit]])
    total = float(limit)
    out = []
    i = 0
    record_count = 0
    print 'Exporting orders...'
    ft = open(export_dir + 'ORDERS.marc.txt', 'w')
    fb = open(export_dir + 'ORDERS.marc.dat', 'wb')
    for (recid, record) in d.items():
        i = i + 1
        rec_binary = format_record(recid)
        fb.write(rec_binary.as_marc())
        ft.write(str(rec_binary) + '\n==================\n')
        record_count = record_count + 1
        if i > limit:
            break
        update_progress(i * 100 / total)
    fb.close()
    ft.close()
    print "\nOrders exported: %d/%d" % (record_count, limit)
def import_cities_to_redis(connection, location_file, lang):
    key = 'cityid2city:' + lang + ':'
    with open(location_file) as csvfile:
        reader = csv.DictReader(csvfile)
        update_progress("import_cities_to_redis", 0)
        row_count = sum(1 for row in reader)
        count = 0
        csvfile.seek(0)
        for row in reader:
            if count == 0:
                count = count + 1
                continue
            count = count + 1
            city_id = row['geoname_id']
            continent_code = row['continent_code']
            continent_name = row['continent_name']
            country_code = row['country_iso_code']
            country_name = row['country_name']
            subdivision_code = row['subdivision_1_iso_code']
            subdivision_name = row['subdivision_1_name']
            city_name = row['city_name']
            metro_code = row['metro_code']
            if count % 1000 == 0:
                update_progress("import_cities_to_redis", count / float(row_count))
            connection.hset(
                key, city_id,
                json.dumps([
                    continent_code, continent_name, country_code, country_name,
                    subdivision_code, subdivision_name, city_name, metro_code
                ]))
        update_progress("import_cities_to_redis", 1)
def execute(self, args): print("start crawling aqi seoul") save_interval = args.save_interval start = datetime.strptime(args.start, pr.fm) # start_point = utils.get_datetime_now() output = "" counter = 0 last_save = 0 # crawler_range = 3600 if not args.forward: if args.end: end = datetime.strptime(args.end, pr.fm) else: end = utils.get_datetime_now() length = (end - start).total_seconds() / 86400 else: end = datetime.strptime("2050-12-31 00:00:00", pr.fm) while start <= end: now = utils.get_datetime_now() # if (now - start_point).total_seconds() >= args.interval: # start_point = now if (now - start).total_seconds() > 3600: hour = start.hour tmp = start if tmp.hour == 0: tmp = tmp - timedelta(hours=1) hour = "24" else: hour = self.format10(tmp.hour) st_ = start.strftime(pr.fm) output, counter, last_save = self.craw_data_controller(output, counter, last_save, save_interval, tmp, hour, st_) # move pointer for timestep start = start + timedelta(hours=1) if not args.forward: utils.update_progress(counter * 1.0 / length) else: self.write_log(output) output = "" self.write_log(output)
def gen_dataset(self, params, dataset_type, seed=0):
    random.seed(seed)
    np.random.seed(seed)
    if dataset_type == 'train':
        num_seq = params.TRAIN_NUM_SEQ
        path = params.TRAIN_SET_PATH
    elif dataset_type == 'val':
        num_seq = params.VAL_NUM_SEQ
        path = params.VAL_SET_PATH
    elif dataset_type == 'test':
        num_seq = params.TEST_NUM_SEQ
        path = params.TEST_SET_PATH
    else:
        raise ValueError('dataset_type must be train, val, or test')
    for i in range(num_seq):
        sdg = SequenceDataGenerator(params.NUM_SHAPE, params.IMG_SIZE,
                                    params.SEQUENCE_LEN, params.RANDOM_SIZE,
                                    params.ROTATE_SHAPES)
        utils.update_progress(i / num_seq)
        seq = sdg.get_sequence()
        pickle_folder_path = os.path.join(path, f'seq_{i}')
        utils.mkdir_if_missing(pickle_folder_path)
        pickle_full_path = os.path.join(pickle_folder_path, 'sequence.pickle')
        with open(pickle_full_path, 'wb') as handle:
            pickle.dump(seq, handle)
        image_count = 0
        for info in seq:
            image = info['image']
            image_folder_path = os.path.join(path, f'seq_{i}', 'images')
            utils.mkdir_if_missing(image_folder_path)
            image_full_path = os.path.join(image_folder_path, f'{image_count:05d}.png')
            image = (image * 255).astype(np.uint8)
            cv2.imwrite(image_full_path, image)
            image_count += 1
def main(limit=len(ALL)):
    d = dict([a for a in ALL.items()[:limit]])
    total = float(limit)
    out = []
    i = 1
    record_count = 0
    # erase the log file
    with codecs.open(log_dir + 'log_bib.txt', 'w', encoding='utf8') as f:
        f.close()
    with codecs.open(log_dir + 'authors_split.txt', 'w', encoding='utf8') as f:
        f.close()
    w = codecs.open(export_dir + 'BIBLIOGRAPHIC.marc.txt', 'w')
    UTF8Writer = codecs.getwriter('utf8')
    f = UTF8Writer(w)
    # f = codecs.open(export_dir+'BIBLIOGRAPHIC.marc.txt', 'w', encoding='utf8')
    fb = open(export_dir + 'BIBLIOGRAPHIC.marc.dat', 'wb')
    print 'Exporting bibliographic records...'
    for (recid, record) in d.items():
        # print recid
        # out.append(format_record(recid))
        if record['TI'].find('eReader') == -1:
            # if not 'HIDE' in record:
            rec_binary = format_record(recid)
            # print recid
            # print rec_binary
            f.write(str(rec_binary) + u'\n==================\n')
            fb.write(rec_binary.as_marc())
            record_count = record_count + 1
        update_progress(i * 100 / total)
        i = i + 1
        if i > limit:
            break
    print "\ntotal records: \t" + str(i - 2)
    print "exported records:\t" + str(record_count)
    f.close()
    fb.close()
def generate_subimages(hulls, img, h, v, folder='/tmp/'):
    utils.update_progress('Extracting Regions')
    fid = 0
    pathlist = []
    coords = edge_coordinates(hulls)
    if len(coords) == 0:
        cv2.imwrite('/tmp/image.jpg', img)
        pathlist.append(('/tmp/image.jpg', (0, h, 0, v)))
        return pathlist
    else:
        # p_c = padded coordinates. c = coordinates.
        for p_c, c in process_coords(coords, h, v):
            p_x1, p_y1, p_x2, p_y2 = p_c
            x1, y1, x2, y2 = c
            roi = img[p_y1:p_y2, p_x1:p_x2]
            fid = fid + 1
            path = folder + str(fid) + '.jpg'
            pathlist.append(
                (path, p_c, c, (x1 + ((x2 - x1) // 2), y1 + ((y2 - y1) // 2))))
            # Write subimage to file
            cv2.imwrite(path, roi)
        return pathlist
def _read_lexicon_to_memory(cls, file_location):
    print "\nReading lexicon to memory..."
    lexicon = codecs.open(file_location, 'rb', 'utf-8')
    lexicon_list = lexicon.readlines()
    if ENV.PROGRESS_BAR == True:
        util.update_progress(0)
    for idx, entry in enumerate(lexicon_list):
        if ENV.PROGRESS_BAR == True:
            util.update_progress(float(idx) / float(len(lexicon_list)))
        entry = entry.replace('\n', '').split(' ')
        entry[0] = int(entry[0])
        entry[2] = int(entry[2])
        lexicon_list[idx] = entry
    if ENV.PROGRESS_BAR == True:
        util.update_progress(1)
    lexicon.close()
    return lexicon_list
def _read_full_postings_to_memory(cls, file_location):
    print "\nReading posting list to memory..."
    postings = codecs.open(file_location, 'rb', 'utf-8')
    posting_list = {}
    posting_lines = postings.readlines()
    if ENV.PROGRESS_BAR == True:
        util.update_progress(0)
    for idx, line in enumerate(posting_lines):
        if ENV.PROGRESS_BAR == True:
            util.update_progress(float(idx) / float(len(posting_lines)))
        line = line.replace('\n', '').split(': ')
        doc_info = line[1].split('->')
        for idx, doc in enumerate(doc_info):
            doc = re.sub(r'[\(\)]', '', doc)
            doc = doc.split(', ')
            doc_info[idx] = [int(doc[0]), int(doc[1])]
        posting_list[int(line[0])] = doc_info
    if ENV.PROGRESS_BAR == True:
        util.update_progress(1)
    postings.close()
    return posting_list
def _read_doc_list_to_memory(cls, file_location):
    print "\nExtracting Document List..."
    doc_lengths = []
    documents = codecs.open(file_location, 'rb', 'utf-8')
    document_list = documents.readlines()
    doc_dict = {}
    if ENV.PROGRESS_BAR == True:
        util.update_progress(0)
    for idx, entry in enumerate(document_list):
        if ENV.PROGRESS_BAR == True:
            util.update_progress(float(idx) / float(len(document_list)))
        entry = entry.replace('\n', '').split(' ')
        entry[0] = int(entry[0])
        entry[1] = int(entry[1])
        document_list[idx] = entry
        doc_dict[entry[0]] = {'length': entry[1]}
        doc_lengths.append(entry[1])
    if ENV.PROGRESS_BAR == True:
        util.update_progress(1)
    cls.avg_doc_length = np.mean(doc_lengths)
    cls.collection_length = np.sum(doc_lengths)
    return doc_dict
posts = get_posts(start_date, end_date)
# dictionary containing candidates mapped to lists of sentiment values for that candidate
sentiments = dict()
totals = dict()
overall_total = 0
num_candidates = len(list(posts.keys()))
current = 1
for candidate in posts:
    sentiments[candidate] = []
    totals[candidate] = 0
    for score, text in posts[candidate]:
        sentiments[candidate].append(classify(classifier, text, score))
        totals[candidate] += 1
        overall_total += 1
    utils.update_progress(current / num_candidates, message=candidate)
    current += 1
# clear the progress bars for the candidates
sys.stdout.write("\r" + " " * 70 + "\n")
# normalize the values to 0
lowest = 0
for candidate in sentiments:
    avg = sum(sentiments[candidate]) / overall_total
    if avg < lowest:
        lowest = avg
# display sentiment values for each candidate to the console
print("\nRelative Sentiment Values:")
print("(normalized to 0, higher is more positive)\n")
for candidate in sentiments:
def _extract_document_summations(cls):
    print "\nExtracting document tf-idf summations for use in Vector Space Cosine..."
    if ENV.PROGRESS_BAR == True:
        util.update_progress(0)
    # for every term in our posting list
    for idx, term in enumerate(cls.posting_list):
        if ENV.PROGRESS_BAR == True:
            util.update_progress(float(idx) / float(len(cls.posting_list)))
        docs = cls.posting_list[term]
        # run through the documents for each term and add the additional tfidf to an accumulation in the dict
        for doc in docs:
            tfidf_addition = qp.calculate_tf_idf(doc[1], cls.get_df_by_term_id(term), len(cls.doc_list.keys()))
            tfidf_addition_squared = np.square(tfidf_addition)
            if 'tf_idf_sum' in cls.doc_list[doc[0]]:
                cls.doc_list[doc[0]]['tf_idf_sum'] += tfidf_addition_squared
            else:
                cls.doc_list[doc[0]]['tf_idf_sum'] = tfidf_addition_squared
    if ENV.PROGRESS_BAR == True:
        util.update_progress(1)

    print "\nExtracting document weight summations for use in Vector Space Cosine..."
    if ENV.PROGRESS_BAR == True:
        util.update_progress(0)
    # Again, we run through each term in our posting list
    for idx, term in enumerate(cls.posting_list):
        if ENV.PROGRESS_BAR == True:
            util.update_progress(float(idx) / float(len(cls.posting_list)))
        docs = cls.posting_list[term]
        # each doc within each term has the VS weight calculated for the terms to find a summation
        for doc in docs:
            weight_addition = float(qp.calculate_tf_idf(doc[1], cls.get_df_by_term_id(term),
                                                        len(cls.doc_list.keys()))) / float(cls.doc_list[doc[0]]['tf_idf_sum'])
            weight_addition_squared = np.square(weight_addition)
            if 'sum_weight' in cls.doc_list[doc[0]]:
                cls.doc_list[doc[0]]['sum_weight'] += weight_addition_squared
            else:
                cls.doc_list[doc[0]]['sum_weight'] = weight_addition_squared
    if ENV.PROGRESS_BAR == True:
        util.update_progress(1)
        shell=True
    )
    duration = float(output)
    if prune_min or prune_max:
        duration_fit = True
        if prune_min:
            if duration < args.min_duration:
                duration_fit = False
        if prune_max:
            if duration > args.max_duration:
                duration_fit = False
        if duration_fit:
            new_files.append((files[x], duration))
    else:
        new_files.append((files[x], duration))
    update_progress(x / float(size))

print("\nSorting files by length...")


def func(element):
    return element[1]


new_files.sort(key=func)
print("Saving new manifest...")
with io.FileIO(args.output_path, 'w') as f:
    for file_path in new_files:
        sample = file_path[0].strip() + '\n'