def run():
    start_year = args().start_year
    LOGGER.info("checking total page count")
    url = '%s%s' % (BASE_URL, args().title)
    r = make_req(url)
    r = make_req("%s?searchform.when.from=%s-01-01&a=search" % (r.url, start_year))
    bs = beautify(r.text)
    if args().page_range:
        start, end = args().page_range.split('-')
        start, end = int(start), int(end)
    else:
        start, end = find_page_count(bs)
    if all([start, end]):
        import multiprocessing
        from time import sleep
        from progressbar import ProgressBar, Bar, FormatLabel, RotatingMarker

        LOGGER.info("generating urls between pages %d & %d for [%d - present]"
                    % (start, end, start_year))
        urls = generate_urls(r.url, start, end)
        LOGGER.info("scraping dates from entries...")
        pool = multiprocessing.Pool(processes=8)
        results = pool.map_async(walk_page, urls, callback=squash_results)
        pool.close()
        remaining = results._number_left
        progress_bar = ProgressBar(widgets=[
            FormatLabel('-> '), RotatingMarker(), FormatLabel(' '), Bar()
        ], maxval=remaining).start()
        while True:
            if results.ready():
                break
            progress_bar.update(remaining - results._number_left)
            sleep(.05)
        progress_bar.finish()
        if results.ready():
            import itertools
            from sourgraph.graphs import make_graph

            title = args().title.lower()
            sorted_result_list = sorted(list(itertools.chain(*crawl_results)))
            LOGGER.info("generating graph...")
            top_date = make_graph(sorted_result_list, title=title,
                                  start_year=args().start_year, trim=args().trim)
            LOGGER.info("graph saved")
            if top_date and args().with_news:
                from sourgraph.web.hurriyet import return_news_url

                LOGGER.info("checking news...")
                news_url = return_news_url(top_date, title)
                if news_url:
                    LOGGER.info("news url: %s" % news_url)
                    import webbrowser
                    LOGGER.info("opening url...")
                    webbrowser.open(news_url, new=2)
                else:
                    LOGGER.info("couldn't find any news for '%s'" % title)
    LOGGER.info("bye!")
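# The run() loop above tracks progress by polling results._number_left, a
# private attribute of multiprocessing's AsyncResult that can change between
# Python versions. A minimal sketch of the same progress pattern without the
# private attribute, assuming only the standard library and the classic
# progressbar API used throughout these snippets; walk_page here is a
# placeholder worker, not the real scraper:
import multiprocessing

from progressbar import Bar, FormatLabel, ProgressBar


def walk_page(url):
    return url.upper()  # stand-in for the real page walker


def track_pool_progress(urls):
    pool = multiprocessing.Pool(processes=8)
    progress = ProgressBar(widgets=[FormatLabel('-> '), Bar()],
                           maxval=len(urls)).start()
    results = []
    # imap_unordered yields each result as a worker finishes, so the loop
    # itself is the progress signal -- no private attributes needed
    for done, result in enumerate(pool.imap_unordered(walk_page, urls), 1):
        results.append(result)
        progress.update(done)
    pool.close()
    pool.join()
    progress.finish()
    return results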
import atexit
from os.path import isfile

from pymongo import MongoClient
from pybrain.datasets import SupervisedDataSet
from progressbar import ProgressBar, FormatLabel, ETA, Percentage, Bar

client = MongoClient()
db = client.dotabot
matches = db.matches

# Dataset manipulation
if isfile(TRAIN_FILE_NAME) and isfile(VALIDATION_FILE_NAME) and isfile(TEST_FILE_NAME):
    test_ds = SupervisedDataSet.loadFromFile(TEST_FILE_NAME)
    valid_ds = SupervisedDataSet.loadFromFile(VALIDATION_FILE_NAME)
    train_ds = SupervisedDataSet.loadFromFile(TRAIN_FILE_NAME)
    print "Training, validation and test datasets loaded"
else:
    ds = SupervisedDataSet(NUM_FEATURES, 1)
    widgets = [FormatLabel('Processed: %(value)d/%(max)d matches. '),
               ETA(), ' ', Percentage(), ' ', Bar()]
    pbar = ProgressBar(widgets=widgets, maxval=NUM_MATCHES).start()
    seen = set()
    r, d = 0, 0
    for i, record in enumerate(matches.find()):
        if record['match_id'] in seen:
            # print "Ignore redundant match {0}".format(record['match_id'])
            continue
        if not is_valid_match(record):
            # print "Ignore invalid match {0}".format(record['match_id'])
            continue
        seen.add(record['match_id'])
        y = 1.0 if record['radiant_win'] else 0.0
        if record['radiant_win']:
import os

from progressbar import FormatLabel, Bar

CONFIG_PATH = os.path.expanduser(
    os.path.join(os.getenv("XDG_CONFIG_HOME", "~/.config"), "gphoto_backup"))

PBAR_WIDGETS = [FormatLabel("|%(value)d/%(max)d Albums"), Bar()]


def clear_progressbar(pbar):
    pbar.fd.write('\r' + (' ' * pbar.term_width) + '\r')


def mkdir_if_needed(path):
    try:
        os.mkdir(path)
    except OSError:
        pass
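# clear_progressbar() blanks the bar's terminal line so ordinary print output
# does not interleave with the bar. A minimal usage sketch, assuming the
# classic progressbar API used above; the album list is made up:
from progressbar import ProgressBar

albums = ["Holiday 2014", "Garden", "Misc"]  # hypothetical data
pbar = ProgressBar(widgets=PBAR_WIDGETS, maxval=len(albums)).start()
for n, album in enumerate(albums, 1):
    clear_progressbar(pbar)              # wipe the bar's line first...
    print("downloading %s" % album)      # ...so this lands on a clean line
    pbar.update(n)                       # then redraw the bar below it
pbar.finish()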
# we create a LineString for each Way, and we append it to the route ... (1)
for node in nodes.nodes:
    way.append((node.lon, node.lat))
routes[routename].append(LineString(way))

for route in routes:
    # (1) ... and then we merge it into a single line here; it's important to note that
    # linemerge() returns a LineString, or a MultiLineString when lines are not contiguous
    routes[route] = linemerge(routes[route])

print('%d routes found' % len(routes), flush=True)

results = []
pbar = ProgressBar(
    widgets=[FormatLabel('Routes processed: %(value)d of %(max)d - '), ETA()],
    maxval=len(routes)).start()

for i, route in enumerate(sorted(routes)):
    # see the above note on linemerge(): we handle the MultiLineString case by forcing
    # lines to always be a list, possibly made of a single item
    if type(routes[route]) == LineString:
        lines = [routes[route]]
    else:
        lines = routes[route]

    gmap = gmplot.GoogleMapPlotter(center_lng=lines[0].centroid.x,
                                   center_lat=lines[0].centroid.y,
def zonaprop(driver, parametros, log, inputfile=None, tmpdir=os.getcwd(), inputparam=None):

    def _get_element(xpath):
        try:
            return driver.find_element_by_xpath(xpath).text
        except Exception:
            return ""

    def _get_dat_from_ul(xpath):
        d = {
            "ambientes": None,
            "baños": None,
            "antiguedad": None,
            "superficie": None,
            "superficie_cubierta": None,
            "cochera": "No",
            "toilette": "No",
        }
        try:
            ul = driver.find_element_by_xpath(xpath)
            for text in [el.text for el in ul.find_elements_by_tag_name("li")]:
                if "Ambientes" in text:
                    d["ambientes"] = text.split(" ")[0]
                if "Baños" in text:
                    d["baños"] = text.split(" ")[0]
                if "Antigüedad" in text:
                    d["antiguedad"] = text.split(" ")[0]
                if "Total" in text:
                    d["superficie"] = text.split(" ")[0]
                if "Cubierta" in text:
                    d["superficie_cubierta"] = text.split(" ")[0]
                if "Cochera" in text:
                    d["cochera"] = "Si"
                if "Toilette" in text:
                    d["toilette"] = "Si"
        except Exception:
            # fall through and return the defaults; returning a list here
            # would crash the d["..."] lookups in _get_propiedad_data
            pass
        return d

    def _get_propiedad_data(url_propiedad):
        driver.get(url_propiedad)
        precio = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, parametros["precio_xpath"]))
        )
        tipo_op = _get_element(parametros["price_type"])
        expensas = "'" + _get_element(parametros["expensas_xpath"])
        descripcion = _get_element(parametros["descripcion_xpath"])
        datos = _get_dat_from_ul(parametros["datos_ul_xpath"])
        mts_totales = datos["superficie"]
        mts_cubiertos = datos["superficie_cubierta"]
        ambientes = datos["ambientes"]
        baños = datos["baños"]
        cochera = datos["cochera"]
        toilette = datos["toilette"]
        antiguedad = datos["antiguedad"]
        direccion = _get_element(parametros["dir_xpath"])
        publicado = _get_element(parametros["publicado_xpath"])
        inmobiliaria = _get_element(parametros["inmobiliaria_xpath"])
        return (
            descripcion,
            direccion.replace('\r', '').replace('\n', '').replace('Ver en mapa', ''),
            precio.text,
            expensas,
            mts_totales,
            mts_cubiertos,
            ambientes,
            baños,
            toilette,
            cochera,
            antiguedad,
            publicado,
            inmobiliaria,
            url_propiedad,
        )

    datos = [('Detalle', 'Dirección', 'Precio', 'Expensas', 'Mts Totales',
              'Mts Cubiertos', 'Ambientes', 'Baños', 'Toilette', 'Cochera',
              'Antiguedad', 'Publicado', 'Inmobiliaria', 'URL')]
    urls = list()
    if inputparam is not None:
        urls = [inputparam]
    else:
        with open(inputfile, "r") as f:
            urls = f.readlines()

    widgets = [FormatLabel(''), ' ', Percentage(), ' ', Bar('#'), ' ', ETA(),
               ' ', RotatingMarker()]
    bar = ProgressBar(widgets=widgets, maxval=len(urls))
    i = 1
    for url_propiedad in urls:
        url_propiedad = url_propiedad.strip()
        try:
            datos.append(_get_propiedad_data(url_propiedad))
        except Exception as err:
            vacio = list("" for _ in range(len(datos[0])))
            vacio[-1] = "!!!Error: {0}".format(err)
            datos.append(tuple(vacio))
        widgets[0] = FormatLabel('[Prop: {0}]'.format(url_propiedad))
        bar.update(i)
        i = i + 1
    bar.finish()
    driver.quit()
    return datos
busroutes = sorted(glob.glob('tfl_bus_routes/*.json'))

# we load the JSONs only once
print('Caching routes JSONs...')
routes = {}
for route in busroutes:
    with open(route) as f:
        routes[route] = json.load(f)

pairs = [x for x in combinations(busroutes, 2)]

results = []
pbar = ProgressBar(
    widgets=[FormatLabel('Pairs processed: %(value)d of %(max)d - '), ETA()],
    maxval=len(pairs)).start()

for i, (route1, route2) in enumerate(pairs):
    r1 = routes[route1]
    r2 = routes[route2]

    # don't process the same line
    if r1['lineName'] == r2['lineName']:
        continue

    # we first check the end of r1 and the start of r2
    r = three_stops_distance(DISTANCE,
                             r1['stopPointSequences'][0]['stopPoint'][-3:],
                             r2['stopPointSequences'][0]['stopPoint'][:3])
def main():
    catalog = {}
    curr_data_date = None
    # Add some more to prevent error when new stocks found
    total = _total_stocks() + 10
    widgets = [
        FormatLabel('Processed: %(value)d / {0} (in: %(elapsed)s)'.format(total))
    ]
    pbar = ProgressBar(widgets=widgets, maxval=total)
    count = 0
    pbar.start()
    state = common.load_state()
    for catalog_key, url in CATELOG.items():
        data_date, result = get_category_stock_info(url)
        if not result:
            raise Exception('Empty parsing result, key: {}, url: {}'.format(
                catalog_key, url))
        if curr_data_date is None:
            curr_data_date = data_date
        elif curr_data_date != data_date:
            msg = 'Data date is not the same!'\
                  ' curr_data_date: %s, data_date: %s, url: %s'\
                  % (curr_data_date, data_date, url)
            common.report_error(msg)
            raise Exception(msg)
        stype, category = catalog_key
        for stock_no, data in result.items():
            stock_data = common.load_stock(stock_no)
            daily_report = stock_data.setdefault(common.DAILY, {})
            meta = stock_data.setdefault(common.META, {})
            daily_report[data_date] = data
            category_key = SEPARATOR.join(catalog_key)
            meta.update({
                common.META_STOCK_NO: stock_no,
                common.META_COMPANY_TYPE: stype,
                common.META_COMPANY_CATEGORY: category,
                common.META_CATEGORY_KEY: category_key,
                common.META_NAME: data.pop('name'),
                common.META_DAYS: sorted(daily_report.keys(), reverse=True),
            })
            common.save_stock(stock_no, stock_data)
            catalog.setdefault(category_key, []).append(stock_no)
            pbar.update(count)
            count += 1
        if not catalog.setdefault(SEPARATOR.join(catalog_key), []):
            common.report_error('NO STOCK FOUND!!!! %s, %s' % (catalog_key, url))
    common.save_catalog(catalog)
    state[common.CURRENT_DATA_DATE] = curr_data_date
    common.save_state(state)
    pbar.finish()
def transfer(read_from, save_to):
    click.echo('%s --> %s' % (read_from, save_to))
    if read_from not in OPTIONS or save_to not in OPTIONS:
        print 'Should be %s or %s' % (LOCAL, FIREBASE)
        sys.exit(-1)
    if read_from == save_to:
        print 'Saving data to where it is from does not make sense.'
        sys.exit(-2)
    click.echo('This will OVERWRITE data in "%s". Are you sure? [y/N]' % save_to)
    confirm = sys.stdin.readline()
    if confirm.strip() != 'y':
        print 'byebye~'
        return

    common.READ_FROM = common.LOCAL if read_from == LOCAL else common.FIREBASE
    common.SAVE_TO = (common.LOCAL,)\
        if save_to == LOCAL else (common.FIREBASE,)

    print 'Transferring catalog...'
    catalog = common.load_catalog()
    common.save_catalog(catalog)

    print 'Transferring categories...'
    categories = common.load_categories()
    common.save_categories(categories)

    print 'Transferring filter results...'
    f_results = common.load_filter_results()
    common.save_filter_results(f_results)

    print 'Transferring indicator results...'
    i_results = common.load_indicator_results()
    common.save_indicator_results(i_results)

    print 'Transferring config...'
    config = common.load_config()
    common.save_config(config)

    todo = []
    for stocks in catalog.values():
        todo.extend(stocks)
    total = len(todo)

    print 'Transferring stocks...'
    widgets = [
        FormatLabel('Processed: %(value)d / {0} (in: %(elapsed)s)'.format(total))
    ]
    pbar = ProgressBar(widgets=widgets, maxval=total)
    count = 0
    pbar.start()
    for s in todo:
        data = common.load_stock(s)
        common.save_stock(s, data)
        pbar.update(count)
        count += 1
    pbar.finish()

    print 'Transferring state...'
    state = common.load_state()
    common.save_state(state)
    perf_nans = np.isnan(perf_array)
    if (1 - perf_nans).sum() == 0:
        raise Exception('The selected metric evaluations are all nans')
    best_perf_expes = perf_array[perf_nans == False]  # NOQA
    bool_choice = op(best_perf_expes) == np.array(best_perf_expes)
    best = ar_expes[bool_choice]  # NOQA
    best_key = ar_keys[bool_choice]
    return best[0], best_key[0]


widgets = [
    Percentage(), ' ', SimpleProgress(), ' ',
    Bar(marker='=', left='[', right=']'), ' ',
    FormatLabel('in: %(elapsed)s'), ' ', ETA(), ' | ', 'job/',
    DynamicMessage('s')
]


class Ensemble(object):
    """Base class for experiment containers able to execute batch
    sequences of actions.

    Subclasses must implement the `fit`, `fit_gen`, `fit_async` and
    `fit_gen_async` methods.

    Args:
        experiments(dict or list): experiments to be wrapped. If a
            dictionary is passed, it should map experiment names to
            experiments.
    """

    def __init__(self, experiments):
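# The widget list above ends with 'job/', DynamicMessage('s'), which renders a
# live "s: <value>" readout so the bar can display a jobs-per-second rate.
# A sketch of how such a widget is fed, assuming the progressbar2 API where
# extra keyword arguments passed to update() are routed to DynamicMessage
# widgets by name; the timing loop is illustrative only:
import time

from progressbar import ProgressBar

bar = ProgressBar(widgets=widgets, max_value=20).start()
t0 = time.time()
for n in range(1, 21):
    time.sleep(0.1)                      # stand-in for one job
    rate = n / (time.time() - t0)        # jobs completed per second
    bar.update(n, s=round(rate, 2))      # 's' matches DynamicMessage('s')
bar.finish()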
def NodeDic(results, edge_info, node_info):
    '''
    Function takes the results of running a query, NETS edge label information, and a list of node
    information (list[0] contains the NETS node label triples, list[1] contains the NETS node
    identifier triples). The function returns a list of dictionaries: list[0] contains a nested
    dictionary where keys are bio entity identifiers and the values are the human readable labels
    and database identifiers; list[1] contains a dictionary where the bio node is the key and the
    value is a set of possible NETS node types for that node.
    :param results: json file containing the query results from endpoint
    :param edge_info: dictionary where the keys are the NETS edges and the values are the edge labels
    :param node_info: a list of node information (list[0] contains the NETS node label triples,
    list[1] contains the NETS node identifier triples)
    :return: a list of dictionaries: list[0] contains a nested dictionary where keys are bio entity
    identifiers and the values are the human readable labels and database identifiers; list[1]
    contains a dictionary where the bio node is the key and the value is a set of possible NETS
    node types for that node
    '''
    print 'Start building OWL-NETS metadata dictionary'

    # creates a map to store NETS node type information
    node_type = {}
    # creates a map to identify which query variables represent the BIO world ID, label, and ICE ID
    node_labeler = {}

    # assign variables needed for node dictionary
    NETS = set([x.strip('?') for y in edge_info[0].keys() for x in y])
    labels = [[re.sub('[?|"\n"]', '', x.split(' ')[0]),
               re.sub('[?|"\n"]', '', x.split(' ')[2])] for x in node_info[0]]
    ids = [[x.split(' ')[0].strip('?'), x.split(' ')[2].strip('?')] for x in node_info[1]]

    # initialize progress bar
    widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets, maxval=len(NETS))

    for node in pbar(NETS):
        node_labeler[node] = {}

        for res in results['results']['bindings']:
            node_key = str(res[node]['value'])
            label_value = str([x[1] for x in labels if x[0] == node][0].encode('utf8'))
            id_value = str([x[0] for x in ids if x[1] == node][0].encode('utf8'))

            # NODE TYPE: setting node type information
            if node_key in node_type.keys():
                node_type[node_key].add(node)
            else:
                node_type[node_key] = set()
                node_type[node_key].add(node)

            # NODE METADATA: setting node attributes by NETS node type
            if node_key in node_labeler[node].keys():
                # order matters - not using a set so that each ICE can be mapped to the label
                # with the same index
                node_labeler[node][node_key]['label'].append(res[label_value]['value'].encode('utf8'))
                node_labeler[node][node_key]['id'].append(res[id_value]['value'].encode('utf8'))
            else:
                node_labeler[node][node_key] = {}
                node_labeler[node][node_key]['label'] = [res[label_value]['value'].encode('utf8')]
                node_labeler[node][node_key]['id'] = [res[id_value]['value'].encode('utf8')]

    # close progress bar
    pbar.finish()
    print 'Finished building OWL-NETS metadata dictionary'
    print '\n'

    # CHECK: verify that the counts are correct
    for node in NETS:
        res_count = set()
        for res in results['results']['bindings']:
            res_count.add(res[node]['value'])

        # verify the number of nodes in graph is correct
        if len(node_labeler[node].keys()) != len(res_count):
            raise ValueError('The count of results for the ' + str(node) + ' NETS node in the node '
                             'dictionary differs from the query output')

    return node_labeler, node_type
def NETSGraph(results, NETS_edges, node_labeler, node_type, edge_labeler):
    '''
    Function takes a json file of query results, a list of NETS edges, node and edge metadata
    dictionaries, and a dictionary containing NETS edge information by BIO node. Using these items
    the function creates the directed OWL-NETS abstraction network. Node metadata includes: labels
    (a list of human readable labels); id (the endpoint database identifiers); and bio (the NETS
    node type). Edge metadata includes: labels (human readable label for the edge between two NETS
    nodes) and id (the ontology concept term used to link the NETS nodes).
    :param results: json file containing the query results from endpoint
    :param NETS_edges: list of lists, where each list is a NETS edge and the order specifies a
    directional relationship
    :param node_labeler: node metadata nested lists (list[0] contains the NETS node label triples,
    list[1] contains the NETS node identifier triples)
    :param node_type: dictionary with BIO node as key and set of NETS node types as value
    :param edge_labeler: dictionary where the keys are the NETS edges and the values are the edge labels
    :return: OWL-NETS directed graph
    '''
    print 'Started building OWL-NETS graph'

    # initialize progress bar
    widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets, maxval=len(results['results']['bindings']))

    NETS_graph = nx.DiGraph()
    for res in pbar(results['results']['bindings']):
        for edge in NETS_edges:
            i = res[str(edge[0].strip('?').encode('utf8'))]['value'].encode('utf8')
            j = res[str(edge[1].strip('?').encode('utf8'))]['value'].encode('utf8')

            # set first node in edge
            NETS_graph.add_node(min(node_labeler[edge[0].strip('?')][i]['label'], key=len),
                                labels=node_labeler[edge[0].strip('?')][i]['label'],
                                id=node_labeler[edge[0].strip('?')][i]['id'],
                                bio=i,
                                type='-'.join(list(node_type[i])))
            # set second node in edge
            NETS_graph.add_node(min(node_labeler[edge[1].strip('?')][j]['label'], key=len),
                                labels=node_labeler[edge[1].strip('?')][j]['label'],
                                id=node_labeler[edge[1].strip('?')][j]['id'],
                                bio=j,
                                type='-'.join(list(node_type[j])))
            # add edge
            NETS_graph.add_edge(min(node_labeler[edge[0].strip('?')][i]['label'], key=len),
                                min(node_labeler[edge[1].strip('?')][j]['label'], key=len),
                                labels=res[(edge_labeler[tuple(edge)]['label']).strip('?')]['value'].encode('utf8'),
                                id=(edge_labeler[tuple(edge)]['id']).strip('?'),
                                edge='-'.join([edge[0].strip('?'), edge[1].strip('?')]))

    # close progress bar
    pbar.finish()
    print 'Finished building OWL-NETS graph'
    print '\n'

    # print information about graph
    print 'Directed OWL-NETS Graph has ' + str(len(NETS_graph.nodes())) + ' nodes, ' + str(
        len(NETS_graph.edges())) + ' edges, and ' + str(
        nx.number_connected_components(NETS_graph.to_undirected())) + ' connected component(s)'

    return NETS_graph
g_optimizer = torch.optim.Adam(g_net.parameters(), lr=arg.lr, betas=(0.5, 0.999))
d_optimizer = torch.optim.Adam(d_net.parameters(), lr=arg.lr, betas=(0.5, 0.999))

log_file = open(arg.log_file, 'w')
for epoch in range(1, arg.epochs + 1):
    print('Epoch: {}/{}'.format(epoch, arg.epochs))
    g_total_loss, d_total_loss = 0, 0
    widgets = [
        FormatLabel(''), ' ', Bar('=', '[', ']'), ' - ', ETA(), ' ',
        FormatLabel('')
    ]
    pbar = ProgressBar(widgets=widgets, maxval=x_train.shape[0])
    pbar.start()
    for i, (real_img, real_tag) in enumerate(train_loader):
        for p in d_net.parameters():
            p.requires_grad = True
        noise = Variable(torch.randn(real_img.size()[0], arg.noise_dim),
                         volatile=True).cuda()
        wrong_tag = Variable(get_wrong_tag(real_tag)).cuda()
        real_img = Variable(real_img).cuda()
def generate_bar(ln, text):
    return pb(min_value=0, max_value=ln, widgets=[FormatLabel(text)] + base_widgets)
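# A short usage sketch for generate_bar, assuming pb is progressbar2's
# ProgressBar class (where min_value/max_value are the constructor keywords)
# and base_widgets is a plain widget list; both names come from the snippet,
# the file list and loop body are made up:
import time

import progressbar
from progressbar import Bar, FormatLabel, Percentage

pb = progressbar.ProgressBar
base_widgets = [' ', Percentage(), ' ', Bar()]

files = ['a.csv', 'b.csv', 'c.csv']      # hypothetical work items
for name in generate_bar(len(files), 'Copying: ')(files):
    time.sleep(0.2)                      # stand-in for real work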
def finish(self, total):
    msg = '[Patching {0} ASGs]: {0} Complete'.format(total)
    self.widgets[4] = FormatLabel(msg)
    self.progress.finish()
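# This finish() and the per-URL label update in the zonaprop snippet both rely
# on the same trick: the bar keeps a reference to the widget list it was given,
# so replacing an element changes what the next redraw shows. A self-contained
# sketch of the pattern with the classic progressbar API; the ASG names are
# invented:
import time

from progressbar import Bar, FormatLabel, ProgressBar

asgs = ['web-asg', 'api-asg', 'worker-asg']          # hypothetical ASG names
widgets = [FormatLabel('starting'), ' ', Bar()]
bar = ProgressBar(widgets=widgets, maxval=len(asgs)).start()
for n, asg in enumerate(asgs, 1):
    time.sleep(0.2)                                  # stand-in for patching work
    widgets[0] = FormatLabel('[Patching %s]' % asg)  # swap the label in place
    bar.update(n)                                    # next redraw shows it
widgets[0] = FormatLabel('[Patched %d ASGs]: Complete' % len(asgs))
bar.finish()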
    for b in meta.active_branches():
        chain.SetBranchStatus(b, 1)
    chain.SetBranchStatus('run', 1)
    chain.SetBranchStatus('lumi', 1)
    chain.SetBranchStatus('evt', 1)
    for b in args.branches:
        chain.SetBranchStatus(b, 1)
except Exception:
    log.warning("Couldn't get meta tree - will not disable branches")

passed_events = []
nrows = chain.GetEntries()
pbar = ProgressBar(widgets=[
    FormatLabel('Processed %(value)i/' + str(nrows) + ' rows. '),
    ETA(), Bar('>')
], maxval=nrows).start()
pbar.update(0)
for row in xrange(nrows):
    pbar.update(row)
    chain.GetEntry(row)
    all_passed = True
    for name, selection in selections:
        passed = selection(chain)
        if not passed:
            all_passed = False
            break
def process_rocstories(stories):
    """
    - randomly select one missing sentence
    - randomly select accepted words: a list of (position, word)
    - randomly select keywords (excluding accepted words): an unordered list of words
    :param stories:
    :return:
    """
    story_size = len(stories)

    ''' Get missing sent indexes '''
    missing_sent_indexes = [random.randint(1, 3) for _ in range(story_size)]
    hist, bins = np.histogram(missing_sent_indexes, bins=3)
    print("-" * 80)
    print("Histogram of missing sent indexes")
    print(" ".join(["%5d" % b for b in bins[1:]]))
    print(" ".join(["%5d" % h for h in hist]))
    print("-" * 80)

    widgets = [
        FormatLabel('Processed: %(value)d stories (in: %(elapsed)s)'),
        Percentage(), " | ", SimpleProgress(), " | ", Bar()
    ]
    pbar = ProgressBar(widgets=widgets)

    # bucket the stories by the length of their missing sentence (clamped to [4, 13])
    stories_processed = {l: [] for l in range(4, 14)}
    for i in pbar(range(len(stories))):
        story, missing_idx = stories[i], missing_sent_indexes[i]
        entity = ROCStoriesEntity(story, missing_idx)
        l = entity.missing_sent_len_np
        if l < 5:
            l = 4
        if l > 12:
            l = 13
        stories_processed[l].append(entity)

    # first pass: select accepted words per entity
    a_lens = {}
    k_lens = {}
    for l, bucket in stories_processed.items():
        random.seed(l)
        a_lens[l] = []
        for entity in bucket:
            # select accepted words
            accepted_len = random.randint(0, l - KEYWORD_MIN - 1)
            # # select keywords
            # keywords_len = l+1
            # while accepted_len + keywords_len > l:
            #     keywords_len = random.randint(0, l-1)
            # a_lens[l].append(accepted_len)
            # k_lens[l].append(keywords_len)
            # assert (accepted_len + keywords_len) <= l, "Missing sent ACPT/KEY selection: Something went wrong: [accepted:%d][keywords:%d][sent_len:%d]"%(accepted_len, keywords_len, l)
            entity.accepted_words = [
                (i, tok.lower())
                for i, tok in zip(entity.missing_sent_tokens_randomized_np_idx,
                                  entity.missing_sent_tokens_randomized_np)
            ][:accepted_len] if accepted_len > 0 else []
            # entity.keywords = [tok for tok in entity.missing_sent_tokens_randomized_np[accepted_len:keywords_len]] if keywords_len > 0 else []
            a_lens[l].append(len(entity.accepted_words))

    # second pass: select keywords given the accepted words
    for l, bucket in stories_processed.items():
        random.seed(l)
        k_lens[l] = []
        entities_to_remove = []
        for entity in bucket:
            # accepted words were selected in the previous pass
            # accepted_len = random.randint(0, l-1)
            accepted_len = len(entity.accepted_words)
            # select keywords
            keywords_len = l + 1
            while accepted_len + keywords_len > l:
                keywords_len = random.randint(KEYWORD_MIN, l - accepted_len)
            # A little nudge to push the distribution to the right
            if accepted_len + keywords_len < l:
                if random.random() > 0.5:
                    keywords_len += 1
            assert (accepted_len + keywords_len) <= l, \
                "Missing sent ACPT/KEY selection: Something went wrong: " \
                "[accepted:%d][keywords:%d][sent_len:%d]" % (accepted_len, keywords_len, l)
            entity.keywords = [
                tok for tok in entity.missing_sent_tokens_randomized_np[
                    accepted_len:accepted_len + keywords_len]
            ] if keywords_len > 0 else []
            if len(entity.keywords) == 0:
                print("=> keywords_len: %d produced 0 keywords" % (keywords_len))
                print("\t[awl: %d], src_token_len: %d (%s)" %
                      (len(entity.accepted_words),
                       len(entity.missing_sent_tokens_randomized_np), entity.title))
                entities_to_remove.append(entity)
                continue
            k_lens[l].append(len(entity.keywords))
        for e in entities_to_remove:
            stories_processed[l].remove(e)
            print("Removing: %s" % (e.title))

    for l in a_lens:
        a_bucket = a_lens[l]
        k_bucket = k_lens[l]
        # print_histogram(l, a_bucket, "Accepted words")
        # print_histogram(l, k_bucket, "Keywords")
        print_histogram2(l, a_bucket, k_bucket, "Accepted words", "Keywords")
    return stories_processed
def update_msg(msg):
    widgets[0] = FormatLabel(f"[{i:4d}/{n_files}] {msg}")
def main():
    """ Entry point """
    working_directory = getcwd()
    parser = ArgumentParser(description='')
    parser.add_argument('-i', '--input_folder',
                        dest='input',
                        metavar='INPUT_DIRECTORY',
                        required=False,
                        default=working_directory,
                        help='Source directory for files renaming. '
                             'Current directory by default')
    args = parser.parse_args()

    files = [
        join(args.input, file) for file in listdir(args.input)
        if isfile(join(args.input, file))
    ]
    images_files = [file for file in files if is_image(file)]
    video_files = [file for file in files if is_video(file)]
    total_files = len(images_files) + len(video_files)

    widgets = [
        FormatLabel('Extracting info'), ' ', Percentage(), ' ', Bar(), ' ', ETA()
    ]
    progress_bar = ProgressBar(maxval=total_files,
                               redirect_stdout=True,
                               widgets=widgets)
    progress_bar.start()

    images_info_map = {}
    file_counter = 0
    for file in images_files:
        file_counter += 1
        progress_bar.update(file_counter)
        images_info_map[file] = exif_time_else_creation_time(file)

    video_files_map = {}
    for file in video_files:
        file_counter += 1
        progress_bar.update(file_counter)
        video_files_map[file] = creation_time(file)
    progress_bar.finish()

    image_renamings = calculate_renamings(images_info_map)
    video_renamings = calculate_renamings(video_files_map)
    image_renamings = dump_renamings(image_renamings)
    video_renamings = dump_renamings(video_renamings)

    if ask_yesno(msg='Confirm renaming', dft='y'):
        rename_files(image_renamings, label='Renaming image files')
        rename_files(video_renamings, label='Renaming video files')
preprocessed = np.load('train_51022.npz')
X = preprocessed['X']
Y = preprocessed['Y']

NUM_MATCHES = 20000
X = X[0:NUM_MATCHES]
Y = Y[0:NUM_MATCHES]

print 'Training using data from %d matches...' % NUM_MATCHES

k_fold = cross_validation.KFold(n=NUM_MATCHES, n_folds=K, indices=True)
d_tries = [3, 4, 5]

widgets = [
    FormatLabel('Processed: %(value)d/%(max)d folds. '), ETA(),
    Percentage(), ' ', Bar()
]
pbar = ProgressBar(widgets=widgets, maxval=(len(d_tries) * K)).start()

d_accuracy_pairs = []
for d_index, d in enumerate(d_tries):
    model = KNeighborsClassifier(n_neighbors=NUM_MATCHES / K,
                                 metric=my_distance,
                                 weights=poly_param(d))
    model_accuracies = cross_validation.cross_val_score(model, X, Y,
                                                        scoring=score,
class ScrapeJam:
    # Here's your chance, do your dance, at the ScrapeJam
    widgets = [
        Percentage(),
        Bar(),
        FormatLabel(' %(value)d/%(max)d '),
        ETA(),
        FormatLabel(' (%(elapsed)s)')
    ]

    def __init__(self, filepath, errorlog=None):
        self.file = filepath
        self.log = errorlog
        self.win = curses.initscr()
        curses.start_color()
        curses.curs_set(0)
        curses.noecho()
        curses.cbreak()
        self.refresh()

    def __del__(self):
        pass

    def write(self, file, data):
        f = open(file, 'w')
        json.dump(data, f, encoding='utf-8')
        f.close()

    def refresh(self, clear=True):
        if clear:
            self.win.clrtobot()
        self.win.refresh()

    def move(self, y, x):
        self.win.move(y, x)
        self.refresh(False)

    # TODO: fill in UUIDs
    def scrape(self, artists, album_fn, song_fn, lyric_fn, errorlog=None):
        """Scrapes lyrics for every artist/album/song combination, drawing
        nested curses progress bars along the way.

        Args:
            artists     List of tuples (artist_name, artist_url)
            album_fn    Func(artist_tuple): returns list of tuples (album_name, album_url)
            song_fn     Func(artist_tuple, album_tuple): returns list of tuples (song_name, song_url)
            lyric_fn    Func(artist_tuple, album_tuple, song_tuple): returns lyrics or None
        """
        def errorwrap(fn):
            def wrapped(*args, **kwargs):
                try:
                    return fn(*args, **kwargs)
                except Exception as e:
                    error(e)
                    return []  # In the case that error() just logged and continued
            return wrapped

        def error(e):
            if self.log:
                # Log the error and continue on your merry way (or explode w/o log file)
                self.errorlist.append(
                    {artist[0]: [song[1], traceback.format_exc()]})
            else:
                raise e

        album_fn = errorwrap(album_fn)
        song_fn = errorwrap(song_fn)
        lyric_fn = errorwrap(lyric_fn)

        i = [0, 0, 0]     # Incrementors for artists, albums, and songs
        done = [0, 0, 0]  # Counter for completed scrapes
        data = {}
        self.errorlist = []
        try:
            self.artists_pbar = ProgressBar(widgets=[' Artists:'] + self.widgets,
                                            maxval=len(artists)).start()
            for artist in artists:
                albums = album_fn(artist)
                if len(albums) == 0:
                    continue
                i[SJ_ALBUM] = 0
                data[artist[0]] = {'albums': {}, 'uuid': None, 'url': artist[1]}
                self.albums_pbar = ProgressBar(widgets=[' Albums: '] + self.widgets,
                                               maxval=len(albums)).start()
                for album in albums:
                    songs = song_fn(artist, album)
                    if len(songs) == 0:
                        continue
                    i[SJ_SONG] = 0
                    data[artist[0]]['albums'][album[0]] = {'songs': {}, 'uuid': None, 'url': album[1]}
                    self.songs_pbar = ProgressBar(widgets=[' Songs: '] + self.widgets,
                                                  maxval=len(songs)).start()
                    for song in songs:
                        # lyrics = lyric_fn(artist, album, song)
                        lyrics = 'xxx'
                        if len(lyrics) == 0:
                            continue
                        data[artist[0]]['albums'][album[0]]['songs'][song[0]] = {
                            'lyrics': lyrics, 'uuid': None, 'url': song[1]
                        }
                        i[SJ_SONG] += 1
                        done[SJ_SONG] += 1
                        self.drawProgress((artist[0], album[0], song[0]), i, done)
                    i[SJ_ALBUM] += 1   # Completed an album
                    done[SJ_ALBUM] += 1
                i[SJ_ARTIST] += 1      # Completed an artist
                done[SJ_ARTIST] += 1
                htmlCache = {}         # Reset HTML cache after each artist
        except Exception:
            curses.nocbreak()
            curses.echo()
            curses.endwin()
            traceback.print_exc()
            print "Ended on (%s) (%s) (%s)" % (artist[0], album[0], song[0])
        except KeyboardInterrupt:
            pass
        finally:
            # Must be run to restore terminal's state to normal
            curses.nocbreak()
            curses.echo()
            curses.endwin()
            self.write(self.file, data)
            if self.log and len(self.errorlist) != 0:
                self.write(self.log, self.errorlist)

    def drawProgress(self, names, values, done):
        self.move(0, 0)
        self.artists_pbar.update(values[0])
        self.move(1, 0)
        self.albums_pbar.update(values[1])
        self.move(2, 0)
        self.songs_pbar.update(values[2])
        self.win.addstr(4, 0, " Artist: " + names[0].encode('utf8'))
        self.refresh()
        self.win.addstr(5, 0, " Album:  " + (names[1] if names[1] else "N/A").encode('utf8'))
        self.refresh()
        self.win.addstr(6, 0, " Song:   " + names[2].encode('utf8'))
        self.refresh()
        self.win.addstr(8, 0, " COMPLETED")
        self.win.addstr(9, 0, " Artists: %d  Albums: %d  Songs: %d" % tuple(done))
        if self.log:
            self.win.addstr(10, 0, " Fatal errors: %d" % len(self.errorlist))
        self.refresh()
        self.move(2, 0)
NUM_HEROES = 108
NUM_FEATURES = NUM_HEROES * 2

# Our training label vector, Y, is a bit vector indicating
# whether radiant won (1) or lost (-1)
NUM_MATCHES = matches.count()

# Initialize training matrix
X = np.zeros((NUM_MATCHES, NUM_FEATURES), dtype=np.int8)
# Initialize training label vector
Y = np.zeros(NUM_MATCHES, dtype=np.int8)

widgets = [
    FormatLabel('Processed: %(value)d/%(max)d matches. '), ETA(),
    Percentage(), ' ', Bar()
]
pbar = ProgressBar(widgets=widgets, maxval=NUM_MATCHES).start()

for i, record in enumerate(matches.find()):
    pbar.update(i)
    Y[i] = 1 if record['radiant_win'] else -1
    players = record['players']
    for player in players:
        hero_id = player['hero_id'] - 1
        # If the left-most bit of player_slot is set,
        # this player is on dire, so push the index accordingly
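# The truncated loop above is building a two-hot encoding: columns
# 0..NUM_HEROES-1 mark radiant picks, columns NUM_HEROES..2*NUM_HEROES-1 mark
# dire picks, and the Dota 2 API sets bit 7 (0x80) of player_slot for dire
# players. A sketch of how the body plausibly continues, based on that
# comment; the exact indexing is an assumption, not the author's confirmed
# code:
for player in players:
    hero_id = player['hero_id'] - 1      # hero ids are 1-based
    if player['player_slot'] & 0x80:     # dire player: second half of the row
        X[i, NUM_HEROES + hero_id] = 1
    else:                                # radiant player: first half
        X[i, hero_id] = 1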
# We do this here to prevent ROOT from messing with sys.argv
import ROOT

if not os.path.exists(args.outputdir):
    os.makedirs(args.outputdir)

log.info("Finding input files for job: %s in %s" % (args.jobid, args.directory))

for sample_name, search_dir, all_files in find_sample_dirs(
        args.directory.split(':'), args.jobid):
    output_txt = os.path.join(args.outputdir, sample_name + '.txt')
    previous_files = get_previous_files(output_txt)
    with open_update_if_changed(output_txt, sample_name) as flist:
        pbar = ProgressBar(widgets=[
            FormatLabel('Checked %(value)i/' + str(len(all_files)) + ' files. '),
            ETA(), Bar('>')
        ], maxval=len(all_files)).start()
        for i, file in enumerate(all_files):
            pbar.update(i)
            filepath = file
            if args.relative:
                filepath = os.path.relpath(file, search_dir)
            # Always write if we have found + checked it OK before
            if not args.nocheck and (args.force or file not in previous_files):
                tfile = ROOT.TFile.Open(file)
                if not tfile:
                    log.warning("-- Can't open file: %s" % file)
                    flist.write('# corrupt %s\n' % filepath)
                    continue
def parse_mails(indir, outdir):
    global firstnames
    global lastnames
    firstnames = open(FIRSTNAMES_FILE, 'r').read().split('\n')
    lastnames = open(LASTNAMES_FILE, 'r').read().split('\n')
    all_references = {}
    all_other = []
    print()
    print('****************** Mail Converter for mbox-Format Emails ************************')
    print('         V1.0 - 2017-08-01, Copyright (c) 2017 MUNICH AILABS GmbH')
    print('         Written: 2017-08-01 ... 15, Imdat Solak')
    print('         All rights reserved.')
    print('----------------------------------------------------------------------------------')
    print()
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    # First collect all mails.
    # Some may contain references, others may not...
    print('Scanning directory [%s]... ' % indir, end='')
    filenames = []
    sys.stdout.flush()
    for root, dirs, files in os.walk(indir):
        for filename in filter(lambda filename: filename.endswith('.eml'), files):
            if not filename.startswith('._'):
                # use root, not indir, so mails in subdirectories resolve correctly
                filenames.append(os.path.join(root, filename))
    print('done')
    print('Parsing files...')
    widgets = [FormatLabel('File: [%(value)s/' + str(len(filenames)) + ']'), ' ',
               Percentage(), ' ', Bar(marker='@', left='[', right=']'), ' ', ETA()]
    pBar = ProgressBar(widgets=widgets, maxval=len(filenames)).start()
    for i, filename in enumerate(filenames):
        pBar.update(i)
        raw_message = codecs.open(filename, 'r', 'utf-8').read()
        msg = email.message_from_string(raw_message)
        mail = ASCIIMail(msg, os.path.join(outdir, os.path.basename(filename) + '.json'))
        mail.parse()
        if mail.reference is not None:
            if all_references.get(mail.reference, None) is None:
                all_references[mail.reference] = mail
            else:
                all_references[mail.reference].append_reference(mail)
        else:
            all_other.append(mail)
    pBar.finish()
    print('Merging mails...')
    widgets = [FormatLabel('File: [%(value)s/' + str(len(all_references.keys())) + ']'), ' ',
               Percentage(), ' ', Bar(marker='@', left='[', right=']'), ' ', ETA()]
    pBar = ProgressBar(widgets=widgets, maxval=len(all_references.keys())).start()
    # Now check for references...
    for i, a_ref in enumerate(all_references.keys()):
        pBar.update(i)
        amail = all_references[a_ref].parse_references()
        all_other.append(amail)
    pBar.finish()
    all_references = {}
    # Now save the found emails...
    print('Saving files... ', end='')
    sys.stdout.flush()
    for amail in all_other:
        amail.save()
    print('done')
log.info("Merging %i input ROOT files", len(flat_files)) # Loop over (in, out) pairs for tree, h5name in zip(args.trees[::2], args.trees[1::2]): log.info("Writing input %s to output %s", tree, h5name) chain = ROOT.TChain(tree) for file in flat_files: chain.Add(file) entries = chain.GetEntries() log.info("There are %i rows in the input", entries) pbar = ProgressBar(widgets=[ FormatLabel('Processed %(value)i/' + str(len(flat_files)) + ' files. '), ETA(), Bar('>') ], maxval=len(flat_files)).start() table = None ROOT.TTreeCache.SetLearnEntries(1) time_in_read = 0 time_in_append = 0 processed_files = 0 try: for file_chunk in chunk_files(flat_files, args.chainsize):
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = torch.nn.CrossEntropyLoss()

print('Start training.')
best_ed = 999
early_stop_cnt = 0
for epoch in range(1, epochs + 1):
    print('Epoch: {}/{}'.format(epoch, epochs))
    total_loss, total_acc, nonzeros = 0, 0, 0
    widgets = [
        FormatLabel(''), ' ', Bar('=', '[', ']'), ' - ', ETA(), ' ',
        FormatLabel('')
    ]
    pbar = ProgressBar(widgets=widgets, maxval=x_train.shape[0])
    pbar.start()
    for i, (x_batch, y_batch) in enumerate(train_loader):
        # Tensor to variable
        x_batch = Variable(x_batch).cuda()
        y_batch = Variable(y_batch).cuda()
        # Optimize
        output = model(x_batch)
        loss = loss_func(output.view(-1, output.size(-1)),
            'parents': parent_terms
        }
    }
    return data


print('Loading de-wiktionary.json...', end='')
sys.stdout.flush()
pages = json.load(codecs.open('in/de-wiktionary.json', 'r', 'utf-8'))
print(' done')

print('Parsing...', end='')
sys.stdout.flush()
result = {}
num_articles = len(pages)
widgets = [
    FormatLabel(' Article: %(message)s [%(value)s/' + str(num_articles) + ']'), ' ',
    Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()
]
pBar = ProgressBar(widgets=widgets, maxval=num_articles).start()
for i, page in enumerate(pages):
    pBar.update(i, page['title'])
    result.update(clean_article(page))
pBar.finish()
json.dump(result, codecs.open('out/de-wiktionary-db.json', 'w', 'utf-8'), indent=4)
def example16():
    widgets = [FormatLabel('Bouncer: value %(value)d - '), BouncingBar()]
    pbar = ProgressBar(widgets=widgets)
    for i in pbar((i for i in range(180))):
        time.sleep(0.05)
def get_pattern_features(infile):
    pool = mp.Pool(processes=mp.cpu_count() - 1)
    text_dict = defaultdict(lambda: defaultdict(Counter))
    tknzr = TweetTokenizer(reduce_len=True)
    pat_counter = Counter()
    pat_list = set()
    text_list = list()
    id_list = list()
    y = list()

    widgets = [FormatLabel('Processed: %(value)d records (in: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets)
    with open(infile, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        header = next(spamreader)
        for row in pbar((row for row in spamreader)):
            text_list.append(row)
            id_list.append(row[0])
            y.append(row[1])
    pbar.finish()

    # split the records into one chunk per worker; the last chunk also
    # takes the remainder that integer division leaves over
    cpus = mp.cpu_count() - 1
    unit = int(len(text_list) / cpus)
    text_chunks = [text_list[i * unit:i * unit + unit] for i in range(cpus)]
    text_chunks[cpus - 1].extend(text_list[unit * cpus:])

    res = [
        pool.apply_async(get_pattern_counter, (text_chunks[i], ))
        for i in range(cpus)
    ]
    pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(res)).start()
    index = 0
    pool.close()
    pool.join()
    print("Extracting Eric's patterns...")
    for item in res:
        pat_counter += item.get()
        pbar.update(index + 1)
        index += 1
    pbar.finish()

    # keep the 5000 most and least frequent patterns
    pat_list = list(zip(*(pat_counter.most_common()[0:5000]))[0]) + list(
        zip(*(pat_counter.most_common()[-5000:-1]))[0])
    print len(pat_list)

    X = list()
    pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(id_list)).start()
    index = 0
    print("Generating Eric's pattern features...")
    for i, idx in enumerate(id_list):
        X.append([text_dict[idx][y[i]][pat] for pat in pat_list])
        pbar.update(index + 1)
        index += 1
    pbar.finish()
    X = np.array(X)
    y = np.array(y)
    print(X.shape)
    print(y.shape)
    return X, y, id_list
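# The chunking arithmetic above (unit rows per worker, remainder appended to
# the last chunk) is a common way to fan work out over apply_async. A
# self-contained sketch of the same split-and-merge pattern with a trivial
# counting worker; all names here are illustrative, not from the snippet:
import multiprocessing as mp
from collections import Counter


def count_tokens(rows):
    c = Counter()                    # trivial stand-in for get_pattern_counter
    for row in rows:
        c.update(row.split())
    return c


if __name__ == '__main__':
    rows = ['a b', 'b c', 'c d', 'a a', 'd d d'] * 7     # 35 records
    cpus = max(mp.cpu_count() - 1, 1)
    unit = len(rows) // cpus
    chunks = [rows[k * unit:(k + 1) * unit] for k in range(cpus)]
    chunks[-1].extend(rows[unit * cpus:])                # remainder to last chunk

    pool = mp.Pool(processes=cpus)
    partials = [pool.apply_async(count_tokens, (chunk,)) for chunk in chunks]
    pool.close()
    pool.join()

    total = Counter()
    for p in partials:
        total += p.get()                                 # merge partial counts
    print(total.most_common(3))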
def example11():
    widgets = [FormatLabel('Processed: %(value)d lines (in: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets)
    for i in pbar((i for i in range(150))):
        time.sleep(0.1)
def gather_result():
    start_cur = 0
    end_cur = len(gl.list_OK)
    bar_length = len(gl.list_OK)
    if gl.g_time != 0:
        for index in range(len(gl.list_OK)):
            if right_swich(gl.list_OK[index][4]) > swich_time(gl.g_time):
                start_cur = index
                break
    if gl.g_number != -1:
        end_cur = start_cur + gl.g_number
        bar_length = gl.g_number
    widgets = [
        'Gather result: ', Percentage(), ' ',
        Bar(marker='|', left='|', right='|'),
        '[', FormatLabel('%(elapsed)s'), ']'
    ]
    if gl.g_number == 0 or len(gl.list_OK) == 0:
        end_cur = 100
        bar_length = 100
    pbar = ProgressBar(widgets=widgets, maxval=bar_length)
    pbar.start()
    diff_time = 0
    glb_line = ''
    glb_line_number = 0
    tar_handle = open(gl.file_output, 'w')
    if gl.file_flag == 2:
        tar_handle.write('QuoteId\tQuoteId\tOrderBookID\tDiffTime\n')
    elif gl.file_flag == 1:
        tar_handle.write('bid_quote_id\task_quote_id\tfeedcode\tdiff_time\n')
    try:
        for cur_i in range(start_cur, end_cur):
            if gl.g_number == 0 or len(gl.list_OK) == 0:
                pass
            else:
                if cur_i < len(gl.list_OK):
                    for cur_j in range(3):
                        glb_line = glb_line + get_value(gl.list_OK[cur_i][cur_j]) + '\t'
                    diff_time = right_swich(gl.list_OK[cur_i][-2]) - right_swich(gl.list_OK[cur_i][-1])
                    if cur_i == 0:
                        gl.max_time = diff_time
                        gl.min_time = diff_time
                    elif gl.max_time < diff_time:
                        gl.max_time = diff_time
                    elif gl.min_time > diff_time:
                        gl.min_time = diff_time
                    gl.average_time += diff_time
                    glb_line = glb_line + str(diff_time) + '\t\n'
                    tar_handle.write(glb_line)
                    glb_line_number += 1
                    glb_line = ''
            pbar.update(cur_i)
        pbar.finish()
    finally:
        tar_handle.close()
    if gl.g_number == 0 or len(gl.list_OK) == 0:
        gl.min_time = 0
    else:
        gl.average_time = gl.average_time / float(glb_line_number)
    print "--------------------------------"
    print "%15s%d\n%15s%d\n%15s%d\n%15s%f" % (
        "Total Number:", glb_line_number,
        "Max_time:", gl.max_time,
        "Min_time:", gl.min_time,
        "Average_time:", gl.average_time)
    print "--------------------------------"