def pipeline_onehot(titles, descriptions, tags):
    """Encode each item's text context as a fixed-width row of vocabulary ids.

    For item ``i`` the title, the description and the space-joined tags are
    concatenated into one document.  A ``CountVectorizer(min_df=5)`` is fitted
    on all documents; every document is then represented by the (1-based)
    column indices of the vocabulary terms it contains, truncated / right
    padded with zeros to 500 entries.

    :param titles: sequence of title strings
    :param descriptions: sequence of description strings, parallel to titles
    :param tags: sequence of tag lists, parallel to titles
    :return: ``(feat_flatten, vectorizer)`` -- a CSR matrix with one row of
        500 indices per document, and the fitted vectorizer
    """
    max_len = 500

    docs = [
        u'{} {} {}'.format(title, descriptions[i], ' '.join(tags[i]))
        for i, title in enumerate(titles)
    ]

    vectorizer = CountVectorizer(min_df=5)
    X = vectorizer.fit_transform(docs)

    bar = Bar('Extracting features...', max=len(docs))
    idx_docs = []
    for idoc in range(len(docs)):
        # +1 so that 0 stays free to act as the padding value.
        idx_docs.append((X[idoc].nonzero()[1] + 1).tolist())
        bar.next()
    bar.finish()

    bar = Bar('Merging into one matrix...', max=len(idx_docs))
    rows = []
    for idx_doc in idx_docs:
        features = np.zeros((1, max_len), np.int64)
        vec = np.array(idx_doc[:max_len])
        features[0, :vec.shape[0]] = vec
        rows.append(csr_matrix(features))
        bar.next()
    bar.finish()

    # Stack once at the end: stacking inside the loop re-copies every
    # previous row on each iteration (quadratic in the number of documents).
    feat_flatten = vstack(rows, format='csr')
    return feat_flatten, vectorizer
def saveAverageImage(kitti_base, pos_labels, shape, fname, avg_num=None):
    """Average the edge maps of up to ``avg_num`` labelled samples and save it.

    Each label is cropped with ``getCroppedSampleFromLabel``, resized with
    ``resizeSample``, passed through ``auto_canny`` and accumulated with
    weight ``1 / avg_num``.  The result is written to ``fname`` with
    ``cv2.imwrite``.

    :param kitti_base: passed through to ``getCroppedSampleFromLabel``
    :param pos_labels: sequence of labels to average over
    :param shape: shape of the accumulator image (and resize target)
    :param fname: output image path; also shown in the progress bar title
    :param avg_num: maximum number of samples to use; ``None`` means all
    """
    num_images = float(len(pos_labels))
    # The None check must come first: min(None, x) raises on Python 3 and
    # only worked by accident of Python 2's ordering rules.
    if avg_num is None:
        avg_num = num_images
    else:
        avg_num = min(avg_num, num_images)

    avg_img = np.zeros(shape, np.float32)
    # The bar counts the samples actually processed, not all labels.
    progressbar = ProgressBar('Averaging ' + fname, max=avg_num)
    for num, label in enumerate(pos_labels):
        if num >= avg_num:
            break
        progressbar.next()
        sample = getCroppedSampleFromLabel(kitti_base, label)
        resized = resizeSample(sample, shape, label)
        resized = auto_canny(resized)
        resized = np.float32(resized)
        avg_img = cv2.add(avg_img, resized / float(avg_num))
    progressbar.finish()

    cv2.imwrite(fname, avg_img)
def draw_poster(poster_text, textsize, inp):
    '''Draw the transcript lines on a black sheet and save it as output.png.

    Every entry of ``poster_text`` is drawn in white on its own row.  The
    "one small step" quote is highlighted in red wherever it occurs.

    :param poster_text: sequence of text lines, one per row
    :param textsize: font size in pixels; also used as the row height
    :param inp: ``'y'`` to paste the sheet onto a slightly larger bleed
        canvas before saving; anything else saves the sheet as is
    '''
    top_pad = 0.25
    left_pad = 9
    sheet_size = (9933, 14043)  # A1 Size
    bleed_size = (10004, 14114)  # Bleed area for printing
    white = (255, 255, 255, 255)
    red = (255, 0, 0, 255)
    quote = "1969-07-21 02:56:48 CDR (TRANQ) That's one small step for man, one giant leap for mankind."

    # This font needs to be monospaced!
    font = ImageFont.truetype("NotCourierSans.otf", textsize)
    im = Image.new("RGBA", sheet_size, "black")
    draw = ImageDraw.Draw(im)
    print('Drawing text')
    bar = Bar('Processing', max=len(poster_text))
    for i, text in enumerate(poster_text):
        y = int((i + top_pad) * textsize)
        # partition() on the full quote: a line that only carries the
        # timestamp no longer crashes on a missing second half, and any text
        # after the first occurrence is kept intact.
        before, found, after = text.partition(quote)
        if found:
            # NOTE(review): ImageDraw.textsize is absent from recent Pillow
            # releases -- confirm the Pillow version this runs against.
            width_before, _ = draw.textsize(before, font=font)
            width_quote, _ = draw.textsize(quote, font=font)
            draw.text((left_pad, y), before, font=font, fill=white)
            draw.text((left_pad + width_before, y), quote, font=font, fill=red)
            draw.text((left_pad + width_before + width_quote, y), after,
                      font=font, fill=white)
        else:
            draw.text((left_pad, y), text, font=font, fill=white)
        bar.next()
    bar.finish()

    print('Saving image!')
    if inp == 'y':
        # Centre the sheet on the bleed canvas.
        bufferx = (bleed_size[0] - sheet_size[0]) // 2
        buffery = (bleed_size[1] - sheet_size[1]) // 2
        bleed_im = Image.new("RGBA", bleed_size, "black")
        bleed_im.paste(im, (bufferx, buffery))
        bleed_im.save("output.png", "PNG")
    else:
        im.save("output.png", "PNG")
def hydrate(idlist_file="data/example_dataset_tweet_ids.txt"):
    """
    Read a file of tweet IDs (one per line) and load the tweets that are not
    yet stored into the database through the API.

    Prepare to wait quite a bit, depending on the size of the dataset.

    :param idlist_file: path of the text file holding one integer ID per line
    """
    ids_to_fetch = set()
    # ``with`` guarantees the file is closed even if a line fails to parse.
    with open(idlist_file, "r") as id_file:
        for line in id_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Convert to int since that's what the database uses
            ids_to_fetch.add(int(line))

    # IDs we already have do not need to be fetched again.
    ids_in_db = set(t.id for t in database.Tweet.select(database.Tweet.id))
    ids_to_fetch = ids_to_fetch.difference(ids_in_db)
    logging.warning(
        "\nLoaded a list of {0} tweet IDs to hydrate".format(len(ids_to_fetch)))

    bar = Bar('Fetching tweets', max=len(ids_to_fetch), suffix='%(eta)ds')
    for page in rest.fetch_tweet_list(ids_to_fetch):
        bar.next(len(page))
        for tweet in page:
            database.create_tweet_from_dict(tweet)
    bar.finish()
    logging.warning("Done hydrating!")
def main(): infile = raw_input('Input file name: ') if os.path.exists(infile): print '\n[!] Loading PCAP file. Please wait, it might take a while...' ips = sorted(set(p[IP].src for p in PcapReader(infile) if IP in p)) total = len(ips) print '[!] Total number of IP addresses: %d\n' % total bar = Bar('Processing', max=total) for ip in ips: get_data(ip) bar.next() bar.finish() headers = ['IP', 'OWNER','COUNTRY', 'ORGANIZATION','SERVER','DESCRIPTION'] print '\n\n' print tabulate(table,headers,tablefmt='grid') if exceptions: print '\nExceptions:' for e in exceptions: print '*\t%s' % e print '\n\n[!] Done.\n\n' else: print '[!] Cannot find file "%s"\n\tExiting...' % infile sys.exit()
def read_and_gen(lyric_path, file_path):
    """
    Read the lyric file and generate the mp3 sound file.

    Each line whose first character satisfies ``is_alphabet`` is spoken in
    English, each line whose first character satisfies ``is_chinese`` is
    spoken in Chinese; both checks are applied independently.

    :param lyric_path: UTF-8 text file to read, one utterance per line
    :param file_path: output sound file; removed first if it already exists
    :return: None
    """
    # Start from a clean output file before appending new content.
    if os.path.exists(file_path):
        os.remove(file_path)

    with open(lyric_path, encoding="utf-8") as lyric_file:
        lines = lyric_file.readlines()

    bar = Bar('Processing', max=len(lines))
    for line in lines:
        first_char = line[0]
        if is_alphabet(first_char):
            gtts_extends(line, lang='en').sequence_save(file_path)
        if is_chinese(first_char):
            gtts_extends(line, lang='zh').sequence_save(file_path)
        bar.next()
    bar.finish()
    print("transform success!")
def pipeline_pos(titles, descriptions, tags):
    """Part-of-speech tag the textual context of every item.

    For item ``i`` the context consists of the title split on spaces, each
    sentence of the description (per ``sent_tokenize``) split on spaces, and
    the tag list.  All pieces are tagged with ``nltk.pos_tag_sents`` and the
    per-piece results are flattened into one list.

    :return: list with one flat list of tagging results per item
    """
    def preprocess(inpt):
        # Currently a pass-through hook.
        return inpt

    bar = Bar('Extracting features...', max=len(titles))
    pos_collection = []
    for i, raw_title in enumerate(titles):
        context = []

        title_words = preprocess(raw_title.split(' '))
        if title_words:
            context.append(title_words)

        for sentence in sent_tokenize(descriptions[i]):
            sentence_words = preprocess(sentence.split(' '))
            if sentence_words:
                context.append(sentence_words)

        item_tags = preprocess(tags[i])
        if item_tags:
            context.append(item_tags)

        tagged = nltk.pos_tag_sents(context)
        pos_collection.append(list(itertools.chain.from_iterable(tagged)))
        bar.next()
    bar.finish()
    return pos_collection
def parse(self, dataset):
    """
    Annotate every token of every part with features taken from ``self.nlp``.

    For each sentence the pipeline output is matched to the part's own tokens
    by position; each token gets a ``features`` dict and is appended to
    ``part.tokens``, then ``_dependency_path`` is run for every parsed token.
    After a part's sentences are done its entity / score / head-token
    post-processing methods are called.  Finally, if
    ``self.constituency_parser`` equals ``True``, ``self.parser`` parses the
    dataset as well.

    :type dataset: nala.structures.data.Dataset
    """
    part_count = len(list(dataset.parts()))
    outer_bar = Bar('Processing [SpaCy]', max=part_count)

    for part in dataset.parts():
        for index, sentence in enumerate(part.get_sentence_string_array()):
            doc = self.nlp(sentence)

            for token in doc:
                own_token = part.sentences[index][token.i]
                features = {}
                features['id'] = token.i
                features['pos'] = token.tag_
                features['dep'] = token.dep_
                features['lemma'] = token.lemma_
                features['prob'] = token.prob
                features['is_punct'] = token.is_punct
                features['is_stop'] = token.is_stop
                features['cluster'] = token.cluster
                # Placeholders set to None / [] / False here.
                features['dependency_from'] = None
                features['dependency_to'] = []
                features['is_root'] = False
                own_token.features = features
                part.tokens.append(own_token)

            for parsed in doc:
                self._dependency_path(parsed, index, part)

        part.percolate_tokens_to_entities()
        part.calculate_token_scores()
        part.set_head_tokens()
        outer_bar.next()
    outer_bar.finish()

    if self.constituency_parser == True:
        self.parser.parse(dataset)
class Closest(object):
    """Row-wise callable that fills ``cols`` from matching rows of a frame.

    Intended for ``DataFrame.apply(..., axis=1)``-style use: each call ticks
    a progress bar, looks up the rows of ``data`` with the same
    ``restaurant_id`` and a ``date`` not later than the row's, and overwrites
    the row's ``cols`` with the column-wise sum of those rows.  When no such
    row exists, the column-wise mean over all rows of that restaurant is used
    instead.  NaNs are replaced by 0.
    """

    # Class-level defaults; __init__ replaces them per instance.
    data = pd.DataFrame()
    cols = []
    bar = None

    def __init__(self, df, cols, size):
        """
        :param df: frame with ``restaurant_id``, ``date`` and ``cols`` columns
        :param cols: list of column names to fill in
        :param size: number of expected calls (progress bar maximum)
        """
        self.data = df
        self.cols = cols
        self.bar = Bar(message="Compressing Time", max=size,
                       suffix="%(percent)d%% (%(index)d/%(max)d) ETA %(eta_td)s")

    def __call__(self, row):
        """Return ``row`` with ``cols`` replaced as described on the class."""
        self.bar.next()
        same_restaurant = self.data.restaurant_id == row.restaurant_id
        found = self.data[same_restaurant & (self.data.date <= row.date)]
        if found.shape[0] == 0:
            # FIXME Do something smarter than averaging?
            found = self.data[same_restaurant][self.cols].mean()
        else:
            found = found[self.cols].sum()
        # FIXME Sometimes NaNs appear if I am missing the restaurant ID. What to do?
        found = found.fillna(0)
        row[self.cols] = found
        return row

    def __del__(self):
        # The bar is still None when __init__ never ran or failed early;
        # finishing it unconditionally would raise during destruction.
        if self.bar is not None:
            self.bar.finish()
def keyadd(name): bar = Bar('Processing', max=5) try: bar.next() nova('keypair-add', '--pub-key', '~/.ssh/id_rsa.pub', '%s' % name) except: # print "Key add error on %s" % name bar.next() try: bar.next() # print "Tryig to delete key" result = nova('keypair-delete', '%s' % name) # print result # print "Tryig to add key" bar.next() results = nova('keypair-add', '--pub-key', '~/.ssh/id_rsa.pub', '%s' % name) except: # print result print ''' Key deletion error on %s ''' % name bar.next() bar.finish() result = nova('keypair-list') print result
def torcURL(address, filename):
    """Fetch ``address`` through the local SOCKS5 proxy and return the body.

    The request goes through ``127.0.0.1:SOCKS_PORT`` with hostname
    resolution done by the proxy (``PROXYTYPE_SOCKS5_HOSTNAME``).

    :param address: URL to fetch
    :param filename: only used in the announcement message; nothing is
        written to it (the original file-writing code sat after the
        ``return`` and was never reached)
    :return: the response body as bytes on success; an
        ``"Unable to reach ..."`` string on a pycurl error; ``None`` after
        ``UnknownError()`` has been called for any other exception
    """
    print('cURL on ' + address + ' to ' + filename + '\n')

    # Purely cosmetic: the bar is filled before the request is performed.
    bar = Bar('Running', max=100)
    for _ in range(100):
        bar.next()
    bar.finish()

    # One handle is enough; building a fresh one per bar tick leaked 99
    # handles that were never used or closed.
    output = io.BytesIO()
    handle = pycurl.Curl()
    handle.setopt(pycurl.URL, address)
    handle.setopt(pycurl.PROXY, '127.0.0.1')
    handle.setopt(pycurl.PROXYPORT, SOCKS_PORT)
    handle.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5_HOSTNAME)
    handle.setopt(pycurl.WRITEFUNCTION, output.write)

    try:
        handle.perform()
        return output.getvalue()
    except KeyboardInterrupt:
        raise
    except pycurl.error as e:
        return "Unable to reach %s (%s)" % (address, e)
    except Exception:
        UnknownError()
    finally:
        handle.close()
def gradient_descent(X, Y, iter, alpha): (rows, cols) = X.shape Xt = X.T w = numpy.zeros((len(Xt), 1)) print w.shape bar = Bar('iterations', max=iter) for i in range(0, iter): pw = w dw = 2*matrix.dot(matrix.dot(Xt,X), w) - matrix.dot(Xt, Y) # if (True): # # print "alpha " + str(alpha) # # print "E is " + str(dw.T.dot(dw).sum()) # # print dw # print w w = w - alpha*dw/rows diff =numpy.absolute(w-pw).sum() print "Diff is %f " % diff if (diff < 0.000001): bar.finish() return w # raw_input() bar.next() bar.finish() return w
def main(args): d = json.load(open(args.c, 'r')) np.random.seed(1234) im2id = {} id2cap = {} print 'img 2 id....' for im in d['images']: im2id[im['file_name']] = im['id'] bar = Bar('id 2 cap...', max=len(d['annotations'])) for ann in d['annotations']: cap = nltk.word_tokenize(ann['caption']) cap = ' '.join(cap).lower() if ann['image_id'] in id2cap: id2cap[ann['image_id']].append(cap) else: id2cap[ann['image_id']] = [cap] bar.next() bar.finish() with open(args.s, 'r') as f: images = f.read().split() refs = [] for im in images: refs.append('<>'.join(id2cap[im2id[im]])) with open(args.saveto, 'w') as f: print >>f, '\n'.join(refs)
def evaluate(train_file_path, test_num, tagger, output_file_path): sents = parse_train_data(train_file_path) test_start = len(sents) - test_num - 1 test_data = sents[test_start:len(sents)-1] train_data = sents[0:test_start+1] print 'Training with {0} sentences'.format(len(train_data)) tagger.train(train_data) output = open(output_file_path, 'w') correct = 0 total = 0 bar = Bar('Testing with {0} sentences'.format(len(test_data)), max=len(test_data)) for s in test_data: tagged = tagger.tag(remove_tags(s)) # evaluate correct += evaluate_sentence(s, tagged) total += len(tagged) # write words = [] for t in tagged: words.append(t[0] + '_' + t[1]) output.write('\t'.join(words) + '\n') bar.next() bar.finish() output.close() return correct / float(total) * 100
def get_list(filename):
    """
    Creates an array of objects out of input training file
    ==================================
    Each line (without its newline) is split on tabs and wrapped in a
    ``files`` object, on which the word-count (5), POS, punctuation and
    vector setters are called in that order.
    ==================================
    Returns:
    * array of objects where each object corresponds to a document
    ==================================
    """
    with open(filename) as handle:
        lines = handle.readlines()

    bar = Bar("Processing", max=len(lines),
              suffix='%(percent)d%% | %(index)d of %(max)d | %(eta)d seconds remaining.')
    documents = []
    for raw_line in lines:
        fields = raw_line.split('\n')[0].split('\t')
        document = files(fields)
        document.set_word_count(5)
        document.set_pos_features()
        document.set_punctuation_features()
        document.set_vectors()
        documents.append(document)
        bar.next()
    bar.finish()
    return documents
def set_image_objects(self):
    """Query the image service once per sensor and collect all features.

    Four ``where`` clauses (one per sensor / date range, all limited to
    cloud cover <= 20) are sent through ``self._query`` together with the
    instance's bounding box.  The ``features`` of every response are
    concatenated in query order.

    :return: list of feature objects from all four queries
    """
    landsat8 = "(acquisitionDate >= date'2013-01-01' AND acquisitionDate <= date'2016-12-31') AND (dayOfYear >=1 AND dayOfYear <= 366) AND (sensor = 'OLI') AND (cloudCover <= 20)"
    landsat7 = "(acquisitionDate >= date'2003-01-01' AND acquisitionDate <= date'2016-12-31') AND (dayOfYear >=1 AND dayOfYear <= 366) AND (sensor = 'ETM_SLC_OFF') AND (cloudCover <= 20)"
    landsat4_5 = "(acquisitionDate >= date'1982-01-01' AND acquisitionDate <= date'2011-12-31') AND (dayOfYear >=1 AND dayOfYear <= 366) AND (sensor = 'TM') AND (cloudCover <= 20)"
    landsat1_5 = "(acquisitionDate >= date'1972-01-01' AND acquisitionDate <= date'2013-12-31') AND (dayOfYear >=1 AND dayOfYear <= 366) AND (sensor = 'MSS') AND (cloudCover <= 20)"
    queries_name = ["landsat8", "landsat7", "landsat4_5", "landsat1_5"]
    queries = [landsat8, landsat7, landsat4_5, landsat1_5]

    geometry = self.bounding_box["geometry"]
    wkid = geometry["spatialReference"]["wkid"]

    obj = []
    for name, where in zip(queries_name, queries):
        parms = {
            "f": "json",
            "where": where,
            "geometry": geometry,
            "returnGeometry": "false",
            "spatialRel": "esriSpatialRelIntersects",
            "geometryType": "esriGeometryEnvelope",
            "inSR": wkid,
            "outSR": wkid,
            "outFields": "*",
            "orderByFields": "dayOfYear"
        }
        query = self._query(parms)
        features = query["features"]
        # The bar ticks once per feature, so its maximum is the number of
        # features of this response (it used to be the number of queries).
        bar = Bar("Requesting data: " + name, max=len(features))
        for feature in features:
            obj.append(feature)
            bar.next()
        bar.finish()
    return obj
def tokenize_proteins(data, msg='Processing proteins'):
    """Split pose lines into actives (grouped per ligand) and decoys.

    Every line is converted with ``posedict``.  A pose whose ``label`` is 1
    is an active: it gets an ``id`` of the form ``<ligand>-<line index>`` and
    is appended to that ligand's list.  Any other pose is a decoy.

    args:
        @data list of string lines containing pose data
        @msg string message to display in progress bar

    returns:
        @actives OrderedDict mapping ligand -> list of its active poses
        @decoys list of all decoy poses gathered from data
    """
    actives = OrderedDict()
    decoys = []

    bar = Bar(msg, max=len(data))
    for position, line in enumerate(data):
        bar.next()
        pose = posedict(line)
        if pose['label'] == 1:
            # Active pose: tag it and file it under its ligand.
            ligand = pose['ligand']
            pose['id'] = ligand + '-' + str(position)
            if ligand not in actives:
                actives[ligand] = []
            actives[ligand].append(pose)
        else:
            # Decoy pose.
            decoys.append(pose)
    bar.finish()
    print("")
    return actives, decoys
def main(argv): args = argparser.parse_args() print >> sys.stderr, '# Start: Keyword Data: %s, %s, %s, %s' % (args.cc, args.week, args.pages, datetime.datetime.now().time().isoformat()) ga, gsc = initialize_service(argv, "analytics"), initialize_service(argv, "webmasters") print '"%s"\t"%s"\t"%s"\t"%s"\t"%s"\t"%s"\t"%s"\t"%s"\t"%s"\t"%s"' % ("cc", "website", "url", "date", "keyword", "impressions", "clicks", "ctr", "position", "sessions (week)") bar = Bar('Processing', max=args.pages, suffix ='%(percent).1f%% - %(eta)ds') for website in GA_IDS[args.cc]: urls = get_top_landing_pages(ga, args.cc, website, args.week, args.pages) for row in urls: data = [] # we switched from http to https between week 3 and 4 if (args.week <= 4 and args.cc != 'VN') or website != "IPRICE": data.extend(get_keyword_data(gsc, args.cc, website, args.week, row[0][1:], "http")) if (args.week >=3 or args.cc == 'VN') and website == "IPRICE": data.extend(get_keyword_data(gsc, args.cc, website, args.week, row[0][1:], "https")) output(args.cc, website, row[0], row[1], data) bar.next() bar.finish() print >> sys.stderr, '# End: Keyword Data: %s, %s, %s, %s' % (args.cc, args.week, args.pages, datetime.datetime.now().time().isoformat())
def clean(): """kills all the instances with prefix prefix-* and in error state""" global servers global bar try: _refresh_servers() if len(servers) == 0: print 'Found 0 instances to kill' else: for index in servers: server = servers[index] if server['status'] == 'ERROR': list.append(index) names.append(server['name']) print 'Starting parallel Delete' bar = Bar('Deleting', max=len(servers) + 3) bar.next() pool = Pool(processes=maxparallel) bar.next() result = pool.map(_del_server, list) bar.next() bar.finish() except: print 'Found 0 instances with status error to kill' menu()
def kill(): """kills all the instances with prefix prefix-*""" global servers global bar try: list = [] names = [] _refresh_servers() if len(servers) == 0: print 'Found 0 instances to kill' else: for index in servers: server = servers[index] print 'Found %(name)s to kill' % server list.append(index) names.append(server['name']) bar = Bar('Deleting', max=len(servers) + 3) bar.next() pool = Pool(processes=maxparallel) bar.next() result = pool.map(_del_server, list) bar.next() bar.finish() except: # print e print 'Found 0 instances to kill' menu()
def main(argv): args = argparser.parse_args() print >> sys.stderr, '# Start: Matching: %s' % (datetime.datetime.now().time().isoformat()) masterbrain = read(args.masterbrain) keywords = read(args.keywords) bar = Bar('Processing', max=len(masterbrain), suffix ='%(percent).1f%% - %(eta)ds') regex = {} for keyword in keywords: regex[keyword] = re.compile(r'\b({0})\b'.format(keyword)) matches = 0 for string in masterbrain: for keyword in keywords: if regex[keyword].search(string): matches = matches + 1 print 1, "\t", string, "\t", keyword break else: print 0, "\t", string bar.next() bar.finish() print matches, "/", len(masterbrain) print >> sys.stderr, '# End: Matching: %s' % (datetime.datetime.now().time().isoformat())
def main(argv): args = argparser.parse_args() print >> sys.stderr, '# Start: Adwords Data: %s, %s' % (args.cc, datetime.datetime.now().time().isoformat()) service = initialize_service() keywords = read_file(args.file) print '"%s"\t"%s"\t"%s"\t"%s"' % ("keyword", "sv (month)", "competition", "cpc ($)") bar = Bar('Processing', max=len(keywords), suffix ='%(percent).1f%% - %(eta)ds') if args.stats: # pagination of 800 items kws = keywords while len(kws) > 0: page = kws[0:PAGE_SIZE] kws = kws[PAGE_SIZE:] output(query_adwords(service, args.cc, page, "STATS")) bar.next(len(page)) elif args.ideas: # pagination of 1 item for kw in keywords: output(get_keyword_suggestions(service, args.cc, "IDEAS")) bar.next() bar.finish() print >> sys.stderr, '# End: Adwords Data: %s, %s' % (args.cc, datetime.datetime.now().time().isoformat())
def average_image(pos_region_generator, shape, avg_num=None):
    """Return the average edge map of up to ``avg_num`` regions.

    Every used region is loaded cropped and resized to
    ``(shape[1], shape[0])``, run through ``auto_canny``, converted to
    float32 and accumulated with weight ``1 / avg_num``.

    :param pos_region_generator: iterable of regions offering
        ``load_cropped_resized_sample(dims)``; it is consumed completely
    :param shape: shape of the accumulator image
    :param avg_num: maximum number of regions to use; ``None`` means all
    :return: float32 array of the given shape
    """
    regions = list(pos_region_generator)
    available = float(len(regions))
    avg_num = available if avg_num is None else min(avg_num, available)

    window_dims = (shape[1], shape[0])
    avg_img = np.zeros(shape, np.float32)

    progressbar = ProgressBar('Averaging ', max=avg_num)
    for used, region in enumerate(regions):
        if used >= avg_num:
            break
        progressbar.next()
        edges = auto_canny(region.load_cropped_resized_sample(window_dims))
        contribution = np.float32(edges) / float(avg_num)
        avg_img = cv2.add(avg_img, contribution)
    progressbar.finish()
    return avg_img
def getUsers(hubname):
    """Download the subscriber names of a hub into ``data/hubs/<hubname>``.

    Walks the pages ``<hub url>/subscribers/rating/page<N>`` from 1 to the
    last page number, extracts the text of every ``username`` element inside
    elements of class ``"user "`` and writes one name per line.  Progress is
    appended to ``HubAnalyzer.logfile``.

    Does nothing when the output file already exists, unless
    ``HubAnalyzer.enforce_download_in_presence_of_data`` is set.

    :param hubname: hub identifier; also the name of the output file
    :return: None
    :raises Exception: if the last page number cannot be determined or a
        page request fails; the partial output file is removed first
    """
    output_filename = "data/hubs/" + hubname

    # ``with`` closes the log on every exit path, including the early
    # return and the raises below.
    with open(HubAnalyzer.logfile, "a") as log:
        print("hub: " + hubname + " ----------------- ", file=log)
        print(time.strftime("%H:%M:%S"), file=log)

        url = HubAnalyzer.hubname2link(hubname)

        # if data is here, do nothing
        if os.path.isfile(output_filename) and not HubAnalyzer.enforce_download_in_presence_of_data:
            print("data is already here, abort this url", file=log)
            return None

        try:
            with open(output_filename, "w") as output_file:
                try:
                    last_page_num = int(HubAnalyzer.getLastPageNumber(url))
                except Exception as err:
                    print("URL is broken, abort the url", file=log)
                    log.flush()
                    raise Exception("Cannot analyze the page, please, check the url below: \n" + url) from err

                # get connection to habrahabr-hub
                userlist_url = url + "/subscribers/rating/page"
                http = urllib3.PoolManager()

                bar = None
                if HubAnalyzer.report_downloading_progress:
                    HubAnalyzer.get_hub_description(hubname)
                    bar = Bar("Downloading: " + hubname, max=last_page_num, suffix="%(percent)d%%")

                for i in range(1, last_page_num + 1):
                    user_page = userlist_url + str(i)
                    print(user_page, file=log)
                    log.flush()
                    try:
                        response = http.request("GET", user_page)
                    except urllib3.exceptions.HTTPError as err:
                        # urllib3's HTTPError has no ``code`` attribute of
                        # its own, so read it defensively.
                        if getattr(err, "code", None) == 404:
                            print(user_page + " !! 404 !!", file=log)
                            # A bare string cannot be raised; wrap it.
                            raise Exception("Hub is not found, please, check the url") from err
                        print(user_page + " PARSING ERROR ", file=log)
                        raise Exception("Error: cannot parse the page!") from err

                    soup = BeautifulSoup(response.data)
                    for userRow in soup.find_all(class_="user "):
                        username = userRow.find(class_="username").text
                        print(username, file=output_file)
                    output_file.flush()
                    if bar is not None:
                        bar.next()

                # finalize
                if bar is not None:
                    bar.finish()
        except Exception:
            # Never leave a partial file behind: it would be mistaken for
            # complete data on the next run.  The file is already closed.
            if os.path.exists(output_filename):
                os.remove(output_filename)
            raise
def main(): attempt = 0 data = None while not data and attempt < 3: attempt += 1 try: request = urllib2.Request('http://openweathermap.org/help/city_list.txt') response = urllib2.urlopen(request) data = response.readlines() except: print "failed %d times, trying again" % attempt if not data: print "the program fail, please check your internt and access the program again" sys.exit() country_initials = str() firstline = True count = 0 length = list() length = len(data) bar = Bar('Processing', max=length) for line in data: # Appending each line that the country initials are equal to the user input if firstline: firstline = False continue count += 1 country_initials = line[-3] + line[-2] elements = line.split() city = " ".join(elements[1:-3]) city = city.strip().replace(" ", "-") connection = pymysql.connect(host='localhost', user='******', password='******', db='weather', cursorclass=pymysql.cursors.SSCursor) with connection.cursor() as cursor: try: cursor.execute("select count(*) from weather.cities where name = %s", city) city_exists = cursor.fetchone()[0] if city_exists == 1: cursor.execute( "select TIMESTAMPDIFF(minute,(select last_updated from weather.cities where name = %s), now())", city) time_dif = cursor.fetchone()[0] if time_dif > 60: update_attributes(city, country_initials) bar.next() else: bar.next() continue else: update_attributes(city, country_initials) bar.next() except UnicodeEncodeError: pass connection.commit() bar.finish()
def editorial_publish(guides, endpoint, function_class, user_agent, nailgun_bin, content_generator):
    """
    takes care of publishing the editorial content for the guides.

    For every guide file: load its JSON, derive the city search info, resolve
    a resource URI against ``endpoint``, infer the article URLs, generate the
    editorial content and insert it into the guide.  A guide for which any
    step yields nothing is logged and skipped.

    :param guides: paths of the guide JSON files
    :param endpoint: passed to ``cityres.cityres``
    :param function_class: passed to ``editorial_content``
    :param user_agent: passed to ``editorial_content``
    :param nailgun_bin: passed to ``nailguninit``
    :param content_generator: passed to ``nailguninit``
    :return: True if at least one guide was skipped, else False
    """
    # init the nailgun thing for ed content generation.
    nailguninit(nailgun_bin, content_generator)

    pbar = Bar('extracting editorial content for guides:', max=len(guides)+1)
    pbar.start()
    error = False
    for guide in guides:
        with open(guide, 'r') as g:
            jsonguide = json.load(g)
        if not jsonguide:
            # format() needs the guide; without an argument it raised
            # IndexError instead of logging.
            logging.error('could not load json from {0}'.format(guide))
            error = True
            continue

        search = cityinfo.cityinfo(jsonguide)
        uri = cityres.cityres(search, endpoint)
        if not uri:
            logging.error(
                'no dbpedia resource was found for {0}'.format(guide))
            error = True
            continue

        urls = urlinfer.urlinferdef([unquote(uri)])
        if len(urls) < 1:
            logging.error('no wikipedia/wikivoyage urls found/inferred'
                          ' for resource {0}'.format(uri))
            error = True
            continue

        content = editorial_content(urls, function_class, user_agent)
        if not content:
            logging.error('no editorial content could be'
                          ' generated for {0}'.format(guide))
            error = True
            continue

        # insert the content into the guide
        jsonsert.jsonsert(content, guide)
        logging.info('editorial content for {0} sucessfully'
                     ' inserted.'.format(guide))
        pbar.next()
    pbar.finish()
    return error
def evolve(self, population, cxpb, mutpb, mutfq, ngen, goal):
    """Run an elitist genetic algorithm and return the fittest individual.

    Each generation keeps a clone of the current best individual, fills the
    rest by tournament selection (size 10), applies crossover to neighbouring
    pairs and mutation to single individuals, and re-evaluates every
    individual whose fitness was invalidated.

    :param population: list of individuals with a ``fitness`` attribute
    :param cxpb: crossover probability (also passed to ``self.crossover``)
    :param mutpb: mutation probability per individual
    :param mutfq: passed to ``self.mutate``
    :param ngen: maximum number of generations
    :param goal: stop as soon as the best fitness exceeds this value
    :return: the best individual of the final population
    """
    # Cheapest classifier.
    clf = LinearRegression(normalize=True)

    # Evaluate and assign fitnesses of the starting population.
    fitness_list = [self.evaluate(ind, clf) for ind in population]
    for individual, fitness in zip(population, fitness_list):
        individual.fitness.values = fitness
    best = max(population, key=lambda x: x.fitness.values[0])

    # So that we know things are happening.
    bar = Bar('Evolving', max=ngen)

    for gen in xrange(ngen):
        if best.fitness.values[0] > goal:
            break

        # Select the next generation of individuals (elite first).
        offspring = [best]
        offspring += tools.selTournament(population, len(population)-1, 10)
        offspring = [self.toolbox.clone(ind) for ind in offspring]

        # Apply crossovers to staggered pairs.
        for child_a, child_b in zip(offspring[::2], offspring[1::2]):
            if random.random() < cxpb:
                self.crossover(child_a, child_b, cxpb)
                del child_a.fitness.values
                del child_b.fitness.values

        # Apply mutations.
        for child in offspring:
            if random.random() < mutpb:
                self.mutate(child, mutfq)
                del child.fitness.values

        # Re-evaluate the changed individuals themselves.  (Evaluating the
        # old population here handed the children unrelated fitnesses.)
        new_children = [e for e in offspring if not e.fitness.valid]
        for child in new_children:
            child.fitness.values = self.evaluate(child, clf)

        # Replace the population first, then pick the best of the *new*
        # generation; otherwise the result lags one generation behind.
        population = offspring
        best = max(population, key=lambda x: x.fitness.values[0])

        bar.next()

    # Done! Return the most fit evolved individual.
    bar.finish()
    return best
def do_epoch(mode, epoch, skipped=0): # mode is 'train' or 'test' y_true = [] y_pred = [] avg_loss = 0.0 prev_time = time.time() batches_per_epoch = dmn.get_batches_per_epoch(mode) if mode=="test": batches_per_epoch=min(1000,batches_per_epoch) bar=Bar('processing',max=batches_per_epoch) for i in range(0, batches_per_epoch): step_data = dmn.step(i, mode) prediction = step_data["prediction"] answers = step_data["answers"] current_loss = step_data["current_loss"] current_skip = (step_data["skipped"] if "skipped" in step_data else 0) log = step_data["log"] skipped += current_skip if current_skip == 0: avg_loss += current_loss for x in answers: y_true.append(x) for x in prediction.argmax(axis=1): y_pred.append(x) # TODO: save the state sometimes if (i % args.log_every == 0): cur_time = time.time() #print (" %sing: %d.%d / %d \t loss: %.3f \t avg_loss: %.3f \t skipped: %d \t %s \t time: %.2fs" % # (mode, epoch, i * args.batch_size, batches_per_epoch * args.batch_size, # current_loss, avg_loss / (i + 1), skipped, log, cur_time - prev_time)) prev_time = cur_time if np.isnan(current_loss): print "==> current loss IS NaN. This should never happen :) " exit() bar.next() bar.finish() avg_loss /= batches_per_epoch print "\n %s loss = %.5f" % (mode, avg_loss) print "confusion matrix:" print metrics.confusion_matrix(y_true, y_pred) accuracy = sum([1 if t == p else 0 for t, p in zip(y_true, y_pred)]) print "accuracy: %.2f percent" % (accuracy * 100.0 / batches_per_epoch / args.batch_size) if len(accuracies)>0 and accuracies[-1]>accuracy: dmn.lr=dmn.lr*args.learning_rate_decay accuracies.append(accuracy) return avg_loss, skipped
def save_regions(reg_gen, num_regions, window_dims, save_dir):
    """Write the first ``num_regions`` regions as numbered PNG files.

    Each region is loaded cropped and resized to ``window_dims`` and saved in
    ``save_dir`` as ``000000.png``, ``000001.png``, ...

    :param reg_gen: iterable of regions offering
        ``load_cropped_resized_sample(dims)``
    :param num_regions: maximum number of regions to save
    :param window_dims: size passed to ``load_cropped_resized_sample``
    :param save_dir: directory receiving the images
    """
    progressbar = ProgressBar('Saving regions', max=num_regions)
    selected = itertools.islice(reg_gen, 0, num_regions)
    for index, img_region in enumerate(selected):
        target = os.path.join(save_dir, '{:06d}.png'.format(index))
        cv2.imwrite(target, img_region.load_cropped_resized_sample(window_dims))
        progressbar.next()
    progressbar.finish()
def get_stale_files(self, media_files):
    """Return the media files for which the database check came back falsy.

    Every file is passed, together with the models that have file fields, to
    ``self.remove_file_if_not_exists_in_db``; files for which that call
    returns a falsy value are collected in input order.

    :param media_files: sized iterable of media files
    :return: list of the collected files
    """
    models = self.get_django_models_with_file_fields()
    progress = Bar('Analyzing media files', max=len(media_files))
    stale = []
    for candidate in media_files:
        handled = self.remove_file_if_not_exists_in_db(candidate, models)
        if not handled:
            stale.append(candidate)
        progress.next()
    progress.finish()
    return stale
def get_darwin_dataset(img_dir, train_val, max_images=10):
    """Build the list of dataset records for one split.

    Reads ``<img_dir>/<train_val>/<train_val>.json`` and creates one record
    per image with its file name, index, size and annotations.  Every
    annotation is converted with ``convert_to_rle`` into a segmentation and
    an absolute XYXY bounding box.

    :param img_dir: dataset root; images are read from ``<img_dir>/images``
    :param train_val: split name; selects the JSON file to load
    :param max_images: only the first ``max_images`` entries are used.  The
        default of 10 keeps the previous hard-coded behaviour; pass ``None``
        to import everything
    :return: list of record dicts
    :raises IOError: if an image file cannot be read
    """
    json_file = os.path.join(img_dir, train_val, train_val + ".json")
    with open(json_file) as f:
        imgs = json.load(f)
    imgs = imgs[:max_images]

    dataset_dicts = []
    bar = Bar('Importing Dataset', max=len(imgs))
    for idx, img in enumerate(imgs):
        filename = os.path.join(img_dir, 'images', img["image"]["original_filename"])
        image = cv2.imread(filename)
        if image is None:
            # imread signals failure with None; fail with the file name
            # instead of an AttributeError on ``.shape``.
            raise IOError("cannot read image: {}".format(filename))
        height, width = image.shape[:2]

        objs = []
        for anno in img["annotations"]:
            poly, bbox = convert_to_rle(anno, height, width)
            objs.append({
                "bbox": bbox,
                "bbox_mode": BoxMode.XYXY_ABS,
                "segmentation": poly,
                "category_id": 0,  # change in the future for more than one category
            })

        dataset_dicts.append({
            "file_name": filename,
            "image_id": idx,
            "height": height,
            "width": width,
            "annotations": objs,
        })
        bar.next()
    bar.finish()
    return dataset_dicts
def update(self, _entries, progress=True):
    """Insert or replace entries in their ``<cloud>-<kind>`` collections.

    Each entry must carry a ``cm`` dict with ``cloud``, ``kind`` and
    ``name``.  An existing document with the same three values is replaced,
    keeping its stored ``cm`` fields unless the new entry overrides them and
    refreshing ``cm.modified``; otherwise the entry is inserted with
    ``cm.created`` and ``cm.modified`` set to now.  Database errors are
    reported through ``Console.error`` and do not abort the loop.

    :param _entries: a single entry dict or a list of entry dicts
    :param progress: show a progress bar when True
    :return: list of the processed entries (as modified)
    :raises ValueError: if an entry has no ``cm`` attribute
    """
    MongoDBController().start_if_not_running()

    # isinstance() so that dict subclasses also count as a single entry
    # (``type(x) == dict`` treated them as a list of keys).
    if isinstance(_entries, dict):
        entries = [_entries]
    else:
        entries = _entries

    bar = None
    if progress:
        bar = Bar('Cloudmesh Database Update', max=len(entries))

    result = []
    for entry in entries:
        if bar is not None:
            bar.next()
        if 'cm' not in entry:
            print("UPDATE ERROR")
            VERBOSE(entry)
            raise ValueError("The cm attribute is not in the entry")
        entry['cm']['collection'] = "{cloud}-{kind}".format(**entry["cm"])

        # noinspection PyUnusedLocal
        try:
            self.col = self.db[entry['cm']['collection']]
            # kind / cloud / name identify a document; merging the stored
            # cm below never changes them, so one selector serves both
            # the lookup and the replace.
            selector = {
                "cm.kind": entry["cm"]["kind"],
                "cm.cloud": entry["cm"]["cloud"],
                "cm.name": entry["cm"]["name"]
            }
            old_entry = self.col.find_one(selector)

            if old_entry is not None:
                cm = dict(old_entry['cm'])
                cm.update(entry['cm'])
                cm['modified'] = str(datetime.utcnow())
                entry['cm'] = cm
                self.col.replace_one(selector, entry, upsert=True)
            else:
                entry['cm']['created'] = entry['cm']['modified'] = str(
                    datetime.utcnow())
                self.col.insert_one(entry)
        except Exception as e:
            Console.error(
                "uploading document\n{entry}\n-------\n\n".format(
                    entry=str(entry)))

        result.append(entry)

    if bar is not None:
        bar.finish()
    return result
def SVDC_heatmap_generator(df, period_of_interest, prediction_year=2012,
                           epidemic_classification_dict=None, training_year_window='ALL', t0_vector=None,
                           p_vector=None, classifier='SVM', modes=[0], add_peaks=False,
                           add_runoff_binary=False, verbose=False, variables=['precip', 'temp']):
    '''
    Build a (len(p_vector) x len(t0_vector)) grid telling, for every period
    length p and start date t0, whether an SVD + classifier pipeline predicts
    the epidemic class of ``prediction_year`` correctly.

    - df: dataframe whose index entries expose a ``.year`` attribute.
    - period_of_interest: (initial date, final date) of the period of
      interest (poi). Its first element is passed to the reshaping helpers as
      ``upper_bound``.
    - prediction_year: year whose classification is predicted.
    - epidemic_classification_dict: dictionary mapping year to class,
      e.g. {2001: 1, 2002: 0, 2003: 1}.
    - training_year_window: 'ALL' or an int smaller than the number of years
      available before ``prediction_year``.
    - t0_vector, p_vector: start dates and period lengths spanning the grid.
    - classifier: 'svm' or 'forest' (case-insensitive).
    - modes: indices of the SVD modes used as features.
    - add_peaks / add_runoff_binary: append the output of SVDC_get_apfs /
      SVDC_get_runoffbinary as extra feature columns.
    - variables: dataframe columns fed to the SVD.

    Returns the grid: 1 where the prediction matched, 0 where it did not and
    -1 where no data matrix could be built for that (p, t0).

    Raises ValueError for an unusable training window or unknown classifier.
    '''
    classifier_name = classifier.lower()
    if classifier_name not in ('svm', 'forest'):
        raise ValueError(
            "Unknown classifier: {0}. Use 'svm' or 'forest'.".format(classifier))

    # Generate grid based on p and t0 vectors
    distance_grid = np.zeros([len(p_vector), len(t0_vector)])

    years = sorted({timestamp.year for timestamp in df.index})
    years_before_prediction = years.index(prediction_year)

    if training_year_window == 'ALL':
        training_years = years[0:years_before_prediction]
        n_years = years_before_prediction
    elif (not isinstance(training_year_window, str)
          and training_year_window < years_before_prediction):
        training_years = years[years_before_prediction -
                               training_year_window:years_before_prediction]
        n_years = training_year_window
    else:
        # Continuing without training years could only fail later with an
        # unrelated error, so stop here with a clear message.
        raise ValueError(
            "Can't retrieve training window: {0}. Please make sure training window is 'ALL' or an int number within the number of years size"
            .format(training_year_window))

    if verbose:
        print('{0} years detected within dataframe: {1}.'.format(
            len(years), years))
        print('{0} Years before prediction: {1}'.format(
            n_years, training_years))

    # Check which t0 dates (moved to the prediction year) fall inside the poi
    poi_index = df[period_of_interest[0]:period_of_interest[1]].index
    dates_within_poi = [
        d for d in t0_vector
        if '{0}'.format(prediction_year) + d[4:] in poi_index
    ]
    if dates_within_poi:
        print(
            '{0} dates from t0_vector are inside period_of_interest range: {1}'
            .format(len(dates_within_poi), dates_within_poi))

    # Enter main loop
    print('Initiating heatmap loop.')
    bar = Bar('Processing', max=len(p_vector))
    for i, p in enumerate(p_vector):
        bar.next()
        for j, t0 in enumerate(t0_vector):
            if verbose:
                print('Reshaping data')
            X = SVDC_reshape_yearly_data_stolerman(df=df[variables], t0=t0, p=p,
                                                   years=training_years,
                                                   upper_bound=period_of_interest[0],
                                                   normalize=True, verbose=False)
            if verbose:
                print('Reshaping data done')

            if X is None:
                distance_grid[i, j] = -1
                continue

            # Each column of X represents one year of data in the order of
            # training_years. To classify year Y we need Y-1 as out of sample
            # input and Y-2, Y-3...1 as training dataset. As every Y is
            # classified with previous year data, the epidemic classification
            # of year Y is the label for column Y-1.
            X_train = X[:, :-1]
            X_predict = X[:, -1]

            Y_train = np.asarray([
                epidemic_classification_dict[year + 1]
                for year in training_years[:-1]
            ]).ravel()
            Y_predict = epidemic_classification_dict[prediction_year]

            # Perform svd
            U, sigma, VT = svd(X_train, n_components=3, n_iter=15,
                               random_state=None)
            projections = (sigma.reshape([-1, 1]) * VT).T
            # Last row: projection of the out-of-sample year onto the modes.
            projections = np.vstack([
                projections[:, modes],
                np.matmul(X_predict.reshape([1, -1]), U[:, modes])
            ])

            # Merge the SVD projections with the optional extra features.
            # They should have the same length. Features are accumulated so
            # that enabling one option never discards the other.
            classifier_dataset = projections
            if add_peaks:
                # This function returns the delta value stated in Stolerman's paper
                average_peak_frequencies = SVDC_get_apfs(df=df, t0=t0, p=p,
                                                         years=training_years,
                                                         upper_bound=period_of_interest[0],
                                                         normalize=True, verbose=False)
                classifier_dataset = np.hstack(
                    [classifier_dataset, average_peak_frequencies])
            if add_runoff_binary:
                average_runoff = SVDC_get_runoffbinary(df=df, t0=t0, p=p,
                                                       years=training_years,
                                                       upper_bound=period_of_interest[0],
                                                       normalize=True, verbose=False)
                classifier_dataset = np.hstack(
                    [classifier_dataset, average_runoff])

            classifier_dataset_train = classifier_dataset[:-1, :]
            classifier_dataset_predict = classifier_dataset[-1, :]

            if verbose:
                print(classifier_dataset_train, classifier_dataset_predict)

            if classifier_name == 'svm':
                mod = svm.SVC(kernel='rbf', gamma=1, C=1, cache_size=400,
                              max_iter=100000)
            else:
                mod = RandomForestClassifier(n_estimators=10, max_depth=2,
                                             random_state=0)

            if verbose:
                print('Fitting with projections shape {0} and target shape {1}'.
                      format(classifier_dataset_train.shape, Y_train.shape))

            mod.fit(classifier_dataset_train, Y_train)
            pred = mod.predict(classifier_dataset_predict.reshape(1, -1))
            distance_grid[i, j] = (pred[0] == Y_predict)

    bar.finish()
    return distance_grid
def step(split, epoch, opt, data_loader, model, optimizer=None):
    """Run one epoch of training or evaluation over ``data_loader``.

    :param split: 'train' to optimise the model; any other value evaluates it
        (with flip augmentation) and collects predictions
    :param epoch: epoch number, only used in the progress text
    :param opt: options object (exp_id, num_stacks, print_iter, debug, ...)
    :param data_loader: loader yielding dicts with 'input', 'target', 'meta'
    :param model: network returning a list of dicts with an 'hm' heatmap
    :param optimizer: optimizer, required when ``split == 'train'``
    :return: ({'loss', 'acc', 'time'}, preds) where 'time' is the elapsed
        time in minutes and ``preds`` is only filled outside training
    """
    # Mainly affects batch normalization and dropout layers.
    if split == 'train':
        model.train()
    else:
        model.eval()

    crit = torch.nn.MSELoss()  # loss function

    dataset = data_loader.dataset
    acc_idxs = dataset.acc_idxs
    edges = dataset.edges
    shuffle_ref = dataset.shuffle_ref
    mean = dataset.mean
    std = dataset.std
    convert_eval_format = dataset.convert_eval_format

    Loss, Acc = AverageMeter(), AverageMeter()
    data_time, batch_time = AverageMeter(), AverageMeter()
    preds = []

    nIters = len(data_loader)
    bar = Bar('{}'.format(opt.exp_id), max=nIters)

    end = time.time()
    for i, batch in enumerate(data_loader):
        data_time.update(time.time() - end)
        inp, target, meta = batch['input'], batch['target'], batch['meta']
        input_var = inp.cuda()
        # Keypoint heatmaps (per the original note: 16 x 64 x 64 per sample).
        target_var = target.cuda()

        output = model(input_var)

        # Loss between predicted and ground-truth heatmaps, summed over all
        # stacks. Every stack output is a dict, so index 'hm' for each one.
        loss = crit(output[-1]['hm'], target_var)
        for k in range(opt.num_stacks - 1):
            loss += crit(output[k]['hm'], target_var)

        if split == 'train':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            # Flip test: only the first sample is flipped and the flipped
            # output is reshaped to a batch of one, so this presumably
            # expects evaluation with batch size 1.
            input_ = inp.cpu().numpy().copy()
            input_[0] = flip(input_[0]).copy()[np.newaxis, ...]
            input_flip_var = torch.from_numpy(input_).cuda()
            output_flip = model(input_flip_var)
            output_flip = shuffle_lr(
                flip(output_flip[-1]['hm'].detach().cpu().numpy()[0]),
                shuffle_ref)
            output_flip = output_flip.reshape(
                1, opt.num_output, opt.output_h, opt.output_w)
            output_flip = torch.from_numpy(output_flip).cuda()
            output[-1]['hm'] = (output[-1]['hm'] + output_flip) / 2
            # Take the peak of every heatmap as the candidate joint position.
            pred, conf = get_preds(output[-1]['hm'].detach().cpu().numpy(), True)
            preds.append(convert_eval_format(pred, conf, meta)[0])

        # The loss is a 0-dim tensor; .item() extracts the Python number.
        Loss.update(loss.item(), inp.size(0))
        Acc.update(accuracy(output[-1]['hm'].detach().cpu().numpy(),
                            target_var.detach().cpu().numpy(), acc_idxs))

        batch_time.update(time.time() - end)
        end = time.time()
        if not opt.hide_data_time:
            time_str = ' |Data {dt.avg:.3f}s({dt.val:.3f}s)' \
                       ' |Net {bt.avg:.3f}s'.format(dt=data_time, bt=batch_time)
        else:
            time_str = ''
        # Set the suffix on this bar only, not on the Bar class.
        bar.suffix = '{split}: [{0}][{1}/{2}] |Total {total:} |ETA {eta:}' \
                     '|Loss {loss.avg:.5f} |Acc {Acc.avg:.4f}' \
                     '{time_str}'.format(epoch, i, nIters, total=bar.elapsed_td,
                                         eta=bar.eta_td, loss=Loss, Acc=Acc,
                                         split=split, time_str=time_str)
        if opt.print_iter > 0:
            if i % opt.print_iter == 0:
                print('{}| {}'.format(opt.exp_id, bar.suffix))
        else:
            bar.next()

        if opt.debug >= 2:
            gt = get_preds(target.cpu().numpy()) * 4
            pred = get_preds(output[-1]['hm'].detach().cpu().numpy()) * 4
            debugger = Debugger(ipynb=opt.print_iter > 0, edges=edges)
            img = (inp[0].numpy().transpose(1, 2, 0) * std + mean) * 256
            img = img.astype(np.uint8).copy()
            debugger.add_img(img)
            debugger.add_mask(
                cv2.resize(target[0].numpy().max(axis=0),
                           (opt.input_w, opt.input_h)), img, 'target')
            debugger.add_mask(
                cv2.resize(output[-1]['hm'][0].detach().cpu().numpy().max(axis=0),
                           (opt.input_w, opt.input_h)), img, 'pred')
            debugger.add_point_2d(pred[0], (255, 0, 0))
            debugger.add_point_2d(gt[0], (0, 0, 255))
            debugger.show_all_imgs(pause=True)

    bar.finish()
    return {'loss': Loss.avg,
            'acc': Acc.avg,
            'time': bar.elapsed_td.total_seconds() / 60.}, preds
class MoveFilesFromStorageController:
    """Class that executes file moves from a direct ingest Google Cloud Storage bucket to the appropriate ingest
    bucket.
    """

    # Optional processed/unprocessed prefix followed by the timestamped name.
    FILE_TO_MOVE_RE = \
        re.compile(r'^(processed_|unprocessed_|un)?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}:\d{6}(raw|ingest_view)?.*)')

    QUEUES_TO_PAUSE = {
        DIRECT_INGEST_SCHEDULER_QUEUE_V2,
        DIRECT_INGEST_STATE_PROCESS_JOB_QUEUE_V2,
        DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2
    }

    PAUSE_QUEUE_URL = 'https://cloudtasks.googleapis.com/v2/projects/{}/locations/us-east1/queues/{}:pause'

    PURGE_QUEUE_URL = 'https://cloudtasks.googleapis.com/v2/projects/{}/locations/us-east1/queues/{}:purge'

    CURL_POST_REQUEST_TEMPLATE = 'curl -X POST -H "Authorization: Bearer $(gcloud auth print-access-token)" {}'

    def __init__(self,
                 project_id: str,
                 region: str,
                 file_type_to_move: GcsfsDirectIngestFileType,
                 destination_file_type: GcsfsDirectIngestFileType,
                 start_date_bound: Optional[str],
                 end_date_bound: Optional[str],
                 dry_run: bool,
                 file_filter: Optional[str]):
        self.project_id = project_id
        self.region = region
        self.file_type_to_move = file_type_to_move
        self.destination_file_type = destination_file_type

        # A mismatch between the two types is only accepted when the files
        # to move are UNSPECIFIED.
        if self.file_type_to_move != self.destination_file_type and \
                self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED:
            raise ValueError(
                'Args file_type_to_move and destination_file_type must match unless type to move is UNSPECIFIED'
            )

        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))
        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))

        # Guards the progress bars and moves_list, which are touched from
        # the worker threads.
        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_'
            f'{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )

    def run_move(self) -> None:
        """Main method of script - executes move, or runs a dry run of a move."""
        if self.dry_run:
            logging.info("Running in DRY RUN mode for region [%s]", self.region)
        else:
            i = input(
                f"This will move [{self.region}] files in [{self.project_id}] that were uploaded starting on date"
                f"[{self.start_date_bound}] and ending on date [{self.end_date_bound}]. Type {self.project_id} "
                f"to continue: ")

            if i != self.project_id:
                return

        if self.dry_run:
            logging.info("DRY RUN: Would pause [%s] in project [%s]",
                         self.QUEUES_TO_PAUSE, self.project_id)
        else:
            i = input(f"Pausing queues {self.QUEUES_TO_PAUSE} in project "
                      f"[{self.project_id}] - continue? [y/n]: ")

            if i.upper() != 'Y':
                return

            self.pause_and_purge_queues()

        date_subdir_paths = self.get_date_subdir_paths()

        if self.dry_run:
            logging.info("DRY RUN: Found [%s] dates to move",
                         len(date_subdir_paths))
        else:
            i = input(f"Found [{len(date_subdir_paths)}] dates to move - "
                      f"continue? [y/n]: ")

            if i.upper() != 'Y':
                return

        thread_pool = ThreadPool(processes=12)
        try:
            files_to_move = self.collect_files_to_move(date_subdir_paths,
                                                       thread_pool)
            self.move_files(files_to_move, thread_pool)
        finally:
            # Release the worker threads even when collecting or moving fails.
            thread_pool.close()
            thread_pool.join()

        self.write_moves_to_log_file()

        if self.dry_run:
            logging.info(
                "DRY RUN: See results in [%s].\n"
                "Rerun with [--dry-run False] to execute move.",
                self.log_output_path)
        else:
            logging.info(
                "Move complete! See results in [%s].\n"
                "\nNext steps:"
                "\n1. (If doing a full re-ingest) Drop Google Cloud database for [%s]"
                "\n2. Resume queues here:", self.log_output_path,
                self.project_id)

            for queue_name in self.QUEUES_TO_PAUSE:
                logging.info("\t%s", self.queue_console_url(queue_name))

    def get_date_subdir_paths(self) -> List[str]:
        """Returns the storage subdirectories within the date bounds that contain files of the type to move."""
        return gsutil_get_storage_subdirs_containing_file_types(
            storage_bucket_path=self.storage_bucket.abs_path(),
            file_type=self.file_type_to_move,
            upper_bound_date=self.end_date_bound,
            lower_bound_date=self.start_date_bound)

    def collect_files_to_move(self, date_subdir_paths: List[str],
                              thread_pool: ThreadPool) -> List[str]:
        """Searches the given list of directory paths for files directly in those directories that should be moved to
        the ingest directory and returns a list of string paths to those files.
        """
        msg_prefix = 'DRY_RUN: ' if self.dry_run else ''
        self.collect_progress = Bar(f"{msg_prefix}Gathering paths to move...",
                                    max=len(date_subdir_paths))
        collect_files_res = thread_pool.map(self.get_files_to_move_from_path,
                                            date_subdir_paths)

        if not self.collect_progress:
            raise ValueError('Progress bar should not be None')
        self.collect_progress.finish()

        return [f for sublist in collect_files_res for f in sublist]

    def move_files(self, files_to_move: List[str],
                   thread_pool: ThreadPool) -> None:
        """Moves files at the given paths to the ingest directory, changing the prefix to 'unprocessed' as necessary.

        For the given list of file paths:

        files_to_move = [
            'storage_bucket/path/to/processed_2019-09-24T09:01:20:039807_elite_offendersentenceterms.csv'
        ]

        Will run:
        gsutil mv
            gs://storage_bucket/path/to/processed_2019-09-24T09:01:20:039807_elite_offendersentenceterms.csv \
            unprocessed_2019-09-24T09:01:20:039807_elite_offendersentenceterms.csv

        Note: Move order is not guaranteed - file moves are parallelized.
        """
        msg_prefix = 'DRY_RUN: ' if self.dry_run else ''
        self.move_progress = Bar(f"{msg_prefix}Moving files...",
                                 max=len(files_to_move))
        thread_pool.map(self.move_file, files_to_move)

        if not self.move_progress:
            raise ValueError('Progress bar should not be None')
        self.move_progress.finish()

    def queue_console_url(self, queue_name: str) -> str:
        """Returns the url to the GAE console page for a queue with a given name."""
        return f'https://console.cloud.google.com/cloudtasks/queue/{queue_name}?project={self.project_id}'

    def do_post_request(self, url: str) -> None:
        """Executes a googleapis.com curl POST request with the given url.

        Raises ValueError if the JSON response contains an 'error' entry.
        """
        # shell=True is needed for the $(gcloud ...) substitution in the
        # command template; the url is built from class constants, the
        # project id and the queue names.
        res = subprocess.Popen(self.CURL_POST_REQUEST_TEMPLATE.format(url),
                               shell=True,
                               stdout=subprocess.PIPE)
        stdout, _stderr = res.communicate()
        response = json.loads(stdout)
        if 'error' in response:
            raise ValueError(response['error'])

    def pause_queue(self, queue_name: str) -> None:
        """Posts a request to pause the queue with the given name."""
        logging.info("Pausing [%s] in [%s]", queue_name, self.project_id)
        self.do_post_request(
            self.PAUSE_QUEUE_URL.format(self.project_id, queue_name))

    def purge_queue(self, queue_name: str) -> None:
        """Posts a request to purge the queue with the given name."""
        logging.info("Purging [%s] in [%s]", queue_name, self.project_id)
        self.do_post_request(
            self.PURGE_QUEUE_URL.format(self.project_id, queue_name))

    def pause_and_purge_queues(self) -> None:
        """Pauses and purges Direct Ingest queues for the specified project."""
        for queue_name in self.QUEUES_TO_PAUSE:
            self.pause_queue(queue_name)
            self.purge_queue(queue_name)

    def get_files_to_move_from_path(self, gs_dir_path: str) -> List[str]:
        """Returns files directly in the given directory that should be moved back into the ingest directory.
        """
        file_paths = gsutil_ls(gs_dir_path)

        result = []
        for file_path in file_paths:
            _, file_name = os.path.split(file_path)
            if re.match(self.FILE_TO_MOVE_RE, file_name):
                if not self.file_filter or re.search(self.file_filter,
                                                     file_name):
                    result.append(file_path)
        with self.mutex:
            if self.collect_progress:
                self.collect_progress.next()
        return result

    def move_file(self, original_file_path: str) -> None:
        """Moves a file at the given path into the ingest directory, updating the name to always have an prefix of
        'unprocessed'. Logs the file move, which will later be written to a log file.

        If in dry_run mode, merely logs the move, but does not execute it.
        """
        new_file_path = self.build_moved_file_path(original_file_path)

        if not self.dry_run:
            gsutil_mv(original_file_path, new_file_path)

        with self.mutex:
            self.moves_list.append((original_file_path, new_file_path))
            if self.move_progress:
                self.move_progress.next()

    def build_moved_file_path(self, original_file_path: str) -> str:
        """Builds the desired path for the given file in the ingest bucket, changing the prefix to 'unprocessed' as is
        necessary.

        Raises ValueError if the resulting file name does not match FILE_TO_MOVE_RE.
        """
        path_as_unprocessed = to_normalized_unprocessed_file_path_from_normalized_path(
            original_file_path, file_type_override=self.destination_file_type)

        _, file_name = os.path.split(path_as_unprocessed)

        if not re.match(self.FILE_TO_MOVE_RE, file_name):
            raise ValueError(f"Invalid file name {file_name}")

        return os.path.join('gs://', self.ingest_bucket.abs_path(), file_name)

    def write_moves_to_log_file(self) -> None:
        """Writes the sorted list of (original, new) paths to the log output file."""
        self.moves_list.sort()
        with open(self.log_output_path, 'w') as f:
            if self.dry_run:
                template = "DRY RUN: Would move {} -> {}\n"
            else:
                template = "Moved {} -> {}\n"

            f.writelines(
                template.format(original_path, new_path)
                for original_path, new_path in self.moves_list)
def import_signal_mask(conn):
    """
    For every Segment whose audio file matches a song in ``songdata``, read the
    enclosing syllable(s) and their elements, store the mean/max/min
    fundamental frequency on the Segment and save a PNG mask picture of the
    syllable with its fundamentals.

    :param conn: database connection used to create a psycopg2 RealDictCursor
    :return: None
    """
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    cur.execute('SELECT id, name, maxfreq, dy FROM songdata s')
    songs_data = cur.fetchall()
    song_info = {
        song['name']: (song['id'], song['maxfreq'], song['dy'])
        for song in songs_data
    }

    segments_info = Segment.objects \
        .filter(audio_file__name__in=song_info.keys()) \
        .values_list('id', 'audio_file__name', 'start_time_ms', 'end_time_ms')

    n = len(segments_info)
    bar = Bar('Importing segments ...', max=n)

    for seg_id, song_name, start, end in segments_info:
        if song_name not in song_info:
            continue
        song_id, nyquist, fbin = song_info[song_name]

        cur.execute('select starttime, endtime, songid from syllable where songid={} and starttime<={} and endtime>={}'
                    ' order by starttime'.format(song_id, start, end))
        syl_rows = cur.fetchall()
        if len(syl_rows) == 0:
            warning('Song #{} {} doesn\'t have a syllable at position {}:{}'.format(song_id, song_name, start, end))
            continue

        if len(syl_rows) > 1:
            warning('Song #{} {} has more than one syllable at position {}:{}. Db Syllable #{}'
                    .format(song_id, song_name, start, end, seg_id))

        for syl_idx, syl_row in enumerate(syl_rows):
            syl_starttime = syl_row['starttime']
            syl_endtime = syl_row['endtime']

            cur.execute('select starttime, timelength, fundfreq, gapbefore, gapafter, maxf, dy,'
                        'overallpeakfreq1, overallpeakfreq2 '
                        'from element where songid={} and starttime >= {} and (starttime + timelength) <= {}'
                        .format(song_id, syl_starttime, syl_endtime))
            el_rows = cur.fetchall()
            if len(el_rows) == 0:
                warning('Syllable #{} starttime={} endtime={} of song: "{}" doesn\'t enclose any syllable.'
                        .format(1, syl_starttime, syl_endtime, song_name))
                continue

            # Tighten the syllable bounds to what its elements actually cover.
            syl_starttime = el_rows[0]['starttime']
            syl_endtime = get_syllable_end_time(el_rows)

            # Fall back to the element's values when the song has none.
            if nyquist == 0:
                nyquist = el_rows[0]['maxf']
            if fbin == 0:
                fbin = el_rows[0]['dy']

            width = int(syl_endtime - syl_starttime) + 1
            height = int(nyquist / fbin)

            # White canvas onto which mask and fundamentals are drawn.
            img_data_rgb = np.ones((height, width, 3), dtype=np.uint8) * 255

            syl_max_ff = 0
            syl_min_ff = 999999
            syl_combined_ff = None

            for el_idx, el in enumerate(el_rows):
                # NOTE(review): `signal` is read below but never assigned in
                # this function and the query above does not select a
                # `signal` column; the original parsing line is commented
                # out. Unless a module-level `signal` sequence exists, the
                # drawing loop cannot run - confirm and restore:
                # signal = list(map(int, el['signal'].strip().split(' ')))
                # Builtin float/int replace the np.float/np.int aliases,
                # which were removed from NumPy.
                fundfreq = np.array(el['fundfreq'].strip().split(' '), dtype='|S32').astype(float)
                el_max_ff = fundfreq[0]
                el_min_ff = fundfreq[1]

                # the first 4 numbers of fundfreq are: max, min, ? (no idea) and ? (no idea), so we ignore them
                fundfreq = fundfreq[4:]
                if el_idx == 0:
                    syl_combined_ff = fundfreq
                else:
                    syl_combined_ff = np.concatenate((syl_combined_ff, fundfreq))

                # Convert frequencies to pixel rows.
                fundfreq = (fundfreq / nyquist * height).astype(int)

                i = 0
                ff_row_idx = 0
                while i < len(signal):
                    num_data = signal[i]
                    img_col_idx = signal[i + 1] - syl_starttime

                    # Draw the mask
                    for j in range(2, num_data, 2):
                        _signal_segment_end = signal[i + j]
                        _signal_segment_start = signal[i + j + 1]
                        img_data_rgb[_signal_segment_start:_signal_segment_end, img_col_idx, :] \
                            = COLOURS[el_idx % len(COLOURS)]

                    # Add the fundamental (red lines)
                    if ff_row_idx < len(fundfreq):
                        img_row_idx = height - fundfreq[ff_row_idx] - 1

                        img_row_idx_padded_low = max(0, img_row_idx - 2)
                        img_row_idx_padded_high = img_row_idx + 4 - (img_row_idx - img_row_idx_padded_low)
                        img_data_rgb[img_row_idx_padded_low:img_row_idx_padded_high, img_col_idx, :] = FF_COLOUR
                    ff_row_idx += 1
                    i += (num_data + 1)

                syl_max_ff = max(syl_max_ff, el_max_ff)
                syl_min_ff = min(syl_min_ff, el_min_ff)
            syl_mean_ff = np.mean(syl_combined_ff)

            # One UPDATE statement instead of three consecutive ones.
            Segment.objects.filter(id=seg_id).update(
                mean_ff=syl_mean_ff, max_ff=syl_max_ff, min_ff=syl_min_ff)

            img = Image.fromarray(img_data_rgb)
            # Same width, 30% of the height.
            thumbnail_width = int(img.size[0])
            thumbnail_height = int(img.size[1] * 0.3)

            img = img.resize((thumbnail_width, thumbnail_height))

            if syl_idx > 0:
                warning('Syl_idx > 0')
                file_path = spect_mask_path('{}_{}'.format(seg_id, syl_idx))
            else:
                file_path = spect_mask_path(seg_id)
            ensure_parent_folder_exists(file_path)

            img.save(file_path, format='PNG')
        bar.next()
    bar.finish()
class MoveFilesToDeprecatedController:
    """Class with functionality to move files to deprecated folder with proper formatting."""

    def __init__(
        self,
        file_type: GcsfsDirectIngestFileType,
        region_code: str,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        project_id: str,
        file_filter: Optional[str],
    ):
        self.file_type = file_type
        self.region_code = region_code
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter
        self.project_id = project_id
        self.region_storage_dir_path_for_file_type = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_code,
                    SystemLevel.STATE,
                    self.file_type,
                    project_id=self.project_id,
                )))
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"move_storage_files_to_deprecated_start_bound_{self.region_code}_region_{self.start_date_bound}"
            f"_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )
        # Guards move_list and the progress bar, which are updated from the
        # worker threads.
        self.mutex = threading.Lock()
        self.move_list: List[Tuple[str, str]] = []
        self.move_progress: Optional[Bar] = None

    def run(self) -> None:
        """Main function that will execute the move to deprecated."""

        # TODO(#3666): Update this script to make updates to our Operations db and BigQuery (if necessary).
        #  For now we print these messages to check if appropriate data has been deleted from operations db.
        if self.dry_run:
            if self.file_type == GcsfsDirectIngestFileType.RAW_DATA:
                logging.info(
                    "[DRY RUN] All associated rows from our postgres table `direct_ingest_raw_file_metadata` "
                    "and BigQuery dataset `%s_raw_data` must be deleted before moving these "
                    "files to a deprecated location. Make sure you have done this before moving these files.",
                    self.region_code,
                )
            elif self.file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
                logging.info(
                    "[DRY RUN] All associated rows from our postgres table `direct_ingest_ingest_file_"
                    "metadata` must be deleted before moving these files to a deprecated location. "
                    "Make sure you have done this before moving these files.")
        else:
            if self.file_type == GcsfsDirectIngestFileType.RAW_DATA:
                i = input(
                    "All associated rows from our postgres table `direct_ingest_raw_file_metadata` "
                    f"and BigQuery dataset `{self.region_code}_raw_data` must be deleted before moving these "
                    "files to a deprecated location.\n Have you already done so? [y/n]: "
                )

                if i.upper() != "Y":
                    return
            elif self.file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
                i = input(
                    "All associated rows from our postgres table `direct_ingest_ingest_file_metadata` "
                    "must be deleted before moving these files to a deprecated location.\n"
                    "Have you already done so? [y/n]: ")

                if i.upper() != "Y":
                    return

        destination_dir_path = os.path.join(
            self.region_storage_dir_path_for_file_type.abs_path(),
            "deprecated",
            f"deprecated_on_{date.today()}",
            f"{str(self.file_type.value)}/",
        )

        if self.dry_run:
            logging.info(
                "[DRY RUN] Moving files from [%s] to [%s]",
                self.region_storage_dir_path_for_file_type.abs_path(),
                destination_dir_path,
            )
        else:
            i = input(
                f"Moving files from [{self.region_storage_dir_path_for_file_type.abs_path()}] to "
                f"[{destination_dir_path}] - continue? [y/n]: ")

            if i.upper() != "Y":
                return

        files_to_move = self._get_files_to_move()

        if self.dry_run:
            logging.info("[DRY RUN] Found [%d] files to move",
                         len(files_to_move))
        else:
            i = input(f"Found [{len(files_to_move)}] files to move - "
                      f"continue? [y/n]: ")

            if i.upper() != "Y":
                return

        self._execute_move(files_to_move)
        self._write_move_to_log_file()

        if self.dry_run:
            logging.info(
                "DRY RUN: See results in [%s].\n"
                "Rerun with [--dry-run False] to execute move.",
                self.log_output_path,
            )
        else:
            logging.info("Move complete! See results in [%s].\n",
                         self.log_output_path)

    def _get_files_to_move(self) -> List[str]:
        """Function that gets the files to move to deprecated based on the file_filter and end/start dates specified"""
        subdirs = gsutil_get_storage_subdirs_containing_file_types(
            storage_bucket_path=GcsfsDirectoryPath.from_bucket_and_blob_name(
                self.region_storage_dir_path_for_file_type.bucket_name,
                self.region_code).abs_path(),
            file_type=self.file_type,
            lower_bound_date=self.start_date_bound,
            upper_bound_date=self.end_date_bound,
        )
        result = []
        for subdir_path in subdirs:
            from_paths = gsutil_ls(f"{subdir_path}*.csv")
            for from_path in from_paths:
                _, file_name = os.path.split(from_path)
                if re.match(INGESTED_FILE_REGEX, file_name):
                    if not self.file_filter or re.search(
                            self.file_filter, file_name):
                        result.append(from_path)
        return result

    def _write_move_to_log_file(self) -> None:
        """Writes the sorted list of (original, new) paths to the log output file."""
        self.move_list.sort()
        with open(self.log_output_path, "w") as f:
            if self.dry_run:
                template = "DRY RUN: Would move {} -> {}\n"
            else:
                template = "Moved {} -> {}\n"

            f.writelines(
                template.format(original_path, new_path)
                for original_path, new_path in self.move_list)

    def _move_files_for_date(self, from_uri: str) -> None:
        """Function that loops through each list of files to move and moves them to the deprecated folder
        in accordance with the date they were received and the date they were deprecated."""
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        # e.g. an ISO date such as 2020-01-31 becomes the path 2020/01/31/
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        to_uri = os.path.join(
            "gs://",
            self.region_storage_dir_path_for_file_type.bucket_name,
            self.region_code,
            "deprecated",
            f"deprecated_on_{date.today()}",
            str(self.file_type.value),
            new_date_format,
            curr_gcsfs_file_path.file_name,
        )
        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()

    def _execute_move(self, files_to_move: List[str]) -> None:
        """Moves the given files in parallel on a pool of 12 threads, showing a progress bar."""
        self.move_progress = Bar("Moving files to deprecated...",
                                 max=len(files_to_move))

        thread_pool = ThreadPool(processes=12)
        try:
            thread_pool.map(self._move_files_for_date, files_to_move)
        finally:
            # Always release the worker threads, also when a move fails.
            thread_pool.close()
            thread_pool.join()

        self.move_progress.finish()
def run():
    """Recalculate class ranks and per-class total-score records.

    For every class exam record with attendees, the students of that class
    are ranked within the sub exam (ordered by ``deng_di``) and their scores
    are added to a running total. When the (class, exam) pair changes, the
    totals are stored as StudentExamRecord rows of the total-score course
    (course id 60), ranked by descending total.
    """
    total_course_id = 60

    def save_total_ranks(exam_id, totals):
        """Store one ranked total-score record per student for ``exam_id``."""
        if not totals:
            return
        ranked = sorted(totals.items(), key=lambda d: d[1], reverse=True)
        # The target sub exam is the same for every student of the group.
        sub_exam_in_db = SubExam.objects.get(exam_id=exam_id,
                                             course_id=total_course_id)
        for rank, (student_id, total_score) in enumerate(ranked, start=1):
            student_in_db = Student.objects.get(id=student_id)
            StudentExamRecord.objects.create(
                student=student_in_db,
                sub_exam=sub_exam_in_db,
                score=total_score,
                class_rank=rank
            )

    # calculate class rank
    StudentExamRecord.objects.filter(sub_exam__course_id=total_course_id).delete()
    StudentExamRecord.objects.filter(class_rank__gt=0).update(class_rank=0)

    class_exam_records = ClassExamRecord.objects.filter(
        attend_count__gt=0,
        stu_class__isnull=False
    ).order_by(
        'stu_class_id', 'sub_exam__exam_id'
    )

    bar = Bar('Class Ranking', max=len(class_exam_records))

    # Totals are kept per (class, exam). Keying on the exam alone would merge
    # two consecutive classes that share the same exam id into one ranking.
    current_group = None
    total_score_counter = {}

    for class_exam_record in class_exam_records:
        bar.next()

        record_group = (class_exam_record.stu_class_id,
                        class_exam_record.sub_exam.exam_id)
        if current_group is not None and record_group != current_group:
            save_total_ranks(current_group[1], total_score_counter)
            total_score_counter = {}
        current_group = record_group

        stu_class = class_exam_record.stu_class
        students = stu_class.studentrecord_set.values(
            'student_id'
        ).distinct().values_list('student_id', flat=True)

        sub_exam_id = class_exam_record.sub_exam_id
        student_exam_records = StudentExamRecord.objects.filter(
            sub_exam_id=sub_exam_id,
            student_id__in=students,
            score__gte=0
        ).order_by('deng_di')

        for rank, student_record in enumerate(student_exam_records, start=1):
            student_record.class_rank = rank
            student_record.save()

            student_id = student_record.student_id
            total_score_counter[student_id] = (
                total_score_counter.get(student_id, 0) + student_record.score)

    # Flush the totals of the last group; it counts as one extra bar step.
    if current_group is not None:
        bar.max += 1
        save_total_ranks(current_group[1], total_score_counter)
        bar.next()

    bar.finish()
def train(model, data,
          batch_size=128,
          learning_rate=FLAGS.learning_rate,
          log_dir='./log',
          checkpoint_dir='./checkpoint',
          num_epochs=-1):
    """Build the training graph for ``model`` and run the training loop.

    After every epoch a checkpoint is written, ``evaluate`` is run on
    ``FLAGS.dataset`` and train/test metrics are printed and added to the
    summary log.

    :param model: callable ``model(x, is_training=True)`` returning logits
    :param data: object providing ``generate_batches(batch_size)`` (returns
        the input and label tensors) and ``size`` (``size[0]`` is used as
        the number of samples per epoch)
    :param batch_size: number of samples per training step
    :param learning_rate: learning rate handed to ``optimize_loss``
    :param log_dir: directory for summary files
    :param checkpoint_dir: directory checkpoints are restored from / saved to
    :param num_epochs: number of epochs to run; the default ``-1`` never
        equals the epoch counter, so training runs until interrupted
    """
    # tf Graph input
    with tf.device('/cpu:0'):
        with tf.name_scope('data'):
            x, yt = data.generate_batches(batch_size)

        global_step = tf.get_variable('global_step', shape=[], dtype=tf.int64,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
    if FLAGS.gpu:
        device_str = '/gpu:' + str(FLAGS.device)
    else:
        device_str = '/cpu:0'
    with tf.device(device_str):
        y = model(x, is_training=True)
        # Define loss and optimizer
        with tf.name_scope('objective'):
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=yt, logits=y))
            accuracy = tf.reduce_mean(
                tf.cast(tf.nn.in_top_k(y, yt, 1), tf.float32))
        opt = tf.contrib.layers.optimize_loss(
            loss, global_step, learning_rate, 'Adam',
            gradient_noise_scale=None, gradient_multipliers=None,
            clip_gradients=None,
            learning_rate_decay_fn=learning_rate_decay_fn,
            update_ops=None, variables=None, name=None)

    # Moving averages of loss/accuracy (and of the trainable variables);
    # the averaged values are what gets reported after each epoch.
    ema = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step, name='average')
    ema_op = ema.apply([loss, accuracy] + tf.trainable_variables())
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)

    loss_avg = ema.average(loss)
    tf.summary.scalar('loss/training', loss_avg)
    accuracy_avg = ema.average(accuracy)
    tf.summary.scalar('accuracy/training', accuracy_avg)

    # Abort the step if the loss becomes NaN/Inf.
    check_loss = tf.check_numerics(loss, 'model diverged: loss->nan')
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, check_loss)
    updates_collection = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    # Run the optimizer step before the grouped update ops.
    with tf.control_dependencies([opt]):
        train_op = tf.group(*updates_collection)

    if FLAGS.summary:
        add_summaries(scalar_list=[accuracy, accuracy_avg, loss, loss_avg],
                      activation_list=tf.get_collection(
                          tf.GraphKeys.ACTIVATIONS),
                      var_list=tf.trainable_variables())

    summary_op = tf.summary.merge_all()

    # Configure options for session
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
        gpu_options=gpu_options,
    ))
    saver = tf.train.Saver(max_to_keep=100)

    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print('checkpoint is restored.')
    else:
        print('No checkpoint file found')
        sess.run(tf.global_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    num_samples = data.size[0]
    # Ceiling division: the step loop below also runs the final partial
    # batch, and the progress bar needs an integer maximum.
    num_batches = (num_samples + batch_size - 1) // batch_size
    summary_writer = tf.summary.FileWriter(log_dir, graph=sess.graph)
    epoch = 0

    print('num of trainable paramaters: %d' %
          count_params(tf.trainable_variables()))
    while epoch != num_epochs:
        epoch += 1
        curr_step = 0

        print('Started epoch %d' % epoch)
        bar = Bar('Training', max=num_batches,
                  suffix='%(percent)d%% eta: %(eta)ds')
        while curr_step < num_samples:
            _, loss_val = sess.run([train_op, loss])
            # Advance by the batch size actually used to build the batches
            # (the argument), not by the global flag.
            curr_step += batch_size
            bar.next()

        step, acc_value, loss_value, summary = sess.run(
            [global_step, accuracy_avg, loss_avg, summary_op])
        saver.save(sess, save_path=checkpoint_dir + '/model.ckpt',
                   global_step=global_step)
        bar.finish()
        print('Finished epoch %d' % epoch)
        print('Training Accuracy: %.3f' % acc_value)
        print('Training Loss: %.3f' % loss_value)

        test_acc, test_loss = evaluate(model, FLAGS.dataset,
                                       batch_size=batch_size,
                                       checkpoint_dir=checkpoint_dir)
        print('Test Accuracy: %.3f' % test_acc)
        print('Test Loss: %.3f' % test_loss)

        # Append the test metrics to the training summary of this step.
        summary_out = tf.Summary()
        summary_out.ParseFromString(summary)
        summary_out.value.add(tag='accuracy/test', simple_value=test_acc)
        summary_out.value.add(tag='loss/test', simple_value=test_loss)
        summary_writer.add_summary(summary_out, step)
        summary_writer.flush()

    # When done, ask the threads to stop.
    coord.request_stop()
    coord.join(threads)
    coord.clear_stop()
    summary_writer.close()
def excel_creator(dic_list, new=True):
    """Write the trial records in ``dic_list`` to ``Med_list.xlsx``.

    Every record is expanded into ``d["cells"]`` rows (one per entry of
    ``d["clinc_listsl"]``). The rows are copied into a formatted sheet
    named ``KI`` in which the first 17 columns of each record are merged
    vertically.

    :param dic_list: list of dicts with the keys read below (``n_n``,
        ``rki``, ..., ``clinc_listsl``, ``cells``)
    :param new: when true a fresh workbook with a header row is created;
        otherwise ``Med_list.xlsx`` is loaded, every sheet except ``Sheet``
        is dropped and the new rows are inserted at the top of ``Sheet``
    """
    # Per-record values that are repeated on every expanded row, in column
    # order. Column 18 (clinic) and column 19 (row count) are appended below.
    common_keys = (
        "n_n", "rki", "date", "name_lp", "organization", "country",
        "organization_lp", "date_start", "date_end", "n_protokol",
        "protokol", "phase", "view", "n_orgs", "n_patient", "type", "status",
    )
    print("Start to create xlx file")
    if new:
        book = Workbook()
        sheet = book.active
    else:
        book = load_workbook("Med_list.xlsx")
        try:
            for sheet_name in book.sheetnames:
                if sheet_name != "Sheet":
                    book.remove(book[sheet_name])
        except Exception:
            # Best-effort cleanup of sheets left over from a previous run.
            pass
        book.create_sheet("Sheet_row")
        sheet = book["Sheet_row"]
    if new:
        sheet.append([
            "номер п/п", "Номер РКИ", "Дата создания РКИ", "Наименование ЛП",
            "Организация, проводящая КИ", "Страна разраб-ка",
            "Организация, привлеченная разработчиком ЛП", "Начало (дата)",
            "Окончание (дата)", "№ протокола", "Протокол", "Фаза КИ", "Вид КИ",
            "Колич. мед. орг-й", "Колич. пациент.", "Области применения",
            "Состояние",
            "Перечень медицинских организаций, в которых предполагается проведение клинических исследований"
        ])

    for d in dic_list:
        for clinic_idx in range(d["cells"]):
            list_of_items = [d[key] for key in common_keys]
            list_of_items.append(d["clinc_listsl"][clinic_idx])
            list_of_items.append(d["cells"])
            # Records without a registry number are not written.
            if d["rki"] != "":
                sheet.append(list_of_items)
    r = len(sheet['A'])
    print("Create Sheet_row")

    if not new:
        # Make room at the top of the existing sheet and copy the freshly
        # built rows into it.
        sheet_d = book["Sheet_row"]
        sheet = book["Sheet"]
        sheet.insert_rows(idx=2, amount=r)
        for ind, row in enumerate(sheet_d.rows):
            for col, k in enumerate(row):
                sheet.cell(row=ind + 2, column=col + 1).value = k.value

    book.create_sheet("KI")
    sheet_ki = book["KI"]
    print("Create KI")
    for inx, row in enumerate(sheet.rows):
        for cl, t in enumerate(row):
            sheet_ki.cell(row=inx + 1, column=cl + 1).value = t.value
            if sheet_ki.cell(row=inx + 1, column=cl + 1).value == 1:
                sheet_ki.row_dimensions[inx + 1].height = 60

    print("Alignment 1 iter KI")
    last_row = len(sheet_ki['A'])
    bar = Bar('Processing', max=last_row)
    if last_row >= 2:
        # A record is identified by its row count (column 19) together with
        # its registry number (column 2); consecutive rows sharing both
        # belong to the same record and get merged.
        checker = sheet_ki.cell(row=2, column=19).value
        checker2 = sheet_ki.cell(row=2, column=2).value
        for i in range(2, last_row + 1):
            row_count = sheet_ki.cell(row=i, column=19).value
            row_rki = sheet_ki.cell(row=i, column=2).value
            if row_count != checker or row_rki != checker2:
                # Row i starts a new record: merge the one that just ended.
                for col in range(1, 18):
                    sheet_ki.merge_cells(start_row=i - checker,
                                         start_column=col,
                                         end_row=i - 1,
                                         end_column=col)
                checker = row_count
                # Track the registry number of the record starting at row i
                # (not the one in row 2), otherwise every later row looks
                # like the start of a new record.
                checker2 = row_rki
            bar.next()
        # Merge the final record, which has no following row to trigger it.
        for col in range(1, 18):
            sheet_ki.merge_cells(start_row=last_row - checker + 1,
                                 start_column=col,
                                 end_row=last_row,
                                 end_column=col)
    bar.finish()

    print("Alignment2 iter KI")
    bar = Bar('Processing', max=len(sheet_ki['A']))
    for row in sheet_ki.rows:
        for k in row:
            k.alignment = Alignment(vertical="center",
                                    horizontal="center",
                                    wrapText=True)
            k.font = Font(size="9")
        bar.next()
    bar.finish()

    print("Alignment 3 iter KI")
    bar = Bar('Processing', max=len(sheet_ki['A']))
    for row in sheet_ki.rows:
        # The last two columns are left-aligned.
        for k in row[-2:]:
            k.alignment = Alignment(vertical="center",
                                    horizontal="left",
                                    wrapText=True)
        bar.next()
    bar.finish()

    sheet_ki.column_dimensions[get_column_letter(4)].width = 25
    sheet_ki.column_dimensions[get_column_letter(5)].width = 35
    sheet_ki.column_dimensions[get_column_letter(7)].width = 50
    sheet_ki.column_dimensions[get_column_letter(11)].width = 60
    sheet_ki.column_dimensions[get_column_letter(16)].width = 11
    sheet_ki.column_dimensions[get_column_letter(17)].width = 11
    sheet_ki.column_dimensions[get_column_letter(18)].width = 1000
    book.save("Med_list.xlsx")
    print("file was done")
def upload_self(api_base_url='', token='', source_file='', dest_path='', chunksize=10247680):
    """Upload a file in chunks through an API upload session (no SDK).

    Ref: https://dev.onedrive.com/items/upload_post.htm

    :param api_base_url: base URL of the API; the session endpoint is
        appended to it directly
    :param token: access token sent as bearer authorization
    :param source_file: local path of the file to upload
    :param dest_path: remote directory the file is uploaded into
    :param chunksize: size in bytes of each uploaded piece
    :return: True when every piece was sent, False when the file is empty
        or the upload session could not be created
    """
    # filesize cannot > 10GiB
    file_size = os.path.getsize(source_file)
    if file_size == 0:
        # There is no byte range to send for an empty file.
        print('Cannot upload empty file: {}'.format(source_file))
        return False

    ## get upload URL
    if not dest_path.endswith('/'):
        dest_path += '/'
    # Prepare API call
    dest_path = path_to_remote_path(dest_path) + '/' + path_to_name(
        source_file)
    info_json = json.dumps({
        'item': {
            '@name.conflictBehavior': 'rename',
            'name': path_to_name(source_file)
        }
    })

    api_url = api_base_url + 'drive/root:{dest_path}:/upload.createSession'.format(
        dest_path=dest_path)
    req = requests.post(api_url,
                        data=info_json,
                        headers={
                            'Authorization': 'bearer {access_token}'.format(access_token=token),
                            'content-type': 'application/json'
                        })

    if req.status_code > 201:
        print(req.json()['error']['message'])
        return False

    req = convert_utf8_dict_to_dict(req.json())
    uploadUrl = req['uploadUrl']

    # Inclusive [first_byte, last_byte] ranges; the last one is clipped to
    # the end of the file.
    range_list = [[i, i + chunksize - 1]
                  for i in range(0, file_size, chunksize)]
    range_list[-1][-1] = file_size - 1

    # Upload with a progress bar
    bar = Bar('Uploading', max=len(range_list),
              suffix='%(percent).1f%% - %(eta)ds')
    bar.next()  # necessary to init the Bar

    # Session reuse when uploading, hopefully will kill some overhead.
    # The context manager closes the pooled connections afterwards.
    with requests.Session() as requests_session:
        for range_this in range_list:
            upload_one_piece(uploadUrl=uploadUrl,
                             token=token,
                             source_file=source_file,
                             range_this=range_this,
                             file_size=file_size,
                             requests_session=requests_session)
            bar.next()

    bar.finish()
    return True
def build_model(self):
    """Build the generator/discriminator programs and train them on MNIST.

    For every epoch the discriminator and generator are trained batch by
    batch, both are then evaluated on ``self.cfg.test_size`` real and
    generated samples, sample digits are written to ``./data`` as JPEG
    files and the losses are recorded. The collected train/test history is
    pickled to ``acgan-history.pkl``.

    Reads ``latent_size``, ``epochs``, ``batch_size``, ``num_classes`` and
    ``test_size`` from ``self.cfg``. Returns nothing.
    """
    # Input placeholders of the fluid programs.
    image_batch = layers.data(name='image_batch', shape=[-1, 1, 28, 28], dtype='float32')
    label_batch = layers.data(name='label_batch', shape=[-1, 1], dtype='int64')
    noise = layers.data(name='noise', shape=[-1, self.cfg.latent_size], dtype='float32')
    sampled_labels = layers.data(name='sampled_labels', shape=[-1, 1], dtype='int64')
    x = layers.data(name='x', shape=[-1, 1, 28, 28], dtype='float32')
    y = layers.data(name='y', shape=[-1, 1], dtype='float32')
    aux_y = layers.data(name='aux_y', shape=[-1, 1], dtype='int64')
    trick = layers.data(name='trick', shape=[-1, 1], dtype='float32')

    g_train = GTrain(sampled_labels, noise, trick, self.cfg)
    d_train = DTrain(x, y, aux_y, self.cfg)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    g_train_prog = fluid.CompiledProgram(g_train.program)
    d_train_prog = fluid.CompiledProgram(d_train.program)

    # Per-epoch losses, keyed by 'generator' / 'discriminator'.
    train_history = defaultdict(list)
    test_history = defaultdict(list)

    for epoch in range(1, self.cfg.epochs + 1):
        print('Epoch {}/{}'.format(epoch, self.cfg.epochs))

        # NOTE(review): the bar maximum is rounded up, but the reader below
        # uses drop_last=True, so the bar may stop one step short.
        num_batches = int(np.ceil(60000 / float(self.cfg.batch_size)))
        progress_bar = Bar('Training', max=num_batches)

        epoch_gen_loss = []
        epoch_disc_loss = []

        train_reader = paddle.batch(paddle.reader.shuffle(mnist.train(), buf_size=60000),
                                    batch_size=self.cfg.batch_size,
                                    drop_last=True)
        test_reader = mnist.test()

        step = 0
        for i, data in enumerate(train_reader()):
            # From here on image_batch / label_batch are numpy arrays; the
            # placeholders of the same name are no longer referenced.
            image_batch = np.array([x[0].reshape(1, 28, 28) for x in data]).astype('float32')
            label_batch = np.array([[x[1]] for x in data]).astype('int64')
            # Skip incomplete batches.
            if len(image_batch) != self.cfg.batch_size:
                continue

            # generate a new batch of noise
            noise_np = np.random.uniform(
                -1, 1, (self.cfg.batch_size, self.cfg.latent_size)).astype('float32')

            # sample some labels from p_c
            sampled_labels_np = np.random.randint(
                0, self.cfg.num_classes, self.cfg.batch_size).astype('int64')
            sampled_labels_np = np.expand_dims(sampled_labels_np, axis=1)

            # generate a batch of fake images, using the generated labels as
            # a conditioner. We reshape the sampled labels to be
            # (self.cfg.batch_size, 1) so that we can feed them into the
            # embedding layer as a length one sequence
            generated_images = exe.run(g_train.infer_program,
                                       feed={
                                           'sampled_labels': sampled_labels_np,
                                           'noise': noise_np
                                       },
                                       fetch_list=[g_train.fake_img])[0]

            # Real images first, generated images second.
            x_np = np.concatenate((image_batch, generated_images))

            # use one-sided soft real/fake labels
            # Salimans et al., 2016
            # https://arxiv.org/pdf/1606.03498.pdf (Section 3.4)
            soft_zero, soft_one = 0, 0.95
            y_np = np.array([[soft_one]] * len(image_batch) +
                            [[soft_zero]] * len(image_batch)).astype('float32')
            aux_y_np = np.concatenate((label_batch, sampled_labels_np), axis=0)

            # see if the discriminator can figure itself out...
            epoch_disc_loss.append(
                exe.run(d_train_prog,
                        feed={
                            'x': x_np,
                            'y': y_np,
                            'aux_y': aux_y_np
                        },
                        fetch_list=[d_train.loss])[0])

            # make new noise. we generate 2 * batch size here such that we have
            # the generator optimize over an identical number of images as the
            # discriminator
            noise_np = np.random.uniform(
                -1, 1, (2 * self.cfg.batch_size, self.cfg.latent_size)).astype('float32')
            sampled_labels_np = np.random.randint(
                0, self.cfg.num_classes, 2 * self.cfg.batch_size).astype('int64')
            sampled_labels_np = np.expand_dims(sampled_labels_np, axis=1)

            # we want to train the generator to trick the discriminator
            # For the generator, we want all the {fake, not-fake} labels to say
            # not-fake
            trick_np = np.array([[soft_one]] * 2 * self.cfg.batch_size).astype('float32')

            epoch_gen_loss.append(
                exe.run(g_train_prog,
                        feed={
                            'sampled_labels': sampled_labels_np,
                            'noise': noise_np,
                            'trick': trick_np
                        },
                        fetch_list=[g_train.loss])[0])
            # step is incremented but not read anywhere in this method.
            step += 1
            progress_bar.next()
        progress_bar.finish()

        print('Testing for epoch {}'.format(epoch))

        # evaluate the testing loss here

        # generate a new batch of noise
        noise_np = np.random.uniform(
            -1, 1, (self.cfg.test_size, self.cfg.latent_size)).astype('float32')

        # sample some labels from p_c and generate images from them
        sampled_labels_np = np.random.randint(
            0, self.cfg.num_classes, self.cfg.test_size).astype('int64')
        sampled_labels_np = np.expand_dims(sampled_labels_np, axis=1)

        generated_images = exe.run(g_train.infer_program,
                                   feed={
                                       'sampled_labels': sampled_labels_np,
                                       'noise': noise_np
                                   },
                                   fetch_list=[g_train.fake_img])[0]

        # Collect the first test_size real test samples.
        x_test, y_test = [], []
        for data in test_reader():
            x_test.append(np.reshape(data[0], [1, 28, 28]))
            y_test.append([data[1]])
            if len(x_test) >= self.cfg.test_size:
                break
        x_test = np.array(x_test).astype('float32')
        y_test = np.array(y_test).astype('int64')

        x_np = np.concatenate((x_test, generated_images))
        # Hard labels (1 = real, 0 = generated) are used for evaluation.
        y_np = np.array([[1]] * self.cfg.test_size +
                        [[0]] * self.cfg.test_size).astype('float32')
        aux_y_np = np.concatenate((y_test, sampled_labels_np), axis=0)

        # see if the discriminator can figure itself out...
        discriminator_test_loss = exe.run(
            d_train.infer_program,
            feed={
                'x': x_np,
                'y': y_np,
                'aux_y': aux_y_np
            },
            fetch_list=[d_train.unweighted_loss])[0][0]

        discriminator_train_loss = np.mean(np.array(epoch_disc_loss))

        # make new noise
        noise_np = np.random.uniform(
            -1, 1, (2 * self.cfg.test_size, self.cfg.latent_size)).astype('float32')
        sampled_labels_np = np.random.randint(
            0, self.cfg.num_classes, 2 * self.cfg.test_size).astype('int64')
        sampled_labels_np = np.expand_dims(sampled_labels_np, axis=1)

        trick_np = np.array([[1]] * 2 * self.cfg.test_size).astype('float32')

        generated_images = exe.run(g_train.infer_program,
                                   feed={
                                       'sampled_labels': sampled_labels_np,
                                       'noise': noise_np
                                   },
                                   fetch_list=[g_train.fake_img])[0]
        # Generator test loss: the discriminator's loss on generated images
        # that are labelled as real.
        generator_test_loss = exe.run(d_train.infer_program,
                                      feed={
                                          'x': generated_images,
                                          'y': trick_np,
                                          'aux_y': sampled_labels_np
                                      },
                                      fetch_list=[d_train.unweighted_loss
                                                  ])[0][0]

        generator_train_loss = np.mean(np.array(epoch_gen_loss))

        # generate an epoch report on performance
        train_history['generator'].append(generator_train_loss)
        train_history['discriminator'].append(discriminator_train_loss)

        test_history['generator'].append(generator_test_loss)
        test_history['discriminator'].append(discriminator_test_loss)

        print('train g loss', generator_train_loss)
        print('train d loss', discriminator_train_loss)
        print('test g loss', generator_test_loss)
        print('test d loss', discriminator_test_loss)

        # generate some digits to display
        num_rows = 4
        # The same num_rows noise vectors are repeated for every class.
        noise_np = np.tile(
            np.random.uniform(-1, 1, (num_rows, self.cfg.latent_size)),
            (self.cfg.num_classes, 1)).astype('float32')

        sampled_labels_np = np.array([[i] * num_rows
                                      for i in range(self.cfg.num_classes)
                                      ]).reshape(-1, 1).astype('int64')

        generated_images = exe.run(g_train.infer_program,
                                   feed={
                                       'sampled_labels': sampled_labels_np,
                                       'noise': noise_np
                                   },
                                   fetch_list=[g_train.fake_img])[0]

        def save_images(generated_images, epoch):
            # Write every generated image as a 28x28 grayscale JPEG.
            for i in range(len(generated_images)):
                fname = './data/image_epoch_%d_%d.jpeg' % (epoch, i)
                img = np.array(
                    generated_images[i]).astype('float32').reshape(
                        (28, 28))
                # Map values from [-1, 1] to [0, 255].
                img = img * 127.5 + 127.5
                img = np.clip(img, 0, 255).astype('uint8')
                img = Image.fromarray(img, 'L')
                img.save(fname, format='JPEG')

        save_images(generated_images, epoch)

    # NOTE(review): placed after the epoch loop; confirm against the
    # original layout, which is lost in this view of the file.
    with open('acgan-history.pkl', 'wb') as f:
        pickle.dump({'train': train_history, 'test': test_history}, f)
def verify_producer_performance(with_dr_cb=True):
    """Time how long it takes to produce and deliver a fixed number of messages.

    Produces ``msgcnt`` messages of ``msgsize`` bytes, prints the produce
    throughput, waits for all outstanding deliveries and prints the
    delivery throughput.

    :param with_dr_cb: when true every message is produced with a delivery
        report callback; otherwise the delivery counters are filled in from
        the produce counters for reporting
    """
    conf = {'bootstrap.servers': bootstrap_servers}

    p = confluent_kafka.Producer(**conf)

    topic = 'test'
    msgcnt = 1000000
    msgsize = 100
    msg_pattern = 'test.py performance'
    # Repeat the pattern and cut it to at most msgsize characters.
    msg_payload = (msg_pattern * int(msgsize / len(msg_pattern)))[0:msgsize]

    dr = MyTestDr(silent=True)

    t_produce_start = time.time()
    msgs_produced = 0
    msgs_backpressure = 0
    print('# producing %d messages to topic %s' % (msgcnt, topic))

    if with_progress:
        bar = Bar('Producing', max=msgcnt)
    else:
        bar = None

    for _ in range(msgcnt):
        try:
            # Produce to the same topic that is reported above.
            if with_dr_cb:
                p.produce(topic, value=msg_payload, callback=dr.delivery)
            else:
                p.produce(topic, value=msg_payload)
        except BufferError:
            # Local queue is full (slow broker connection?)
            msgs_backpressure += 1
            if bar is not None and (msgs_backpressure % 1000) == 0:
                # Redraw the bar without advancing it.
                bar.next(n=0)
            p.poll(0)
            continue

        if bar is not None and (msgs_produced % 5000) == 0:
            bar.next(n=5000)
        msgs_produced += 1
        p.poll(0)

    t_produce_spent = time.time() - t_produce_start

    bytecnt = msgs_produced * msgsize

    if bar is not None:
        bar.finish()

    print('# producing %d messages (%.2fMb) took %.3fs: %d msgs/s, %.2f Mb/s' %
          (msgs_produced, bytecnt / (1024*1024), t_produce_spent,
           msgs_produced / t_produce_spent,
           (bytecnt/t_produce_spent) / (1024*1024)))
    print('# %d messages not produce()d due to backpressure (local queue full)' % msgs_backpressure)

    print('waiting for %d/%d deliveries' % (len(p), msgs_produced))
    # Wait for deliveries
    p.flush()
    t_delivery_spent = time.time() - t_produce_start

    print('# producing %d messages (%.2fMb) took %.3fs: %d msgs/s, %.2f Mb/s' %
          (msgs_produced, bytecnt / (1024*1024), t_produce_spent,
           msgs_produced / t_produce_spent,
           (bytecnt/t_produce_spent) / (1024*1024)))

    # Fake numbers if not using a dr_cb
    if not with_dr_cb:
        print('# not using dr_cb')
        dr.msgs_delivered = msgs_produced
        dr.bytes_delivered = bytecnt

    print('# delivering %d messages (%.2fMb) took %.3fs: %d msgs/s, %.2f Mb/s' %
          (dr.msgs_delivered, dr.bytes_delivered / (1024*1024), t_delivery_spent,
           dr.msgs_delivered / t_delivery_spent,
           (dr.bytes_delivered/t_delivery_spent) / (1024*1024)))
    print('# post-produce delivery wait took %.3fs' %
          (t_delivery_spent - t_produce_spent))
def verify_consumer_performance():
    """Verify Consumer performance.

    Subscribes to topic ``test`` with a fresh group id, consumes up to
    ``max_msgcnt`` messages and prints the achieved throughput.

    :raises Exception: when no message arrives within 20 seconds
    :raises confluent_kafka.KafkaException: on any consumer error other
        than reaching the end of a partition
    """
    conf = {'bootstrap.servers': bootstrap_servers,
            'group.id': uuid.uuid1(),
            'session.timeout.ms': 6000,
            'default.topic.config': {
                'auto.offset.reset': 'earliest'
            }}

    c = confluent_kafka.Consumer(**conf)

    def my_on_assign(consumer, partitions):
        print('on_assign:', len(partitions), 'partitions:')
        for p in partitions:
            print(' %s [%d] @ %d' % (p.topic, p.partition, p.offset))
        consumer.assign(partitions)

    def my_on_revoke(consumer, partitions):
        print('on_revoke:', len(partitions), 'partitions:')
        for p in partitions:
            print(' %s [%d] @ %d' % (p.topic, p.partition, p.offset))
        consumer.unassign()

    # The consumer is closed in the finally clause so that it is also
    # released when polling stalls or fails.
    try:
        c.subscribe(["test"], on_assign=my_on_assign, on_revoke=my_on_revoke)

        max_msgcnt = 1000000
        bytecnt = 0
        msgcnt = 0

        print('Will now consume %d messages' % max_msgcnt)

        if with_progress:
            bar = Bar('Consuming', max=max_msgcnt,
                      suffix='%(index)d/%(max)d [%(eta_td)s]')
        else:
            bar = None

        while True:
            # Consume until EOF or error

            msg = c.poll(timeout=20.0)
            if msg is None:
                raise Exception('Stalled at %d/%d message, no new messages for 20s' %
                                (msgcnt, max_msgcnt))

            if msg.error():
                if msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF:
                    # Reached EOF for a partition, ignore.
                    continue
                else:
                    raise confluent_kafka.KafkaException(msg.error())

            bytecnt += len(msg)
            msgcnt += 1

            if bar is not None and (msgcnt % 10000) == 0:
                bar.next(n=10000)

            if msgcnt == 1:
                # Throughput is measured from the first received message.
                t_first_msg = time.time()
            if msgcnt >= max_msgcnt:
                break

        if bar is not None:
            bar.finish()

        if msgcnt > 0:
            t_spent = time.time() - t_first_msg
            print('%d messages (%.2fMb) consumed in %.3fs: %d msgs/s, %.2f Mb/s' %
                  (msgcnt, bytecnt / (1024*1024), t_spent, msgcnt / t_spent,
                   (bytecnt / t_spent) / (1024*1024)))
    finally:
        print('closing consumer')
        c.close()
def download_data(download_urls: list, area: str, driver_path: str, keys: dict, outdir: str):
    """
    Instantiate a Chrome web driver, log in with the given credentials and
    download every url in turn.

    :param download_urls: list of dicts with the keys ``"url"`` (address to
        open) and ``"date"`` (passed to ``rename_file``)
    :param area: passed to ``rename_file`` for the output file name
    :param driver_path: path of the chromedriver executable
    :param keys: dict with the login credentials ``"email"`` and ``"password"``
    :param outdir: directory Chrome downloads into
    """
    # Define options for web driver
    chrome_options = webdriver.ChromeOptions()

    # Define download directory as outdir
    prefs = {"download.default_directory": outdir}

    # Apply options to chrome driver
    chrome_options.add_experimental_option("prefs", prefs)

    # Instantiate web driver
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)

    # Quit the browser when done, also on failure, so no Chrome /
    # chromedriver process is left running.
    try:
        # Login url for Geoinsights platform
        geoinsights_url = "https://www.facebook.com/login/?next=https%3A%2F%2Fwww.facebook.com%2Fgeoinsights-portal%2F"

        # Access login url with webdriver
        driver.get(geoinsights_url)

        # Pause for page load (and cookie acceptance)
        time.sleep(2)

        # Try to accept cookies. The button is not always shown, so a
        # failure here is deliberately ignored.
        try:
            driver.find_element_by_xpath('//*[@id="u_0_h"]').click()
        except Exception:
            pass

        # Add username in username form field
        driver.find_element_by_xpath('//*[@id="email"]').send_keys(keys["email"])

        # Add password in password form field
        driver.find_element_by_xpath('//*[@id="pass"]').send_keys(keys["password"])

        # Click login button
        driver.find_element_by_xpath('//*[@id="loginbutton"]').click()

        # Start download bar
        print("\n\n---------------------")
        bar = Bar("Downloading", max=len(download_urls))

        # For each download url, download dataset
        for url in download_urls:

            # Get time of download start
            download_start = datetime.timestamp(datetime.now())

            # Access download url
            driver.get(url["url"])

            # Wait for file to be downloaded
            latest_file = wait_for_download(download_start, outdir)

            # Rename file with formatted file name
            rename_file(latest_file, outdir, area, url["date"])

            # Update progress bar
            bar.next()

        # Close progress bar
        bar.finish()
    finally:
        driver.quit()
def run(self):
    """Convert the elevation-map messages of the msgpack stream into samples.

    Every message tuple (occluded DEM, its variance map, ground-truth DEM,
    its variance map) is reshaped into 2D grids and cut into subgrids of
    the configured ``size``. Subgrids that are fully or mostly NaN, or too
    similar to the same region of the previously accepted DEM, are
    skipped. The remaining ones are appended to the in-memory caches,
    which are flushed with ``self.save_cache()``.
    """
    self.hdf5_group = self.hdf5_file  # TODO: split into train, val and test set

    prior_occ_dem = None  # full occluded DEM of the last accepted sample
    progress_bar = None   # created lazily once the total can be estimated
    sample_idx = 0
    self.logger.info("Start loading first chunk of msgpack")
    for chunk_idx, chunk in enumerate(self.unpacker):
        self.logger.info(f"Msgpack chunk {chunk_idx} is loaded")
        occ_dem_msgs = chunk["/ga_slam.localElevationMapMean"]
        occ_data_um_msgs = chunk["/ga_slam.localElevationMapVariance"]
        gt_dem_msgs = chunk["/ga_slam.globalElevationMapMean"]
        gt_data_um_msgs = chunk["/ga_slam.globalElevationMapVariance"]

        for msg in zip(occ_dem_msgs, occ_data_um_msgs, gt_dem_msgs,
                       gt_data_um_msgs):
            occ_dem_msg, occ_data_um_msg, gt_dem_msg, gt_data_um_msg = msg

            # time, h and w are read but not used below.
            time = occ_dem_msg["time"]
            h, w = occ_dem_msg["height"], occ_dem_msg["width"]

            # The flat data arrays are reshaped column-major; the second
            # dimension is the integer square root of the element count,
            # i.e. the grids are assumed to be square.
            occ_dem = np.array(occ_dem_msg["data"])
            occ_dem = occ_dem.reshape((-1, int(np.sqrt(occ_dem.shape[0]))),
                                      order="F")
            occ_data_um = np.array(occ_data_um_msg["data"])
            occ_data_um = occ_data_um.reshape(
                (-1, int(np.sqrt(occ_data_um.shape[0]))), order="F")

            gt_dem = np.array(gt_dem_msg["data"])
            gt_dem = gt_dem.reshape((-1, int(np.sqrt(gt_dem.shape[0]))),
                                    order="F")
            gt_data_um = np.array(gt_data_um_msg["data"])
            gt_data_um = gt_data_um.reshape(
                (-1, int(np.sqrt(gt_data_um.shape[0]))), order="F")

            # Hard-coded grid resolution — presumably metres per cell; verify.
            res_grid = np.array([0.05, 0.05])
            # Height value at the centre cell of the occluded DEM.
            rel_position_z = occ_dem[int(occ_dem.shape[0] // 2),
                                     int(occ_dem.shape[1] // 2)]
            rel_position = np.array([0, 0, rel_position_z])
            rel_attitude = Rotation.from_euler('zyx', [0, 0, 0]).as_quat()

            # self.visualize(sample_idx=sample_idx, res_grid=res_grid, rel_position=rel_position,
            #                occ_dem=occ_dem, gt_dem=gt_dem, occ_data_um=occ_data_um, gt_data_um=gt_data_um)

            # Without a configured "size" the whole grid is one subgrid.
            target_size_x = self.config.get("size", occ_dem.shape[0])
            target_size_y = self.config.get("size", occ_dem.shape[1])

            num_subgrids_x = int(np.floor(occ_dem.shape[0] / target_size_x))
            num_subgrids_y = int(np.floor(occ_dem.shape[1] / target_size_y))

            assert num_subgrids_x >= 1 and num_subgrids_y >= 1

            if progress_bar is None:
                # we extrapolate the total maximum number of samples
                # by comparing the number of messages and size of the current chunk
                # TODO: I am not sure if this code is correct (for multiple chunks)
                file_size = os.path.getsize(
                    self.config["msgpack_path"])  # in bytes
                self.total_num_samples = int(
                    len(occ_dem_msgs) / self.unpacker.tell() * file_size)
                self.total_num_samples *= num_subgrids_x * num_subgrids_y  # multiply with the number of subgrids

                progress_bar = Bar(
                    f"Processing msgspack from {self.config['msgpack_path']}",
                    max=self.total_num_samples)

            start_x = 0
            for i in range(num_subgrids_x):
                stop_x = start_x + target_size_x
                start_y = 0
                for j in range(num_subgrids_y):
                    stop_y = start_y + target_size_y

                    occ_dem_subgrid = occ_dem[start_x:stop_x, start_y:stop_y]
                    occ_data_um_subgrid = occ_data_um[start_x:stop_x,
                                                      start_y:stop_y]
                    gt_dem_subgrid = gt_dem[start_x:stop_x, start_y:stop_y]
                    gt_data_um_subgrid = gt_data_um[start_x:stop_x,
                                                    start_y:stop_y]

                    # Offset of the subgrid centre from the centre of the
                    # full grid, scaled by the grid resolution.
                    subgrid_delta_x = res_grid[0] * (
                        -occ_dem.shape[0] / 2 + start_x + target_size_x / 2)
                    subgrid_delta_y = res_grid[1] * (
                        -occ_dem.shape[1] / 2 + start_y + target_size_y / 2)
                    rel_position_subgrid_z = occ_dem_subgrid[
                        int(target_size_x // 2),
                        int(target_size_y // 2)]
                    rel_position_subgrid = np.array([
                        rel_position[0] + subgrid_delta_x,
                        rel_position[1] + subgrid_delta_y,
                        rel_position_subgrid_z
                    ])

                    if np.isnan(occ_dem_subgrid).all():
                        # we skip because the DEM only contains occlusion (NaNs)
                        start_y = stop_y
                        progress_bar.next()
                        continue

                    if np.isnan(gt_dem_subgrid).all():
                        # we skip because the DEM only contains missing values (NaNs)
                        # (the skip is currently disabled, so this branch does nothing)
                        pass
                        # start_y = stop_y
                        # progress_bar.next()
                        # continue

                    max_occ_ratio_thresh = self.config.get(
                        "max_occlusion_ratio_threshold", 0.5)
                    # we do not want to include the subgrid in the dataset if its occluded to more than 50%
                    if np.isnan(occ_dem_subgrid).sum() > (
                            target_size_x * target_size_y *
                            max_occ_ratio_thresh):
                        start_y = stop_y
                        progress_bar.next()
                        continue

                    if prior_occ_dem is not None:
                        # we compute MSE and PSNR between the current occluded dem and the occluded dem from the prior timestamp
                        prior_occ_dem_subgrid = prior_occ_dem[
                            start_x:stop_x, start_y:stop_y]

                        # NaNs are replaced by 0 before comparing.
                        occ_dem_subgrid_no_nan = np.nan_to_num(
                            occ_dem_subgrid, copy=True, nan=0.0)
                        prior_occ_dem_subgrid_no_nan = np.nan_to_num(
                            prior_occ_dem_subgrid, copy=True, nan=0.0)

                        mse = mse_loss_fct(
                            input=torch.tensor(occ_dem_subgrid_no_nan),
                            target=torch.tensor(
                                prior_occ_dem_subgrid_no_nan))

                        # Joint value range of both subgrids for the PSNR.
                        data_min = np.min([
                            occ_dem_subgrid_no_nan,
                            prior_occ_dem_subgrid_no_nan
                        ]).item()
                        data_max = np.max([
                            occ_dem_subgrid_no_nan,
                            prior_occ_dem_subgrid_no_nan
                        ]).item()

                        psnr = psnr_from_mse_loss_fct(mse=mse,
                                                      data_min=data_min,
                                                      data_max=data_max)

                        # we want to exclude dems which are too similar
                        if psnr > self.config.get(
                                "psnr_similarity_threshold", 50):
                            start_y = stop_y
                            progress_bar.next()
                            continue

                    # The subgrid is accepted: cache it.
                    # NOTE(review): gt_data_um_subgrid is not appended to
                    # any cache here — confirm that is intended.
                    self.res_grid.append(res_grid)
                    self.rel_positions.append(rel_position_subgrid)
                    self.rel_attitudes.append(rel_attitude)
                    self.occ_dems.append(occ_dem_subgrid)
                    self.occ_data_ums.append(occ_data_um_subgrid)
                    self.gt_dems.append(gt_dem_subgrid)

                    # NOTE(review): initialized_datasets is not set in this
                    # method; presumably create_base_datasets does it — verify.
                    if self.initialized_datasets is False:
                        super().create_base_datasets(
                            self.hdf5_group, self.total_num_samples)

                        # Resizable datasets: empty at first, growable up to
                        # the estimated total number of samples.
                        self.hdf5_group.create_dataset(
                            name=ChannelEnum.OCC_DEM.value,
                            shape=(0, occ_dem_subgrid.shape[0],
                                   occ_dem_subgrid.shape[1]),
                            maxshape=(self.total_num_samples,
                                      occ_dem_subgrid.shape[0],
                                      occ_dem_subgrid.shape[1]))
                        self.hdf5_group.create_dataset(
                            name=ChannelEnum.OCC_DATA_UM.value,
                            shape=(0, occ_data_um_subgrid.shape[0],
                                   occ_data_um_subgrid.shape[1]),
                            maxshape=(self.total_num_samples,
                                      occ_data_um_subgrid.shape[0],
                                      occ_data_um_subgrid.shape[1]))
                        self.hdf5_group.create_dataset(
                            name=ChannelEnum.GT_DEM.value,
                            shape=(0, gt_dem_subgrid.shape[0],
                                   gt_dem_subgrid.shape[1]),
                            maxshape=(self.total_num_samples,
                                      gt_dem_subgrid.shape[0],
                                      gt_dem_subgrid.shape[1]))

                    # NOTE(review): this tests self.sample_idx, while the
                    # counter incremented below is the local sample_idx —
                    # confirm which one is meant.
                    if self.sample_idx % self.config.get(
                            "save_frequency", 50) == 0:
                        self.save_cache()

                    # NOTE(review): nesting of this call relative to the
                    # save-frequency check above cannot be recovered from
                    # this view of the file — confirm.
                    self.visualize(sample_idx=sample_idx,
                                   res_grid=res_grid,
                                   rel_position=rel_position_subgrid,
                                   occ_dem=occ_dem_subgrid,
                                   gt_dem=gt_dem_subgrid,
                                   occ_data_um=occ_data_um_subgrid,
                                   gt_data_um=gt_data_um_subgrid)

                    # Remember the full DEM for the similarity check.
                    prior_occ_dem = occ_dem

                    sample_idx += 1
                    start_y = stop_y
                    progress_bar.next()
                start_x = stop_x
    # Flush whatever is still cached.
    self.save_cache()
    progress_bar.finish()
def extract_spectrogram():
    """
    Extract raw spectrograms for all segments (not the masked spectrogram
    from Luscinia).

    For every audio file that still has a segment without an exported
    image, the STFT of the whole file is computed, mapped to a 64-level
    colour image and saved. The time range of each segment is then cropped
    out of that image and saved as the segment's spectrogram. Existing
    image files are never overwritten.

    :return: None
    """
    # Group the segments by the audio file they belong to.
    audio_to_segs = {}
    for segment in Segment.objects.all():
        audio_to_segs.setdefault(segment.audio_file, []).append(
            (segment.id, segment.start_time_ms, segment.end_time_ms))

    bar = Bar('Exporting spects ...', max=len(audio_to_segs))
    for audio_file, seg_list in audio_to_segs.items():
        # Nothing to do when every segment image already exists.
        if all(os.path.isfile(spect_fft_path(seg_id, 'syllable'))
               for seg_id, _, _ in seg_list):
            bar.next()
            continue

        filepath = wav_path(audio_file)

        fs, sig = wav_2_mono(filepath)
        duration_ms = len(sig) * 1000 / fs

        _, _, s = signal.stft(sig, fs=fs, window=window, noverlap=noverlap,
                              nfft=window_size, return_onesided=True)
        file_spect = np.abs(s * scale)

        height, width = np.shape(file_spect)
        # Flip vertically so the first image row is the last STFT row.
        file_spect = np.flipud(file_spect)

        try:
            # Log magnitude, quantised to colour-map indices 0..63.
            file_spect = np.log10(file_spect)
            file_spect = ((file_spect - global_min_spect_pixel) / interval64)
            # log10(0) gives -inf; map those pixels to index 0.
            file_spect[np.isinf(file_spect)] = 0
            # np.int was only an alias of the builtin int and no longer
            # exists in current NumPy releases.
            file_spect = file_spect.astype(int)

            file_spect = file_spect.reshape((width * height,), order='C')
            file_spect[file_spect >= 64] = 63
            file_spect_rgb = np.empty((height, width, 3), dtype=np.uint8)
            file_spect_rgb[:, :, 0] = cm_red[file_spect].reshape(
                (height, width)) * 255
            file_spect_rgb[:, :, 1] = cm_green[file_spect].reshape(
                (height, width)) * 255
            file_spect_rgb[:, :, 2] = cm_blue[file_spect].reshape(
                (height, width)) * 255

            file_spect_img = Image.fromarray(file_spect_rgb)
            file_spect_path = spect_fft_path(audio_file.id, 'song')
            ensure_parent_folder_exists(file_spect_path)
            if not os.path.isfile(file_spect_path):
                file_spect_img.save(file_spect_path, format='PNG')

            for seg_id, start, end in seg_list:
                # Convert the segment's time range to image columns.
                roi_start = int(start / duration_ms * width)
                roi_end = int(np.ceil(end / duration_ms * width))

                seg_spect_rgb = file_spect_rgb[:, roi_start:roi_end, :]
                seg_spect_img = Image.fromarray(seg_spect_rgb)
                seg_spect_path = spect_fft_path(seg_id, 'syllable')
                ensure_parent_folder_exists(seg_spect_path)

                if not os.path.isfile(seg_spect_path):
                    seg_spect_img.save(seg_spect_path, format='PNG')

        except Exception:
            warning('Error occured at song id: {}'.format(audio_file.id))
            # Re-raise unchanged, keeping the original traceback.
            raise

        bar.next()
    bar.finish()
def verify_stats_cb():
    """Verify stats_cb.

    Consumes from the module-level ``topic`` until the statistics callback
    reports a positive ``app_offset`` for partition 0 of that topic (it
    then sets the module-level ``good_stats_cb_result``), or until
    ``max_msgcnt`` messages have been consumed.

    :raises Exception: when no message arrives within 20 seconds
    :raises confluent_kafka.KafkaException: on any consumer error other
        than reaching the end of a partition
    """

    def stats_cb(stats_json_str):
        global good_stats_cb_result
        stats_json = json.loads(stats_json_str)
        if topic in stats_json['topics']:
            app_offset = stats_json['topics'][topic]['partitions']['0']['app_offset']
            if app_offset > 0:
                print("# app_offset stats for topic %s partition 0: %d" %
                      (topic, app_offset))
                good_stats_cb_result = True

    conf = {'bootstrap.servers': bootstrap_servers,
            'group.id': uuid.uuid1(),
            'session.timeout.ms': 6000,
            'error_cb': error_cb,
            'stats_cb': stats_cb,
            'statistics.interval.ms': 200,
            'default.topic.config': {
                'auto.offset.reset': 'earliest'
            }}

    c = confluent_kafka.Consumer(**conf)

    # The consumer is closed in the finally clause so that it is also
    # released when polling stalls or fails.
    try:
        c.subscribe([topic])

        max_msgcnt = 1000000
        bytecnt = 0
        msgcnt = 0

        print('Will now consume %d messages' % max_msgcnt)

        if with_progress:
            bar = Bar('Consuming', max=max_msgcnt,
                      suffix='%(index)d/%(max)d [%(eta_td)s]')
        else:
            bar = None

        while not good_stats_cb_result:
            # Consume until EOF or error

            msg = c.poll(timeout=20.0)
            if msg is None:
                raise Exception('Stalled at %d/%d message, no new messages for 20s' %
                                (msgcnt, max_msgcnt))

            if msg.error():
                if msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF:
                    # Reached EOF for a partition, ignore.
                    continue
                else:
                    raise confluent_kafka.KafkaException(msg.error())

            bytecnt += len(msg)
            msgcnt += 1

            if bar is not None and (msgcnt % 10000) == 0:
                bar.next(n=10000)

            if msgcnt == 1:
                # Throughput is measured from the first received message.
                t_first_msg = time.time()
            if msgcnt >= max_msgcnt:
                break

        if bar is not None:
            bar.finish()

        if msgcnt > 0:
            t_spent = time.time() - t_first_msg
            print('%d messages (%.2fMb) consumed in %.3fs: %d msgs/s, %.2f Mb/s' %
                  (msgcnt, bytecnt / (1024*1024), t_spent, msgcnt / t_spent,
                   (bytecnt / t_spent) / (1024*1024)))
    finally:
        print('closing consumer')
        c.close()
def import_syllables(conn):
    """
    Import syllables from the source database as ``Segment`` rows.

    For every syllable row, the elements of the same song that lie entirely
    inside the syllable's time range are fetched; the syllable is stored
    with the start time of the first such element and the end time computed
    by ``utils.get_syllable_end_time``. Existing segments of the affected
    audio files are deleted before the new ones are saved.

    :param conn: the database connection
    :return: None
    """
    cur = conn.cursor()
    el_cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    # Import syllables for all songs
    cur.execute('SELECT sg.name, s.starttime, s.endtime, w.songid FROM syllable s '
                'JOIN wavs w ON s.songid=w.songid '
                'JOIN songdata sg ON w.songid=sg.id ORDER BY w.filename, s.starttime')
    song_syllable_rows = cur.fetchall()
    songs_2_syllables = {}

    # Song #79 PKI_2017_02_25_WHW028_01_M.EX..PipeClicksGrowlcough.wav has more than one syllable at position 1124:1136.
    # Db Syllable #2924
    for song_name, syl_starttime, syl_endtime, song_id in song_syllable_rows:
        # Bound parameters instead of str.format: the driver does the quoting.
        el_cur.execute('select starttime, timelength from element where songid=%s and starttime >= %s '
                       'and (starttime + timelength) <= %s order by starttime',
                       (song_id, syl_starttime, syl_endtime))
        el_rows = el_cur.fetchall()
        if len(el_rows) == 0:
            # NOTE(review): the query looks for elements, yet the message says "syllable".
            warning('Syllable with starttime={} endtime={} of song: "{}" doesn\'t enclose any syllable.'
                    .format(syl_starttime, syl_endtime, song_name))
            continue

        real_syl_starttime = el_rows[0]['starttime']
        real_syl_endtime = utils.get_syllable_end_time(el_rows)

        # Always look the list up by song name; the previous code appended to
        # whichever list had been created last when the name was already known.
        songs_2_syllables.setdefault(song_name, []).append(
            (real_syl_starttime, real_syl_endtime))

    # delete all existing manual segmentation:
    Segment.objects.filter(audio_file__name__in=list(songs_2_syllables)).delete()

    bar = Bar('Importing syllables ...', max=len(songs_2_syllables))
    for song, syllables in songs_2_syllables.items():
        audio_file = AudioFile.objects.filter(name=song).first()
        if audio_file is None:
            warning('File {} has not been imported. Please run import_luscinia_songs again.'
                    ' Ignore for now'.format(song))
            # Skipped songs still count towards the bar's maximum.
            bar.next()
            continue

        for start_time, end_time in syllables:
            segment = Segment()
            segment.start_time_ms = start_time
            segment.end_time_ms = end_time
            segment.audio_file = audio_file
            # Two saves: the id only exists after the first one.
            segment.save()
            segment.tid = segment.id
            segment.save()

        bar.next()
    bar.finish()
def query_to_tables(query, results_limit, output_path, result_order=None, input_csv=None):
    """
    Search Docket Alarm and write the matching dockets to four CSV files.

    Two modes are supported:

    * ``input_csv`` is None: ``query`` is searched directly, limited to
      ``results_limit`` results (converted with ``int``).
    * ``input_csv`` is given: the CSV is read with pandas and one search
      (``limit=1``) is run per row, using the second column as the docket
      number and the third column as the court.

    After the user confirms, the docket for every search result is fetched
    and appended to the module-level dataframes ``docketInformation``,
    ``docketEntries``, ``parties`` and ``attorneysAndFirms``, which are then
    saved as CSV files in a new timestamped folder under ``output_path``.

    :param query: search query string (ignored when ``input_csv`` is given)
    :param results_limit: maximum number of results, as a string or int
    :param output_path: directory in which the output folder is created
    :param result_order: optional result ordering passed to the search
    :param input_csv: optional path to a CSV listing the dockets to look up
    :return: None
    """
    # We convert the amount of results the user wants to an integer so we can work with the number.
    if input_csv is None:
        results_limit = int(results_limit)

    def append_rows(frame, rows):
        """Return ``frame`` with ``rows`` (a list of dicts) added at the end."""
        if not rows:
            return frame
        # DataFrame.append was removed in pandas 2.0; one concat per docket
        # also avoids copying the whole frame for every single row.
        return pd.concat([frame, pd.DataFrame(rows)], ignore_index=True)

    def case_title(result, docket):
        """Title from the docket info, falling back to the search result."""
        return docket.get('info', {}).get('title', result.get("title", None))

    def fill_docketInformation(result, docket):
        """
        This nested function populates the docketInformation dataframe
        with one row per key of the docket's info dictionary.
        """
        global docketInformation
        if 'info' not in docket:
            return
        info = docket['info']
        rows = [
            {
                'Docket Number': result['docket'],
                'Court Name': result['court'],
                'Case Title': info.get('title', result.get("title", None)),
                'Case Info Field': key,
                'Case Info Values': info[key],
            }
            for key in info
        ]
        docketInformation = append_rows(docketInformation, rows)

    def fill_docketEntries(result, docket):
        """
        This nested function populates the docketEntries dataframe
        with one row per entry of the docket report.
        """
        global docketEntries
        if 'docket_report' not in docket:
            print(docket)
            return
        title = case_title(result, docket)
        rows = [
            {
                'Docket Number': result['docket'],
                'Court Name': result['court'],
                'Case Title': title,
                'Docket Entry Date': document.get('entry_date', None),
                'Docket Entry Numbers': document.get('number', None),
                'Docket Entry Contents': removehtml(document.get('contents', None)),
            }
            for document in docket['docket_report']
        ]
        docketEntries = append_rows(docketEntries, rows)

    def fill_parties(result, docket):
        """
        This nested function populates the parties dataframe
        with one row per party.
        """
        global parties
        # The parties key is not always present in our response.
        if 'parties' not in docket:
            print(docket)
            return
        title = case_title(result, docket)
        rows = [
            {
                'Docket Number': result.get('docket', None),
                'Court Name': result.get('court', None),
                'Case Title': title,
                'Party Name': party.get('name_normalized', party.get('name')),
                'Party Type': party.get('type', None),
            }
            for party in docket['parties']
        ]
        parties = append_rows(parties, rows)

    def fill_attorneysAndFirms(result, docket):
        """
        This nested function populates the attorneysAndFirms dataframe
        with one row per counsel of every party.
        """
        global attorneysAndFirms
        # The parties key is not always present in our response.
        if 'parties' not in docket:
            return
        rows = []
        for party in docket['parties']:
            # A party without counsel is skipped; the remaining parties are
            # still processed (previously the whole function returned here).
            if 'counsel' not in party:
                continue
            for counsel in party['counsel']:
                rows.append({
                    'Docket Number': result.get('docket', None),
                    'Court Name': result.get('court', None),
                    'Attorney Name': counsel.get("name", None),
                    'Attorney Firm': counsel.get("firm", None),
                    'Attorney Email': counsel.get("email", None),
                    'Attorney Phone': counsel.get("phone", None),
                })
        attorneysAndFirms = append_rows(attorneysAndFirms, rows)

    if input_csv is not None:
        try:
            # We try to open the csv as a pandas dataframe.
            df = pd.read_csv(input_csv)
        except Exception as e:
            # We show the error, wait for the user to acknowledge it and stop:
            # without the dataframe there is nothing to search for.
            print(f"{e}")
            input()
            return

        user = login.Credentials()
        searchResults = []
        searching_from_csv_bar = Bar("Reading CSV, Querying Docket Alarm...", max=df.shape[0])
        for _, row in df.iterrows():
            # Columns are read by position: name, docket number, court.
            caseNo = row.iloc[1]
            caseCourt = row.iloc[2]
            csv_query = f"is:docket court:({caseCourt}) docket:({caseNo})"
            searchResults += user_tools.search_docket_alarm(
                (user.username, user.password),
                csv_query,
                limit=1,
                result_order=result_order)
            searching_from_csv_bar.next()
        searching_from_csv_bar.finish()
    else:
        # First we let the user know to wait, so they don't press any buttons that get
        # entered as the input they will be prompted for when this is done loading.
        print("\n")
        print("Querying, please wait...")
        # We create our user object to log in.
        user = login.Credentials()
        # We run our search, using the query, the number of results, and the order
        # that the user specified in the menu.
        searchResults = user_tools.search_docket_alarm(
            (user.username, user.password),
            query,
            limit=results_limit,
            result_order=result_order)
        searchResults = searchResults[0:results_limit]

    # We let the user know how many results were returned for their search and ask them to confirm to proceed.
    print(
        f"\nThis search query resulted in {len(searchResults)} results. Proceed? [Y/n]"
    )
    user_proceed_choice = input().lower()

    if user_proceed_choice == "n":
        # We do not proceed. The user is returned to the menu, and nothing is
        # generated once the menu hands control back.
        menus.spreadsheet_generator_menu()
        return
    if user_proceed_choice != "y":
        # Anything other than y or n is treated as invalid.
        print("Invalid response. Returning to menu.")
        # We pause the script until they press enter, so we know they're aware of what's happening.
        input()
        menus.spreadsheet_generator_menu()
        return

    # The user answered Y (yes): we clear the menu and display ascii art in red.
    menus.clear()
    print(Fore.RED + menus.msg2)

    # The progress bar tracks one step per search result.
    bar = Bar('Generating CSVs', max=len(searchResults))

    # The search results that are returned are a list of dictionaries.
    for result in searchResults:
        # We pull the docket for every result, specifying whether cached data
        # may be used and what the client matter is.
        docket = user_tools.get_docket(
            user.authenticate(),
            result['docket'],
            result['court'],
            cached=global_variables.IS_CACHED,
            client_matter=global_variables.CLIENT_MATTER)

        # Each filler appends the rows for this docket to its module-level dataframe.
        fill_docketInformation(result, docket)
        fill_docketEntries(result, docket)
        fill_parties(result, docket)
        fill_attorneysAndFirms(result, docket)

        bar.next()

    # The current date and time make the output folder name unique per run.
    timeNow = datetime.datetime.now().strftime("%I%M%p %B %d %Y")

    # We use the cleanhtml function to remove any characters that are not allowed
    # in file or folder names.
    if input_csv is None:
        containing_folder_name = f"{cleanhtml(query)} - {timeNow}"
    else:
        containing_folder_name = f"{timeNow}"

    output_directory = os.path.join(output_path, containing_folder_name)
    os.makedirs(output_directory, exist_ok=True)

    docketInformation_outputFile = os.path.join(output_directory, "docketInformation.csv")
    docketEntries_outputFile = os.path.join(output_directory, "docketEntries.csv")
    parties_outputFile = os.path.join(output_directory, "parties.csv")
    attorneysAndFirms_outputFile = os.path.join(output_directory, "attorneysAndFirms.csv")

    # index=False specifies that we do not want to generate a numerical index column.
    docketInformation.to_csv(docketInformation_outputFile, index=False)
    docketEntries.to_csv(docketEntries_outputFile, index=False)
    parties.to_csv(parties_outputFile, index=False)
    attorneysAndFirms.to_csv(attorneysAndFirms_outputFile, index=False)

    # We set the progress bar to its completed state.
    bar.finish()
def formatOTUtableData(OTU_table, max_level=14, tax_reassign_list=None):
    '''Read in and format an imported raw ASV table by adding taxonomy data.

    Parameters
    ----------
    OTU_table : pandas.DataFrame
        This is the raw imported OTU (or ASV) table with
            index: OTU IDs
            header: a list of sample names followed by 'taxonomy' at the end
    max_level : int (optional)
        This is the maximum taxonomic level present in the dataset.
        The default is 14 (i.e., 'D_14__').
    tax_reassign_list : dict (optional)
        Mapping of taxonomic names in the dataset to the values they should
        be reassigned. The default is None (no reassignment).

    Returns
    -------
    data : pandas.DataFrame
        Formatted copy of the input. New columns 'L1', 'L2', ... hold the
        taxonomic breakdown.
    samples : list
        List of samples in the dataset.
    '''
    OTU_table = OTU_table.copy()

    # Every column except the trailing 'taxonomy' one is a sample.
    samples = list(OTU_table.columns)[0:-1]

    # Reclassify any values with assignments in the tax_reassign_list
    if tax_reassign_list:
        for val in list(tax_reassign_list):
            OTU_table.loc[OTU_table['taxonomy'] == val,
                          'taxonomy'] = tax_reassign_list[val]

    # Strip the 'D_<n>__' level prefixes so the taxonomy reads better.
    print('Formatting taxonomy...')
    for i in range(max_level + 1):
        delstr = 'D_' + str(i) + '__'
        # regex=False: the prefix is a literal, whatever the pandas default is.
        OTU_table['taxonomy'] = OTU_table['taxonomy'].str.replace(
            delstr, '', regex=False)

    # Break taxmap into levels
    taxlist = getUnique(OTU_table['taxonomy'])
    bar = Bar('', max=len(taxlist))
    for value in taxlist:
        # Get list of levels
        if '; __' in value:
            splitlist = value.split('; __')
        elif '; ' in value:
            splitlist = value.split('; ')
        else:
            splitlist = [value]

        # Fix last level if needed: drop a trailing ';' or an empty last level.
        if splitlist[-1]:
            if splitlist[-1][-1] == ';':
                splitlist[-1] = splitlist[-1][0:-1]
        else:
            splitlist = splitlist[0:-1]

        # The row mask is the same for every level of this taxonomy string.
        rows = OTU_table['taxonomy'] == value
        for L in range(1, min(len(splitlist) + 1, max_level + 1)):
            OTU_table.loc[rows, 'L' + str(L)] = splitlist[L - 1]
        bar.next()
    bar.finish()

    # Get rid of nans in taxonomy levels
    end_level = len(OTU_table.columns) - len(samples) - 1
    cols = levelCols(end_level)
    OTU_table[cols] = OTU_table[cols].replace(np.nan, '')

    # Convert values to float
    OTU_table[samples] = OTU_table[samples].astype(float)
    return OTU_table, samples
def train(train_loader, model, criterion, optimizer, epoch, use_cuda):
    """Run one pass over ``train_loader`` and update ``model``.

    For every batch the loss is back-propagated (through ``apex.amp`` loss
    scaling when ``args.half`` is set), the optimizer takes a step and,
    unless ``args.linear_quantization`` is set, ``kmeans_update_model`` is
    applied to the model. Running averages of the loss and of the top-1 /
    top-5 precision are shown in the progress bar suffix.

    ``epoch`` is accepted but not used inside this function.

    Returns:
        tuple: ``(average loss, average top-1 precision)`` over the pass.
    """
    model.train()  # switch to train mode

    load_meter = AverageMeter()    # time spent waiting for data
    step_meter = AverageMeter()    # wall time of a full iteration
    loss_meter = AverageMeter()
    top1_meter = AverageMeter()
    top5_meter = AverageMeter()

    suffix_template = (
        '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | '
        'Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'
    )

    tick = time.time()
    n_batches = len(train_loader)
    bar = Bar('Processing', max=n_batches)
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # measure data loading time
        load_meter.update(time.time() - tick)

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        inputs = torch.autograd.Variable(inputs)
        targets = torch.autograd.Variable(targets)

        # forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # record loss and accuracy, weighted by the batch size
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        n_samples = inputs.size(0)
        loss_meter.update(loss.item(), n_samples)
        top1_meter.update(prec1.item(), n_samples)
        top5_meter.update(prec5.item(), n_samples)

        # backward pass
        optimizer.zero_grad()
        if not args.half:
            loss.backward()
        else:
            with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

        # SGD step
        optimizer.step()

        if not args.linear_quantization:
            kmeans_update_model(model, quantizable_idx, centroid_label_dict,
                                free_high_bit=args.free_high_bit)

        # measure elapsed time
        step_meter.update(time.time() - tick)
        tick = time.time()

        # refresh the progress line on every batch
        bar.suffix = suffix_template.format(
            batch=batch_idx + 1,
            size=n_batches,
            data=load_meter.val,
            bt=step_meter.val,
            total=bar.elapsed_td,
            eta=bar.eta_td,
            loss=loss_meter.avg,
            top1=top1_meter.avg,
            top5=top5_meter.avg,
        )
        bar.next()
    bar.finish()
    return loss_meter.avg, top1_meter.avg
#!/usr/bin/python
# encoding: utf-8
# -*- coding: utf8 -*-
"""
Minimal demo of the ``progress`` package: a 20-step bar, one step per second.

Created by PyCharm.
File:               LinuxBashShellScriptForOps:progressOps.py
User:               Guodong
Create Date:        2016/12/7
Create Time:        0:13
"""
# https://pypi.python.org/pypi/progress/1.2
# pip used
import time

from progress.bar import Bar

TOTAL_STEPS = 20        # number of work items shown on the bar
STEP_SECONDS = 1        # simulated duration of one work item

bar = Bar('Processing', max=TOTAL_STEPS)
for i in range(TOTAL_STEPS):
    # Stand-in for real work.
    time.sleep(STEP_SECONDS)
    bar.next()
bar.finish()
def fastphot(SC_MAP, PSF_MAP, NOISE_MAP, Catalog, nb_process=4):
    """
    Return flux of sources associated to given positions

    The maps are embedded in a larger frame padded by half a PSF on each
    side, the coefficients of a linear system ``A . F = B`` are computed
    per source by ``Coef_i`` in a pool of worker processes, and the system
    is solved for the source fluxes plus one extra unknown that is returned
    as the background level.

    Parameters
    ----------
    SC_MAP : numpy masked array.
        The SCientific MAP.
    PSF_MAP : numpy array
        The Point Spread Function MAP.
    NOISE_MAP : numpy masked array
        The Signal/Noise MAP.
    Catalog : numpy structured and masked array
        The source catalog. It must contain at least the source positions
        ('x_pos', 'y_pos') and an 'ID' field whose mask marks the sources
        to ignore. Its 'flux' and 'dflux' fields are filled in place.
    nb_process : integer
        number of independent cpu(s) used to build A matrix and B vector
        by default we assume nb_process = 4

    Returns
    -------
    Catalog : numpy structured and masked array
        The input catalog, with 'flux' and 'dflux' updated.
    bkg : float
        The background level
    RESIDUAL_MAP : numpy masked array
        The residual map (SC_MAP - MODEL_MAP)
    """
    #
    print('> PHOT')
    #
    # extract some information about maps and sources
    SC_MAP_npix_x, SC_MAP_npix_y = SC_MAP.shape
    PSF_MAP_npix_x, PSF_MAP_npix_y = PSF_MAP.shape
    #
    # Only unmasked catalog entries are fitted
    N_src = len(npy.ma.compressed(Catalog['ID']))
    #
    # SC_MAP and NOISE_MAP have to be immersed in a "full" MAP
    # taking into account a half PSF-size on the edges
    edge_x = PSF_MAP_npix_x // 2
    edge_y = PSF_MAP_npix_y // 2
    x_i = edge_x
    x_f = x_i + SC_MAP_npix_x
    y_i = edge_y
    y_f = y_i + SC_MAP_npix_y
    full_shape = [SC_MAP_npix_x + 2 * edge_x, SC_MAP_npix_y + 2 * edge_y]
    # SC_MAP
    SC_full_MAP = npy.zeros(full_shape)  # create
    SC_full_MAP[x_i:x_f, y_i:y_f] = SC_MAP  # immerse
    # NOISE_MAP
    NOISE_full_MAP = npy.zeros(full_shape)  # create
    NOISE_full_MAP[x_i:x_f, y_i:y_f] = NOISE_MAP  # immerse
    #
    # Mask every pixel without positive noise, which includes the padding
    MASK = (NOISE_full_MAP <= 0.e0)
    #
    # Convert SC_MAP and NOISE_MAP in masked array
    SC_full_MAP = npy.ma.array(SC_full_MAP, mask=MASK)
    NOISE_full_MAP = npy.ma.array(NOISE_full_MAP, mask=MASK)
    #
    # Init B vector and A matrix (the last row/column is the extra unknown)
    B = npy.zeros(N_src + 1)
    A = npy.zeros([N_src + 1, N_src + 1])
    #
    # Build Vectors and Matrix
    t_start = time()
    print(' > Build Vectors and Matrix')
    X_pos = npy.ma.compressed(Catalog['x_pos'])
    Y_pos = npy.ma.compressed(Catalog['y_pos'])
    pool = mp.Pool(processes=nb_process)
    try:
        # Each task returns (i, Bi, Ai_, A_)
        R = [
            pool.apply_async(Coef_i,
                             args=(SC_full_MAP, NOISE_full_MAP, PSF_MAP, X_pos,
                                   Y_pos, si)) for si in range(N_src)
        ]
        # Reformat result, build A and B
        bar = Bar(' >', max=N_src)
        for ri in R:
            bar.next()
            i, B_i, A_row, A_bkg = ri.get()
            B[i] = B_i
            # Ai_ is written to both the row and the column of source i
            A[i, i:N_src] = A_row
            A[i:N_src, i] = A_row
            A[i, N_src] = A_bkg
            A[N_src, i] = A_bkg
        bar.finish()
    finally:
        # The workers were previously never released.
        pool.terminate()
        pool.join()
    # Complete
    B[N_src] = npy.nansum(SC_full_MAP / NOISE_full_MAP**2.)
    A[N_src, N_src] = npy.nansum(NOISE_full_MAP**(-2.))
    #
    # Solve system
    print(' > Solve system')
    F = npy.linalg.solve(A, B)
    print(' > Compute uncertainties')
    dF = npy.diag(npy.linalg.inv(A[:N_src, :N_src]))
    t_end = time()
    #
    # Update flux field in the catalog: the last unknown is subtracted
    # from every source flux.
    Catalog['flux'][~Catalog['ID'].mask] = F[:N_src] - F[N_src]
    Catalog['dflux'][~Catalog['ID'].mask] = npy.sqrt(dF)
    #
    # Build residual MAP
    print(' > Build Residual Map')
    RESIDUAL_MAP = SC_MAP - model_MAP(SC_MAP, PSF_MAP, Catalog)
    #
    analysis_time = t_end - t_start
    m = int(math.floor(analysis_time / 60.))
    s = analysis_time - m * 60
    # Guard the per-source average against an empty catalog.
    time_per_src = analysis_time / float(N_src) if N_src else 0.0
    print(
        ' > %4.4i source(s) analysed in %3.3i min %3.1f sec [%5.3f sec / src]'
        % (N_src, m, s, time_per_src))
    print('> DONE')
    return Catalog, F[N_src], RESIDUAL_MAP
def run_epoch(self, phase, epoch, data_loader):
    """Run one epoch of ``phase`` over ``data_loader``.

    In the ``'train'`` phase the wrapped model is put in training mode and
    the optimizer is stepped for every batch. In any other phase the model
    is put in eval mode (unwrapping ``.module`` when more than one GPU is
    configured) and the CUDA cache is emptied first.

    Args:
        phase: ``'train'`` to optimise, anything else to only evaluate.
        epoch: epoch number, used only in the progress text.
        data_loader: iterable of batch dicts; every entry except ``'meta'``
            is moved to ``opt.device``.

    Returns:
        tuple: ``(ret, results)`` where ``ret`` maps each name in
        ``self.loss_stats`` to its average and ``'time'`` to the elapsed
        minutes, and ``results`` is filled by ``self.save_result`` when
        ``opt.test`` is set.
    """
    model_with_loss = self.model_with_loss
    if phase == 'train':
        model_with_loss.train()
    else:
        if len(self.opt.gpus) > 1:
            model_with_loss = self.model_with_loss.module
        model_with_loss.eval()
        torch.cuda.empty_cache()

    opt = self.opt
    results = {}
    data_time, batch_time = AverageMeter(), AverageMeter()
    avg_loss_stats = {l: AverageMeter() for l in self.loss_stats}
    # A negative opt.num_iters means "use the whole loader".
    num_iters = len(data_loader) if opt.num_iters < 0 else opt.num_iters
    bar = Bar('{}/{}'.format(opt.task, opt.exp_id), max=num_iters)
    end = time.time()
    for iter_id, batch in enumerate(data_loader):
        if iter_id >= num_iters:
            break
        data_time.update(time.time() - end)

        for k in batch:
            if k != 'meta':
                batch[k] = batch[k].to(device=opt.device, non_blocking=True)
        output, loss, loss_stats = model_with_loss(batch)
        loss = loss.mean()
        if phase == 'train':
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        batch_time.update(time.time() - end)
        end = time.time()

        # The suffix is set on this bar instance; assigning Bar.suffix on the
        # class would change the text of every other progress bar as well.
        suffix_parts = ['{phase}: [{0}][{1}/{2}]|Tot: {total:} |ETA: {eta:} '.format(
            epoch, iter_id, num_iters, phase=phase,
            total=bar.elapsed_td, eta=bar.eta_td)]
        for l in avg_loss_stats:
            avg_loss_stats[l].update(
                loss_stats[l].mean().item(), batch['input'].size(0))
            suffix_parts.append('|{} {:.4f} '.format(l, avg_loss_stats[l].avg))
        if not opt.hide_data_time:
            suffix_parts.append('|Data {dt.val:.3f}s({dt.avg:.3f}s) '
                                '|Net {bt.avg:.3f}s'.format(dt=data_time, bt=batch_time))
        bar.suffix = ''.join(suffix_parts)

        if opt.print_iter > 0:
            # Plain log lines every print_iter iterations instead of a live bar.
            if iter_id % opt.print_iter == 0:
                print('{}/{}| {}'.format(opt.task, opt.exp_id, bar.suffix))
        else:
            bar.next()

        if opt.debug > 0:
            self.debug(batch, output, iter_id)

        if opt.test:
            self.save_result(output, batch, results)
        del output, loss, loss_stats

    bar.finish()
    ret = {k: v.avg for k, v in avg_loss_stats.items()}
    ret['time'] = bar.elapsed_td.total_seconds() / 60.
    return ret, results
def SVDC_heatmap_generatorv1(df,
                             period_of_interest,
                             prediction_year=2012,
                             epidemic_classification_dict=None,
                             training_year_window='ALL',
                             t0_vector=None,
                             p_vector=None,
                             classifier='SVM',
                             modes=None,
                             verbose=False):
    '''
    Build a (period length x start date) grid of SVD + SVM classification hits.

    For every pair ``(p, t0)`` the training years are reshaped with
    ``SVDC_reshape_yearly_data_stolerman``, an SVD is taken of all but the
    last column, an RBF SVM is fitted on the selected SVD projections and
    the last column is classified. The grid cell is 1 when the prediction
    equals the class of ``prediction_year``, 0 when it does not, and -1
    when the reshaping returned None.

    - df: dataframe whose index entries expose a ``.year`` attribute.
    - period_of_interest = () #initial and final date that contains the
      period of interest (poi). The poi defines the starting and finishing
      dates for the SVD classifier. e.g. If poi is 01-02-YYYY through
      28-02-YYYY, SVD classifier's heatmap will start on 28-02 of previous
      year and end on 01-02 of the next year
    - prediction_year: year to classify; must be present in ``df.index``.
    - epidemic_classification_dict = dictionary keyed by year,
      e.g. {2001: 1, 2002: 0, 2003: 1}
    - training_year_window: 'ALL' or the number of years before
      ``prediction_year`` to train on.
    - t0_vector, p_vector: start dates and period lengths spanning the grid
      (both required despite the None defaults).
    - classifier: accepted but not used by this function.
    - modes: indices of the SVD modes used as features; defaults to [0].
    - verbose: print progress details.

    Returns the grid as a numpy array of shape (len(p_vector), len(t0_vector)).
    Raises ValueError when ``training_year_window`` is neither 'ALL' nor
    within the number of available years.
    '''
    if modes is None:
        modes = [0]

    #Generate grid based on p and t0 vectors
    distance_grid = np.zeros([len(p_vector), len(t0_vector)])

    years = sorted({timestamp.year for timestamp in df.index})
    years_before_prediction = years.index(prediction_year)

    if training_year_window == 'ALL':
        training_years = years[0:years_before_prediction]
        n_years = years_before_prediction
    elif training_year_window <= years_before_prediction:
        # A window equal to the available years is the same as 'ALL'.
        training_years = years[years_before_prediction -
                               training_year_window:years_before_prediction]
        n_years = training_year_window
    else:
        # Fail here instead of hitting an undefined training_years later.
        raise ValueError(
            "Can't retrieve training window: {0}. Place make sure training window is 'ALL' or an int number within the number of years size"
            .format(training_year_window))

    if verbose:
        print('{0} years detected within dataframe: {1}.'.format(
            len(years), years))
        print('{0} Years before prediction: {1}'.format(
            n_years, training_years))

    # check if t0 dates are within the period of interest
    poi_index = df[period_of_interest[0]:period_of_interest[1]].index
    dates_within_poi = [
        d for d in t0_vector
        if '{0}'.format(prediction_year) + d[4:] in poi_index
    ]
    if len(dates_within_poi) > 0:
        print(
            '{0} dates from t0_vector are inside period_of_interest range: {1}'
            .format(len(dates_within_poi), dates_within_poi))

    #Enter main loop
    print('Initiating heatmap loop.')

    bar = Bar('Processing', max=len(p_vector))
    for i, p in enumerate(p_vector):
        bar.next()
        for j, t0 in enumerate(t0_vector):
            if verbose: print('Reshaping data')
            X = SVDC_reshape_yearly_data_stolerman(df=df, t0=t0, p=p,
                                                   years=training_years,
                                                   upper_bound=period_of_interest[0],
                                                   normalize=True, verbose=False)
            if verbose: print('Reshaping data done')

            if X is None:
                distance_grid[i, j] = -1
                continue

            # Each column of X represents one year of data in the order of
            # training_years. To classify year Y we use Y-1 as the out of
            # sample input and Y-2, Y-3... as the training dataset. As every
            # year is classified from the previous year's data, the label of
            # year Y is assigned to the column of year Y-1.
            X_train = X[:, :-1]
            X_predict = X[:, -1]

            # Kept inside the loop so the dictionary is only consulted when
            # there is data to fit.
            Y_train = np.vstack([
                epidemic_classification_dict[year + 1]
                for year in training_years[:-1]
            ])
            Y_predict = epidemic_classification_dict[prediction_year]

            # Perform svd
            U, sigma, VT = svd(X_train,
                               n_components=3,
                               n_iter=15,
                               random_state=None)
            projections = sigma.reshape([-1, 1]) * VT
            projections = projections.T
            projections = projections[:, modes]

            # Now that we got our projections from SVD we can create the classifier
            mod = svm.SVC(kernel='rbf',
                          gamma=1,
                          C=1,
                          cache_size=400,
                          max_iter=100000)
            if verbose:
                print('Fitting with projections shape {0} and target shape {1}'.
                      format(projections.shape, Y_train.shape))
            mod.fit(projections, Y_train.ravel())
            pred = mod.predict(
                np.matmul(X_predict.reshape([1, -1]), U[:, modes]))
            # One sample was predicted, so pred holds a single label.
            distance_grid[i, j] = (pred[0] == Y_predict)

    bar.finish()
    return distance_grid
# compute for next stop herbie.recompute(mgr.city) herbie.plutocracy() if mgr.verbose: print() # first loop only if herbie.director is False: herbie.recompute(mgr.city) herbie.plutocracy() if not mgr.auto: mode = input("enter 'auto' to disable prompts: ") mgr.auto = True if mode == 'auto' else False # status if mgr.verbose: herbie.status() print() # travel in time and relative dimensions in space if len(herbie.requests) > 0: mgr.advance(herbie.requests) herbie.move(mgr.city, mgr.step) if not mgr.auto: nt = input("next turn? [enter 'auto' to disable prompts] ") mgr.auto = True if nt == 'auto' else False if not mgr.verbose: bar.finish() print('FINISHED! OFF DUTY!\n') herbie.queue(herbie.complete) ### END
def validation(model, val_loader, epoch, writer):
    """Evaluate ``model`` on ``val_loader`` and log the metrics to ``writer``.

    Three outputs of the model are scored: ``outputs[0]`` against ``target``
    with ``args.num_classes`` classes, ``outputs[1]`` against ``hlabel``
    with ``args.hbody_cls`` classes and ``outputs[2]`` against ``flabel``
    with ``args.fbody_cls`` classes. Running pixel accuracy and mean IoU
    are shown in the progress bar; every 50th batch the images, labels and
    predictions are written to ``writer`` as image grids.

    Returns:
        tuple: ``(pixAcc, mIoU)`` of the first output after the last batch.
    """
    model.eval()  # set evaluate mode

    def upsample(logits, size):
        # Bilinear resize of the logits to the label resolution.
        return F.interpolate(input=logits, size=size, mode='bilinear',
                             align_corners=True)

    def to_grid(vis):
        # decode/inv_preprocess give channel-last arrays; make_grid is fed channel-first.
        return torchvision.utils.make_grid(
            torch.from_numpy(vis.transpose(0, 3, 1, 2)))

    def pixel_accuracy(n_correct, n_labeled):
        return 1.0 * n_correct / (np.spacing(1) + n_labeled)

    def mean_iou(confusion):
        return round(np.nanmean(per_class_iu(confusion)) * 100, 2)

    total_correct, total_label = 0, 0
    total_correct_hb, total_label_hb = 0, 0
    total_correct_fb, total_label_fb = 0, 0
    hist = np.zeros((args.num_classes, args.num_classes))
    hist_hb = np.zeros((args.hbody_cls, args.hbody_cls))
    hist_fb = np.zeros((args.fbody_cls, args.fbody_cls))

    suffix_template = ('{} / {} | pixAcc: {pixAcc:.4f}, mIoU: {IoU:.4f} |'
                       'pixAcc_hb: {pixAcc_hb:.4f}, mIoU_hb: {IoU_hb:.4f} |'
                       'pixAcc_fb: {pixAcc_fb:.4f}, mIoU_fb: {IoU_fb:.4f}')

    # Iterate over data.
    bar = Bar('Processing {}'.format('val'), max=len(val_loader))
    bar.check_tty = False
    for idx, batch in enumerate(val_loader):
        image, target, hlabel, flabel, _ = batch
        image = image.cuda()
        target = target.cuda()
        hlabel = hlabel.cuda()
        flabel = flabel.cuda()
        with torch.no_grad():
            label_size = (target.size(1), target.size(2))
            outputs = gather(model(image), 0, dim=0)
            # Only the last entry of each output is scored.
            preds = upsample(outputs[0][-1], label_size)
            preds_hb = upsample(outputs[1][-1], label_size)
            preds_fb = upsample(outputs[2][-1], label_size)

            if idx % 50 == 0:
                step = epoch * len(val_loader) + idx + 1
                img_vis = inv_preprocess(image, num_images=args.save_num)
                label_vis = decode_predictions(target.int(),
                                               num_images=args.save_num,
                                               num_classes=args.num_classes)
                pred_vis = decode_predictions(torch.argmax(preds, dim=1),
                                              num_images=args.save_num,
                                              num_classes=args.num_classes)

                # visual grids
                img_grid = to_grid(img_vis)
                label_grid = to_grid(label_vis)
                pred_grid = to_grid(pred_vis)
                writer.add_image('val_images', img_grid, step)
                writer.add_image('val_labels', label_grid, step)
                writer.add_image('val_preds', pred_grid, step)

            # pixelAcc
            correct, labeled = batch_pix_accuracy(preds.data, target)
            correct_hb, labeled_hb = batch_pix_accuracy(preds_hb.data, hlabel)
            correct_fb, labeled_fb = batch_pix_accuracy(preds_fb.data, flabel)
            # mIoU
            hist += fast_hist(preds, target, args.num_classes)
            hist_hb += fast_hist(preds_hb, hlabel, args.hbody_cls)
            hist_fb += fast_hist(preds_fb, flabel, args.fbody_cls)

            total_correct += correct
            total_label += labeled
            total_correct_hb += correct_hb
            total_label_hb += labeled_hb
            total_correct_fb += correct_fb
            total_label_fb += labeled_fb

            pixAcc = pixel_accuracy(total_correct, total_label)
            IoU = mean_iou(hist)
            pixAcc_hb = pixel_accuracy(total_correct_hb, total_label_hb)
            IoU_hb = mean_iou(hist_hb)
            pixAcc_fb = pixel_accuracy(total_correct_fb, total_label_fb)
            IoU_fb = mean_iou(hist_fb)

            # plot progress
            bar.suffix = suffix_template.format(
                idx + 1, len(val_loader),
                pixAcc=pixAcc, IoU=IoU,
                pixAcc_hb=pixAcc_hb, IoU_hb=IoU_hb,
                pixAcc_fb=pixAcc_fb, IoU_fb=IoU_fb)
            bar.next()

    print('\n per class iou part: {}'.format(per_class_iu(hist) * 100))
    print('per class iou hb: {}'.format(per_class_iu(hist_hb) * 100))
    print('per class iou fb: {}'.format(per_class_iu(hist_fb) * 100))

    mIoU = mean_iou(hist)
    mIoU_hb = mean_iou(hist_hb)
    mIoU_fb = mean_iou(hist_fb)

    # Scalars are logged once per epoch.
    writer.add_scalar('val_pixAcc', pixAcc, epoch)
    writer.add_scalar('val_mIoU', mIoU, epoch)
    writer.add_scalar('val_pixAcc_hb', pixAcc_hb, epoch)
    writer.add_scalar('val_mIoU_hb', mIoU_hb, epoch)
    writer.add_scalar('val_pixAcc_fb', pixAcc_fb, epoch)
    writer.add_scalar('val_mIoU_fb', mIoU_fb, epoch)
    bar.finish()
    return pixAcc, mIoU