def pipeline_onehot(titles, descriptions, tags):
    # Create feature vectors of context and only keep images WITH context
    docs = []
    for i in range(len(titles)):
        docs.append(u'{} {} {}'.format(titles[i], descriptions[i], ' '.join(tags[i])))

    vectorizer = CountVectorizer(min_df=5)
    X = vectorizer.fit_transform(docs)

    bar = Bar('Extracting features...', max=len(docs))
    idx_docs = []
    for idoc, doc in enumerate(docs):
        idxs    = X[idoc].nonzero()[1] + 1
        idxs    = idxs.tolist()
        idx_docs.append(idxs)
        bar.next()
    bar.finish()

    max_len = 500

    bar = Bar('Merging into one matrix...', max=len(idx_docs))
    for i, idx_doc in enumerate(idx_docs):
        features = np.zeros((1, max_len), np.int64)
        vec = np.array(idx_doc[:max_len])
        features[0, :vec.shape[0]] = vec

        if i == 0:
            feat_flatten = csr_matrix(features.flatten())
        else:
            feat_flatten = vstack([feat_flatten, csr_matrix(features.flatten())])
        bar.next()
    bar.finish()

    return feat_flatten, vectorizer
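A minimal usage sketch for the snippet above (the imports cover what pipeline_onehot itself relies on; the toy data is made up):

# Hypothetical driver for pipeline_onehot: each row of the returned sparse
# matrix is a zero-padded, fixed-length (500) list of vocabulary indices.
import numpy as np
from progress.bar import Bar
from scipy.sparse import csr_matrix, vstack
from sklearn.feature_extraction.text import CountVectorizer

titles = ["a red car", "a blue bike"] * 5
descriptions = ["parked on the street"] * 10
tags = [["car", "street"], ["bike"]] * 5

features, vectorizer = pipeline_onehot(titles, descriptions, tags)
print(features.shape)  # (10, 500)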
Example #2
def saveAverageImage(kitti_base, pos_labels, shape, fname, avg_num=None):
    num_images = float(len(pos_labels))
    if avg_num is None:
        avg_num = num_images
    else:
        avg_num = min(avg_num, num_images)

    # avg_img = np.zeros((shape[0],shape[1],3), np.float32)
    avg_img = np.zeros(shape, np.float32)
    progressbar = ProgressBar('Averaging ' + fname, max=len(pos_labels))
    num = 0
    for label in pos_labels:
        if num >= avg_num:
            break
        num += 1
        progressbar.next()
        sample = getCroppedSampleFromLabel(kitti_base, label)
        # sample = np.float32(sample)

        resized = resizeSample(sample, shape, label)

        resized = auto_canny(resized)
        resized = np.float32(resized)

        avg_img = cv2.add(avg_img, resized / float(avg_num))
    progressbar.finish()

    cv2.imwrite(fname, avg_img)
Example #3
def draw_poster(poster_text, textsize, inp):
    '''split out and highlight the words'''
    top_pad = 0.25
    left_pad = 9
    font = ImageFont.truetype("NotCourierSans.otf", textsize) #This font needs to be monospaced!
    im = Image.new("RGBA", (9933, 14043), "black") #A1 Size
    draw = ImageDraw.Draw(im) #Set up sheet to draw on
    
    print('Drawing text')
    bar = Bar('Processing', max=len(poster_text)) #Progress bar to entertain me while I watch this run
    
    for i, text in enumerate(poster_text):
        if "1969-07-21 02:56:48 CDR" in text:
            quote = "1969-07-21 02:56:48 CDR (TRANQ) That's one small step for man, one giant leap for mankind."
            text = text.split(quote)
            width_p1, h1 = draw.textsize(text[0], font=font)
            width_quote, h2 = draw.textsize(quote, font=font)
            draw.text((left_pad, int((i + top_pad) * textsize)), text[0], font=font, fill=(255,255,255,255)) #All text padded left_pad pixels from the left edge
            draw.text((left_pad + width_p1, int((i + top_pad) * textsize)), quote, font=font, fill=(255,0,0,255)) 
            draw.text((left_pad + width_p1 + width_quote, int((i + top_pad) * textsize)), text[1], font=font, fill=(255,255,255,255))
            bar.next()
        else:
            draw.text((left_pad, int((i + top_pad) * textsize)), text, font=font, fill=(255,255,255,255))
            bar.next()
    bar.finish()
    
    print('Saving image!')    
    if inp == 'y':
        bleedx, bleedy = 10004, 14114
        bufferx, buffery = int((bleedx - 9933) / 2), int((bleedy - 14043) / 2)
        bleed_im = Image.new("RGBA", (bleedx, bleedy), "black") #Bleed area for printing
        bleed_im.paste(im, (bufferx, buffery))
        bleed_im.save("output.png", "PNG")
    else:
        im.save("output.png", "PNG")
Example #4
def hydrate(idlist_file="data/example_dataset_tweet_ids.txt"):
    """
    This function reads a file with tweet IDs and then loads them
    through the API into the database. Prepare to wait quite a bit,
    depending on the size of the dataset.
    """
    ids_to_fetch = set()
    for line in open(idlist_file, "r"):
        # Remove newline character through .strip()
        # Convert to int since that's what the database uses
        ids_to_fetch.add(int(line.strip()))
    # Find a list of Tweets that we already have
    ids_in_db = set(t.id for t in database.Tweet.select(database.Tweet.id))
    # Sets have an efficient .difference() method that returns IDs only present
    # in the first set, but not in the second.
    ids_to_fetch = ids_to_fetch.difference(ids_in_db)
    logging.warning(
        "\nLoaded a list of {0} tweet IDs to hydrate".format(len(ids_to_fetch)))

    # Set up a progressbar
    bar = Bar('Fetching tweets', max=len(ids_to_fetch), suffix='%(eta)ds')
    for page in rest.fetch_tweet_list(ids_to_fetch):
        bar.next(len(page))
        for tweet in page:
            database.create_tweet_from_dict(tweet)
    bar.finish()
    logging.warning("Done hydrating!")
def main():
    infile = raw_input('Input file name: ')
    if os.path.exists(infile):
        print '\n[!] Loading PCAP file. Please wait, it might take a while...'
        ips = sorted(set(p[IP].src for p in PcapReader(infile) if IP in p))

        total = len(ips)
        print '[!] Total number of IP addresses: %d\n' % total

        bar = Bar('Processing', max=total)
        for ip in ips:
            get_data(ip)
            bar.next()
        bar.finish()

        headers = ['IP', 'OWNER','COUNTRY', 'ORGANIZATION','SERVER','DESCRIPTION']
        print '\n\n'
        print tabulate(table,headers,tablefmt='grid')
        if exceptions:
            print '\nExceptions:'
            for e in exceptions:
                print '*\t%s' % e
            print '\n\n[!] Done.\n\n'
    else:
        print '[!] Cannot find file "%s"\n\tExiting...' % infile
        sys.exit()
Example #6
def read_and_gen(lyric_path,file_path):
    """
    read file and generate mp3 sound file
    :param file_path:
    :return:
    """

    #remove original before adding new content in it
    if os.path.exists(file_path):
        os.remove(file_path)

    with open(lyric_path, encoding="utf-8") as f:
        lines = f.readlines()
    bar = Bar('Processing', max=len(lines))
    for line in lines:
        if is_alphabet(line[0]):
            #line should be spoken in en
            speak = gtts_extends(line,lang='en')
            speak.sequence_save(file_path)

        if is_chinese(line[0]):
            speak = gtts_extends(line, lang='zh')
            speak.sequence_save(file_path)
        bar.next()
    bar.finish()
    print("transform success!")
def pipeline_pos(titles, descriptions, tags):
    def preprocess(inpt):
        return inpt

    # Create feature vectors of context and only keep images WITH context
    bar = Bar('Extracting features...', max=len(titles))
    pos_collection = []
    for i in range(len(titles)):
        # Stem words and remove stopwords for title...
        context = []
        title = preprocess(titles[i].split(' '))
        if title:
            context.append(title)
        # ... description (for each sentence) ...
        for desc in sent_tokenize(descriptions[i]):
            desc = preprocess(desc.split(' '))
            if desc:
                context.append(desc)
        # ... and tags
        ts = preprocess(tags[i])
        if ts:
            context.append(ts)
        
        pos = nltk.pos_tag_sents(context)
        pos = list(itertools.chain(*pos))
        pos_collection.append(pos)
        bar.next()
    bar.finish()

    return pos_collection
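A small sketch of what one pos_collection entry looks like (assumes the standard NLTK tagger models have been downloaded via nltk.download):

import itertools
import nltk

context = [['A', 'red', 'car'], ['Parked', 'on', 'the', 'street']]
pos = nltk.pos_tag_sents(context)   # one (word, tag) list per sentence
pos = list(itertools.chain(*pos))   # flattened, as in pipeline_pos
print(pos)  # e.g. [('A', 'DT'), ('red', 'JJ'), ('car', 'NN'), ...]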
Example #8
 def parse(self, dataset):
     """
     :type dataset: nala.structures.data.Dataset
     """
     outer_bar = Bar('Processing [SpaCy]', max=len(list(dataset.parts())))
     for part in dataset.parts():
         sentences = part.get_sentence_string_array()
         for index, sentence in enumerate(sentences):
             doc = self.nlp(sentence)
             for token in doc:
                 tok = part.sentences[index][token.i]
                 tok.features = {
                                 'id': token.i,
                                 'pos': token.tag_,
                                 'dep': token.dep_,
                                 'lemma': token.lemma_,
                                 'prob': token.prob,
                                 'is_punct': token.is_punct,
                                 'is_stop': token.is_stop,
                                 'cluster': token.cluster,
                                 'dependency_from': None,
                                 'dependency_to': [],
                                 'is_root': False,
                                }
                 part.tokens.append(tok)
             for tok in doc:
                 self._dependency_path(tok, index, part)
         part.percolate_tokens_to_entities()
         part.calculate_token_scores()
         part.set_head_tokens()
         outer_bar.next()
     outer_bar.finish()
     if self.constituency_parser == True:
         self.parser.parse(dataset)
class Closest(object):
    data = pd.DataFrame()
    cols = []
    bar = None

    def __init__(self, df, cols, size):
        self.data = df
        self.cols = cols
        self.bar = Bar(message="Compressing Time", max=size,
                       suffix="%(percent)d%% (%(index)d/%(max)d) ETA %(eta_td)s")
        return

    def __call__(self, row):
        self.bar.next()
        found = self.data[(self.data.restaurant_id == row.restaurant_id) & (self.data.date <= row.date)]
        if found.shape[0] == 0:
            # FIXME Do something smarter than averaging?
            found = self.data[(self.data.restaurant_id == row.restaurant_id)][self.cols].mean()
        else:
            found = found[self.cols].sum()
        # FIXME Sometimes NaNs appear if I am missing the restaurant ID.  What to do?
        found.fillna(0, inplace=True)
        row[self.cols] = found
        return row

    def __del__(self):
        self.bar.finish()
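Closest is written as a row-wise callable, so it is presumably meant to be passed to DataFrame.apply(axis=1). A hypothetical usage sketch (column names and data are illustrative only):

import pandas as pd

history = pd.DataFrame({
    'restaurant_id': [1, 1, 2],
    'date': pd.to_datetime(['2015-01-01', '2015-02-01', '2015-01-15']),
    'violations': [3, 1, 0],
})
inspections = pd.DataFrame({
    'restaurant_id': [1, 2],
    'date': pd.to_datetime(['2015-03-01', '2015-01-10']),
    'violations': [0, 0],
})

closest = Closest(history, cols=['violations'], size=len(inspections))
result = inspections.apply(closest, axis=1)  # each row gets its summed (or averaged) history values
print(result)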
Example #10
def keyadd(name):
    bar = Bar('Processing', max=5)
    try:
        bar.next()
        nova('keypair-add', '--pub-key', '~/.ssh/id_rsa.pub', '%s'
             % name)
    except:

        # print "Key add error on %s" % name

        bar.next()
        try:
            bar.next()

            # print "Tryig to delete key"

            result = nova('keypair-delete', '%s' % name)

            # print result
            # print "Tryig to add key"

            bar.next()
            results = nova('keypair-add', '--pub-key',
                           '~/.ssh/id_rsa.pub', '%s' % name)
        except:

            # print result

            print '''
Key deletion error on %s
''' % name
    bar.next()
    bar.finish()
    result = nova('keypair-list')
    print result
Example #11
def torcURL(address, filename):
	print('cURL on ' + address + ' to ' + filename + '\n')
	bar = Bar('Running', max=100)
	for i in range(100):
		output = io.BytesIO()
		torcURL = pycurl.Curl()
		torcURL.setopt(pycurl.URL, address)
		torcURL.setopt(pycurl.PROXY, '127.0.0.1')
		torcURL.setopt(pycurl.PROXYPORT, SOCKS_PORT)
		torcURL.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5_HOSTNAME)
		torcURL.setopt(pycurl.WRITEFUNCTION, output.write)
		bar.next()
	bar.finish()

	try:
		torcURL.perform()
		with open(filename, 'wb') as fp:
			fp.write(output.getvalue())
		return output.getvalue()
	except KeyboardInterrupt:
		raise KeyboardInterrupt
	except pycurl.error as e:
		return "Unable to reach %s (%s)" % (address, e)
	except Exception as e:
		UnknownError()
def gradient_descent(X, Y, iter, alpha):
    (rows, cols) = X.shape
    Xt = X.T
    w = numpy.zeros((len(Xt), 1))
    print w.shape
    bar = Bar('iterations', max=iter)
    for i in range(0, iter):
        pw = w
        dw = 2 * (matrix.dot(matrix.dot(Xt, X), w) - matrix.dot(Xt, Y))  # gradient of ||Xw - Y||^2

        # if (True):
        #     # print "alpha " + str(alpha)
        #     # print "E is " + str(dw.T.dot(dw).sum())
        #     # print dw
        #     print w
        w = w - alpha*dw/rows
        diff =numpy.absolute(w-pw).sum()
        print "Diff is %f " % diff
        if (diff < 0.000001):
            bar.finish()
            return w

        # raw_input()
        bar.next()
    bar.finish()
    return w
def main(args):
	d = json.load(open(args.c, 'r'))

	np.random.seed(1234)

	im2id  = {}
	id2cap = {}

	print 'img 2 id....'
	for im in d['images']:
		im2id[im['file_name']] = im['id']

	bar = Bar('id 2 cap...', max=len(d['annotations']))
	for ann in d['annotations']:
		cap = nltk.word_tokenize(ann['caption'])
		cap = ' '.join(cap).lower()
		if ann['image_id'] in id2cap:
			id2cap[ann['image_id']].append(cap)
		else:
			id2cap[ann['image_id']] = [cap]
		bar.next()
	bar.finish()

	with open(args.s, 'r') as f:
		images = f.read().split()

	refs = []
	for im in images:
		refs.append('<>'.join(id2cap[im2id[im]]))

	with open(args.saveto, 'w') as f:
		print >>f, '\n'.join(refs)
Example #14
def evaluate(train_file_path, test_num, tagger, output_file_path):
    sents = parse_train_data(train_file_path)
    test_start = len(sents) - test_num
    test_data = sents[test_start:]
    train_data = sents[:test_start]
    print 'Training with {0} sentences'.format(len(train_data))
    tagger.train(train_data)
    output = open(output_file_path, 'w')
    correct = 0
    total = 0
    bar = Bar('Testing with {0} sentences'.format(len(test_data)), max=len(test_data))
    for s in test_data:
        tagged = tagger.tag(remove_tags(s))
        # evaluate
        correct += evaluate_sentence(s, tagged)
        total += len(tagged)
        # write
        words = []
        for t in tagged:
            words.append(t[0] + '_' + t[1])
        output.write('\t'.join(words) + '\n')
        bar.next()
    bar.finish()
    output.close()
    return correct / float(total) * 100
def get_list(filename):
	"""
	Creates an array of objects out of 
	input training file
	==================================
	Returns:
		* array of objects where each
		object corresponds to a document
	==================================
	"""

	fo = open(filename)
	lines = fo.readlines()
	fo.close()
	total = len(lines)
	obj_arr = []
	vec_arr = []
	bar = Bar("Processing", max=total, suffix='%(percent)d%% | %(index)d of %(max)d | %(eta)d seconds remaining.')
	num = 0
	for each in lines:
		send_obj = files(each.split('\n')[0].split('\t'))
		send_obj.set_word_count(5)
		send_obj.set_pos_features()
		send_obj.set_punctuation_features()
		send_obj.set_vectors()
		obj_arr.append(send_obj)
		bar.next()
	bar.finish()
	return obj_arr
Example #16
 def set_image_objects(self):
     landsat8 = "(acquisitionDate >= date'2013-01-01' AND acquisitionDate <= date'2016-12-31') AND (dayOfYear >=1 AND dayOfYear <= 366) AND (sensor = 'OLI') AND (cloudCover <= 20)"
     landsat7 = "(acquisitionDate >= date'2003-01-01' AND acquisitionDate <= date'2016-12-31') AND (dayOfYear >=1 AND dayOfYear <= 366) AND (sensor = 'ETM_SLC_OFF') AND (cloudCover <= 20)"
     landsat4_5 = "(acquisitionDate >= date'1982-01-01' AND acquisitionDate <= date'2011-12-31') AND (dayOfYear >=1 AND dayOfYear <= 366) AND (sensor = 'TM') AND (cloudCover <= 20)"
     landsat1_5 = "(acquisitionDate >= date'1972-01-01' AND acquisitionDate <= date'2013-12-31') AND (dayOfYear >=1 AND dayOfYear <= 366) AND (sensor = 'MSS') AND (cloudCover <= 20)"
     queries_name = ["landsat8","landsat7","landsat4_5","landsat1_5"]
     queries = [landsat8,landsat7,landsat4_5,landsat1_5]
     # query = self._query(parms)
     obj = []
     count = 0
     for q in queries:
         parms = {
             "f":"json",
             "where":q,
             "geometry":self.bounding_box["geometry"],
             "returnGeometry":"false",
             "spatialRel":"esriSpatialRelIntersects",
             "geometryType":"esriGeometryEnvelope",
             "inSR":self.bounding_box["geometry"]["spatialReference"]["wkid"],
             "outSR":self.bounding_box["geometry"]["spatialReference"]["wkid"],
             "outFields":"*",
             "orderByFields":"dayOfYear"
         }
         query = self._query(parms)
         bar = Bar("Requesting data: "+queries_name[count] , max=len(queries))
         for i in query["features"]:
             obj.append(i)
             bar.next()
         bar.finish()
         count = count + 1
     return obj
def tokenize_proteins(data, msg='Processing proteins'):
    """Distribute all poses into either decoys list or actives OrderedDict.
    Poses placed into the actives OrderedDict are further organized into
    sublists for each ligand.

    args:
        @data list of string lines containing pose data
        @msg string message to display in progress bar
    returns:
        @actives OrderedDict of all active poses gathered from data
        @decoys list of all decoy poses gathered from data
    """

    actives = OrderedDict()
    decoys  = list()
    bar = Bar(msg, max=len(data))

    for i, line in enumerate(data):
        bar.next()
        pose = posedict(line)  # Token -> List
        if pose['label'] == 1:  # Pose -> Actives
            pose['id'] = pose['ligand'] + '-' + str(i)
            actives.setdefault(pose['ligand'], []).append(pose)
        else:  # Pose -> Decoys
            decoys.append(pose)
    bar.finish()
    print ""

    return actives, decoys
Example #18
def main(argv):
  args = argparser.parse_args()

  print >> sys.stderr, '# Start: Keyword Data: %s, %s, %s, %s' % (args.cc, args.week, args.pages, datetime.datetime.now().time().isoformat())

  ga, gsc = initialize_service(argv, "analytics"), initialize_service(argv, "webmasters")

  print '"%s"\t"%s"\t"%s"\t"%s"\t"%s"\t"%s"\t"%s"\t"%s"\t"%s"\t"%s"' % ("cc", "website", "url", "date", "keyword", "impressions", "clicks", "ctr", "position", "sessions (week)")
  
  bar = Bar('Processing', max=args.pages, suffix ='%(percent).1f%% - %(eta)ds')
  
  for website in GA_IDS[args.cc]:

    urls = get_top_landing_pages(ga, args.cc, website, args.week, args.pages)
    for row in urls:

      data = []

      # we switched from http to https between week 3 and 4
      if (args.week <= 4 and args.cc != 'VN') or website != "IPRICE":
        data.extend(get_keyword_data(gsc, args.cc, website, args.week, row[0][1:], "http"))
      if (args.week >=3 or args.cc == 'VN') and website == "IPRICE":
        data.extend(get_keyword_data(gsc, args.cc, website, args.week, row[0][1:], "https"))

      output(args.cc, website, row[0], row[1], data)

      bar.next()
    bar.finish()
      
  print >> sys.stderr, '# End: Keyword Data: %s, %s, %s, %s' % (args.cc, args.week, args.pages, datetime.datetime.now().time().isoformat())
Example #19
def clean():
    """kills all the instances with prefix prefix-* and in error state"""

    global servers
    global bar
    try:
        list = []
        names = []
        _refresh_servers()
        if len(servers) == 0:
            print 'Found 0 instances to kill'
        else:
            for index in servers:
                server = servers[index]
                if server['status'] == 'ERROR':
                    list.append(index)
                    names.append(server['name'])
            print 'Starting parallel Delete'
            bar = Bar('Deleting', max=len(servers) + 3)
            bar.next()
            pool = Pool(processes=maxparallel)
            bar.next()
            result = pool.map(_del_server, list)
            bar.next()
            bar.finish()
    except:
        print 'Found 0 instances with status error to kill'
    menu()
Example #20
def kill():
    """kills all the instances with prefix prefix-*"""

    global servers
    global bar
    try:
        list = []
        names = []
        _refresh_servers()
        if len(servers) == 0:
            print 'Found 0 instances to kill'
        else:
            for index in servers:
                server = servers[index]
                print 'Found %(name)s to kill' % server
                list.append(index)
                names.append(server['name'])
            bar = Bar('Deleting', max=len(servers) + 3)
            bar.next()
            pool = Pool(processes=maxparallel)
            bar.next()
            result = pool.map(_del_server, list)
            bar.next()
            bar.finish()
    except:

        # print e

        print 'Found 0 instances to kill'
    menu()
Example #21
def main(argv):
	args = argparser.parse_args()

	print >> sys.stderr, '# Start: Matching: %s' % (datetime.datetime.now().time().isoformat())

	masterbrain = read(args.masterbrain)
	keywords = read(args.keywords)

	bar = Bar('Processing', max=len(masterbrain), suffix ='%(percent).1f%% - %(eta)ds')

	regex = {}
	for keyword in keywords:
		regex[keyword] = re.compile(r'\b({0})\b'.format(keyword))

	matches = 0
	for string in masterbrain:
		for keyword in keywords:
			if regex[keyword].search(string):
				matches = matches + 1
				print 1, "\t", string, "\t", keyword
				break
		else:
			print 0, "\t", string
		bar.next()

	bar.finish()

	print matches, "/", len(masterbrain)

	print >> sys.stderr, '# End: Matching: %s' % (datetime.datetime.now().time().isoformat())
Example #22
def main(argv):
	args = argparser.parse_args()

	print >> sys.stderr, '# Start: Adwords Data: %s, %s' % (args.cc, datetime.datetime.now().time().isoformat())

	service = initialize_service()
	keywords = read_file(args.file)

	print '"%s"\t"%s"\t"%s"\t"%s"' % ("keyword", "sv (month)", "competition", "cpc ($)")

	bar = Bar('Processing', max=len(keywords), suffix ='%(percent).1f%% - %(eta)ds')
	if args.stats:
		# pagination of 800 items
		kws = keywords
		while len(kws) > 0:
			page = kws[0:PAGE_SIZE]
			kws = kws[PAGE_SIZE:]

			output(query_adwords(service, args.cc, page, "STATS"))

			bar.next(len(page))

	elif args.ideas:
		# pagination of 1 item
		for kw in keywords:
			output(get_keyword_suggestions(service, args.cc, kw, "IDEAS"))

			bar.next()

	bar.finish()
	
	print >> sys.stderr, '# End: Adwords Data: %s, %s' % (args.cc, datetime.datetime.now().time().isoformat())
Example #23
def average_image(pos_region_generator, shape, avg_num=None):
    pos_regions = list(pos_region_generator)

    num_images = float(len(pos_regions))
    if avg_num is None:
        avg_num = num_images
    else:
        avg_num = min(avg_num, num_images)

    window_dims = (shape[1], shape[0])

    # avg_img = np.zeros((shape[0],shape[1],3), np.float32)
    avg_img = np.zeros(shape, np.float32)
    progressbar = ProgressBar('Averaging ', max=avg_num)
    num = 0
    for reg in pos_regions:
        if num >= avg_num:
            break
        num += 1
        progressbar.next()

        resized = reg.load_cropped_resized_sample(window_dims)

        resized = auto_canny(resized)
        resized = np.float32(resized)

        avg_img = cv2.add(avg_img, resized / float(avg_num))
    progressbar.finish()

    return avg_img
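Both averaging snippets above call auto_canny, which is not defined here; a common median-based version (an assumption, not necessarily the authors' helper) looks like this:

import cv2
import numpy as np

def auto_canny(image, sigma=0.33):
    # centre the Canny thresholds on the median pixel intensity
    v = np.median(image)
    lower = int(max(0, (1.0 - sigma) * v))
    upper = int(min(255, (1.0 + sigma) * v))
    return cv2.Canny(image, lower, upper)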
 def getUsers(hubname):
     log = open(HubAnalyzer.logfile, "a")
     print("hub: " + hubname + " ----------------- ", file=log)
     print(time.strftime("%H:%M:%S"), file=log)
     # clean the file to write users to
     url = HubAnalyzer.hubname2link(hubname)
     output_filename = "data/hubs/" + hubname
     # if data is here, do nothing
     if os.path.isfile(output_filename) and not HubAnalyzer.enforce_download_in_presence_of_data:
         print("data is already here, abort this url", file=log)
         return None
     output_file = open(output_filename, "w")
     try:
         last_page_num = int(HubAnalyzer.getLastPageNumber(url))
     except Exception as err:
         print("URL is broken, abort the url", file=log)
         log.flush()
         os.remove(output_filename)
         raise Exception("Cannot analyze the page, please, check the url below: \n" + url)
     # get connection to habrahabr-hub
     suffix = "/subscribers/rating/page"
     userlist_url = url + suffix
     http = urllib3.PoolManager()
     if HubAnalyzer.report_downloading_progress:
         HubAnalyzer.get_hub_description(hubname)
         bar = Bar("Downloading: " + hubname, max=last_page_num, suffix="%(percent)d%%")
     for i in range(1, last_page_num + 1):
         user_page = userlist_url + str(i)
         print(user_page, file=log)
         log.flush()
         try:
             response = http.request("GET", user_page)
         except urllib3.exceptions.HTTPError as err:
             if err.code == 404:
                 print(user_page + " !! 404 !!", file=log)
                 log.flush()
                 output_file.close()
                 os.remove(output_filename)
                 raise ("Hub is not found, please, check the url")
             else:
                 print(user_page + " PARSING ERROR ", file=log)
                 log.flush()
                 output_file.close()
                 os.remove(output_filename)
                 raise Exception("Error: cannot parse the page!")
         html = response.data
         soup = BeautifulSoup(html)
         usersRow = soup.find_all(class_="user ")
         for userRow in usersRow:
             username = userRow.find(class_="username").text
             print(username, file=output_file)
         output_file.flush()
         if HubAnalyzer.report_downloading_progress:
             bar.next()
     # finalize and close everything
     if HubAnalyzer.report_downloading_progress:
         bar.finish()
     output_file.close()
     log.close()
def main():
    attempt = 0
    data = None
    while not data and attempt < 3:
        attempt += 1
        try:
            request = urllib2.Request('http://openweathermap.org/help/city_list.txt')
            response = urllib2.urlopen(request)
            data = response.readlines()
        except:
            print "failed %d times, trying again" % attempt
    if not data:
        print "the program fail, please check your internt and access the program again"
        sys.exit()
    country_initials = str()
    firstline = True
    count = 0
    length = len(data)
    bar = Bar('Processing', max=length)
    for line in data:
        # Appending each line that the country initials are equal to the user input
        if firstline:
            firstline = False
            continue
        count += 1
        country_initials = line[-3] + line[-2]
        elements = line.split()
        city = " ".join(elements[1:-3])
        city = city.strip().replace(" ", "-")
        connection = pymysql.connect(host='localhost',
                                     user='******',
                                     password='******',
                                     db='weather',
                                     cursorclass=pymysql.cursors.SSCursor)
        with connection.cursor() as cursor:
            try:
                cursor.execute("select count(*) from weather.cities where name = %s", city)
                city_exists = cursor.fetchone()[0]
                if city_exists == 1:
                    cursor.execute(
                        "select TIMESTAMPDIFF(minute,(select last_updated from weather.cities where name = %s), now())",
                        city)
                    time_dif = cursor.fetchone()[0]
                    if time_dif > 60:
                        update_attributes(city, country_initials)
                        bar.next()
                    else:
                        bar.next()
                        continue

                else:
                    update_attributes(city, country_initials)
                    bar.next()
            except UnicodeEncodeError:
                pass
        connection.commit()
    bar.finish()
Example #26
def editorial_publish(guides,
                      endpoint,
                      function_class,
                      user_agent,
                      nailgun_bin,
                      content_generator):
    """
    takes care of publishing the editorial content for the guides.
    """

    # init the nailgun thing for ed content generation.
    nailguninit(nailgun_bin,content_generator)


    searches= {}

    pbar = Bar('extracting editorial content for guides:',max=len(guides)+1)
    pbar.start()

    error = False
    for i, guide in enumerate(guides):
        jsonguide = None
        with open(guide,'r') as g:
            jsonguide = json.load(g)

        if not jsonguide:
            logging.error('could not load json from {0}'.format(guide))
            error = True
            continue
        search = cityinfo.cityinfo(jsonguide)
        uri = cityres.cityres(search,endpoint)
        if not uri:
            logging.error(
                    'no dbpedia resource was found for {0}'.format(guide))
            error = True
            continue
        urls = urlinfer.urlinferdef([unquote(uri)])
        if len(urls) < 1:
            logging.error('no wikipedia/wikivoyage urls found/inferred'\
                   ' for resource {0}'.format(uri))
            error = True
            continue
        content = editorial_content(urls,function_class,user_agent)
        if not content:
            logging.error('no editorial content could be'\
                    ' generated for {0}'.format(guide))
            error = True
            continue

        #insert the content into the guide
        jsonsert.jsonsert(content, guide)

        logging.info('editorial content for {0} successfully'\
                ' inserted.'.format(guide))
        pbar.next()

    pbar.finish()
    return error
    def evolve(self, population, cxpb, mutpb, mutfq, ngen, goal):

        # Cheapest classifier.
        clf = LinearRegression(normalize=True)

        # Evaluate fitnesses of starting population.
        fitness_list = map(lambda x: self.evaluate(x, clf), population)

        # Assign fitness values.
        for individual, fitness in zip(population, fitness_list):
            individual.fitness.values = fitness

        best = max(population, key=lambda x: x.fitness.values[0])

        # So that we know things are happening.
        bar = Bar('Evolving', max=ngen)

        # Evolution!
        for gen in xrange(ngen):

            if best.fitness.values[0] > goal:
                break

            # Select the next generation of individuals.
            offspring = []
            offspring.append(best)
            offspring += tools.selTournament(population, len(population)-1, 10)
            offspring = map(self.toolbox.clone, offspring)

            # Apply crossovers.
            for child_a, child_b in zip(offspring[::2], offspring[1::2]):  # Staggered.
                if random.random() < cxpb:
                    self.crossover(child_a, child_b, cxpb)
                    del child_a.fitness.values
                    del child_b.fitness.values

            # Apply mutations.
            for child in offspring:
                if random.random() < mutpb:
                    self.mutate(child, mutfq)
                    del child.fitness.values

            # Reevaluate fitness of changed individuals.
            new_children = [e for e in offspring if not e.fitness.valid]
            fitness_list = map(lambda x: self.evaluate(x, clf), new_children)
            for individual, fitness in zip(new_children, fitness_list):
                individual.fitness.values = fitness

            # Replace old population with new generation.
            best = max(population, key=lambda x: x.fitness.values[0])
            population = offspring

            # Progress!
            bar.next()

        # Done! Return the most fit evolved individual.
        bar.finish()
        return best
def do_epoch(mode, epoch, skipped=0):
    # mode is 'train' or 'test'
    y_true = []
    y_pred = []
    avg_loss = 0.0
    prev_time = time.time()

    batches_per_epoch = dmn.get_batches_per_epoch(mode)

    if mode=="test":
        batches_per_epoch=min(1000,batches_per_epoch)
    bar=Bar('processing',max=batches_per_epoch)
    for i in range(0, batches_per_epoch):
        step_data = dmn.step(i, mode)
        prediction = step_data["prediction"]
        answers = step_data["answers"]
        current_loss = step_data["current_loss"]
        current_skip = (step_data["skipped"] if "skipped" in step_data else 0)
        log = step_data["log"]

        skipped += current_skip

        if current_skip == 0:
            avg_loss += current_loss

            for x in answers:
                y_true.append(x)

            for x in prediction.argmax(axis=1):
                y_pred.append(x)

            # TODO: save the state sometimes
            if (i % args.log_every == 0):
                cur_time = time.time()
                #print ("  %sing: %d.%d / %d \t loss: %.3f \t avg_loss: %.3f \t skipped: %d \t %s \t time: %.2fs" %
                #    (mode, epoch, i * args.batch_size, batches_per_epoch * args.batch_size,
                #     current_loss, avg_loss / (i + 1), skipped, log, cur_time - prev_time))
                prev_time = cur_time

        if np.isnan(current_loss):
            print "==> current loss IS NaN. This should never happen :) "
            exit()
        bar.next()
    bar.finish()

    avg_loss /= batches_per_epoch
    print "\n  %s loss = %.5f" % (mode, avg_loss)
    print "confusion matrix:"
    print metrics.confusion_matrix(y_true, y_pred)

    accuracy = sum([1 if t == p else 0 for t, p in zip(y_true, y_pred)])
    print "accuracy: %.2f percent" % (accuracy * 100.0 / batches_per_epoch / args.batch_size)

    if len(accuracies)>0 and accuracies[-1]>accuracy:
        dmn.lr=dmn.lr*args.learning_rate_decay
    accuracies.append(accuracy)
    return avg_loss, skipped
Example #29
def save_regions(reg_gen, num_regions, window_dims, save_dir):
    progressbar = ProgressBar('Saving regions', max=num_regions)
    index = 0
    for img_region in itertools.islice(reg_gen, 0, num_regions):
        fname = os.path.join(save_dir, '{:06d}.png'.format(index))
        index += 1
        sample = img_region.load_cropped_resized_sample(window_dims)
        cv2.imwrite(fname, sample)
        progressbar.next()
    progressbar.finish()
 def get_stale_files(self, media_files):
     django_models_with_file_fields = self.get_django_models_with_file_fields()
     stale_files = []
     bar = Bar('Analyzing media files', max=len(media_files))
     for media_file in media_files:
         if not self.remove_file_if_not_exists_in_db(media_file, django_models_with_file_fields):
             stale_files.append(media_file)
         bar.next()
     bar.finish()
     return stale_files
Example #31
def get_darwin_dataset(img_dir, train_val):

    json_file = os.path.join(img_dir, train_val, train_val + ".json")
    with open(json_file) as f:
        imgs = json.load(f)

    imgs = imgs[0:10]  # only the first 10 images are used here

    dataset_dicts = []

    bar = Bar('Importing Dataset', max=len(imgs))

    for idx, img in enumerate(imgs):

        record = {}

        filename = os.path.join(img_dir, 'images',
                                img["image"]["original_filename"])
        height, width = cv2.imread(filename).shape[:2]

        record["file_name"] = filename
        record["image_id"] = idx
        record["height"] = height
        record["width"] = width

        annos = img["annotations"]
        objs = []

        for anno in annos:

            poly, bbox = convert_to_rle(anno, height, width)

            #check bounding boxes are healthy
            # test_mask = pycocotools.mask.decode(poly)
            # mask_img = Image.fromarray(test_mask.astype(np.bool)).convert('RGB')
            # draw = ImageDraw.Draw(mask_img)
            # draw.rectangle(bbox, fill=None, outline='red', width=3)
            # mask_img.save('img.jpg')

            obj = {
                "bbox": bbox,
                "bbox_mode": BoxMode.XYXY_ABS,
                "segmentation": poly,
                "category_id":
                0,  # change in the future for more than one category
            }

            objs.append(obj)

        record["annotations"] = objs
        dataset_dicts.append(record)

        # # check masks are healthy
        # test_mask = np.zeros([height, width, len(record["annotations"])], dtype=np.uint8)
        # for idx, obj in enumerate(record["annotations"]):
        #     # decode RLE for all objects, create global mask and save image
        #     test_mask[:,:,idx] = pycocotools.mask.decode(obj['segmentation'])
        # Image.fromarray(np.sum(test_mask, axis=2).astype(np.bool)).save('masks/' + img["image"]["original_filename"])

        bar.next()

    bar.finish()

    return dataset_dicts
    def update(self, _entries, progress=True):
        MongoDBController().start_if_not_running()
        if type(_entries) == dict:
            entries = [_entries]
        else:
            entries = _entries

        if progress:
            bar = Bar('Cloudmesh Database Update', max=len(entries))

        result = []
        for entry in entries:
            if progress:
                bar.next()
            if 'cm' not in entry:
                print("UPDATE ERROR")
                VERBOSE(entry)
                raise ValueError("The cm attribute is not in the entry")
            entry['cm']['collection'] = "{cloud}-{kind}".format(**entry["cm"])

            # noinspection PyUnusedLocal
            try:
                self.col = self.db[entry['cm']['collection']]

                old_entry = self.col.find_one({
                    "cm.kind": entry["cm"]["kind"],
                    "cm.cloud": entry["cm"]["cloud"],
                    "cm.name": entry["cm"]["name"]
                })

                if old_entry is not None:

                    cm = dict(old_entry['cm'])

                    cm.update(entry['cm'])
                    cm['modified'] = str(datetime.utcnow())

                    # entry['cm']['created'] = cm['created']
                    entry['cm'] = cm

                    post = self.col.replace_one(
                        {
                            "cm.kind": entry['cm']["kind"],
                            "cm.cloud": entry['cm']["cloud"],
                            "cm.name": entry['cm']["name"]
                        },
                        entry,
                        upsert=True)

                else:
                    entry['cm']['created'] = entry['cm']['modified'] = str(
                        datetime.utcnow())
                    self.col.insert_one(entry)

            except Exception as e:
                Console.error(
                    "uploading document\n{entry}\n-------\n\n".format(
                        entry=str(entry)))
                pass
            result.append(entry)

        if progress:
            bar.finish()

        return result
def SVDC_heatmap_generator(df, period_of_interest, prediction_year=2012, \
                           epidemic_classification_dict=None, training_year_window='ALL', t0_vector=None, \
                           p_vector=None, classifier='svm', modes=[0], add_peaks=False,\
                           add_runoff_binary=False, verbose=False, variables=['precip', 'temp']):
    '''

    - p_max, p_min: sets the bounds for the period length vector
    - period_of_interest = () #initial and final date that contains the period of interest (poi).
    the period of interest defines the starting and finishing dates for the SVD classifier.
    e.g. If poi is 01-02-YYYY through 28-02-YYYY, SVD classifier's heatmap will start on 28-02 of previous year and end
    on 01-02 of the next year
    -prediction_year
    -epidemic_classification_dict = dictionary. e.g. {'2001':1, '2002':0, '2003':1}


    v2.:
    Version two of the heatmap generators utilizes 3 modes rather than 2 and also incorporates the average number of peaks
    as extra dimensions prior to the classifier phase
    '''

    #Generate grid based on p and t0 vectors
    distance_grid = np.zeros([len(p_vector), len(t0_vector)])

    years = []
    for i in range(df.index.shape[0]):
        years.append(df.index[i].year)
    years = sorted(list(set(years)))

    years_before_prediction = years.index(prediction_year)

    if training_year_window == 'ALL':
        training_years = years[0:years_before_prediction]
        n_years = years_before_prediction
    elif training_year_window < years_before_prediction:
        training_years = years[years_before_prediction -
                               training_year_window:years_before_prediction]
        n_years = training_year_window
    else:
        print(
            "Can't retrieve training window: {0}. Please make sure training_year_window is 'ALL' or an int smaller than the number of available years"
            .format(training_year_window))

    if verbose:
        print('{0} years detected within dataframe: {1}.'.format(
            len(years), years))
        print('{0} Years before prediction: {1}'.format(
            n_years, training_years))

    # check if t0 dates are within
    dates_within_poi = []
    for d in t0_vector:
        if '{0}'.format(prediction_year) + d[4:] in df[
                period_of_interest[0]:period_of_interest[1]].index:
            dates_within_poi.append(d)

    if len(dates_within_poi) > 0:
        print(
            '{0} dates from t0_vector are inside period_of_interest range: {1}'
            .format(len(dates_within_poi), dates_within_poi))

    #Enter main loop
    print('Initiating heatmap loop.')
    bar = Bar('Processing', max=len(p_vector))
    for i, p in enumerate(p_vector):
        bar.next()
        for j, t0 in enumerate(t0_vector):

            if verbose: print('Reshaping data')
            X = SVDC_reshape_yearly_data_stolerman(df=df[variables], t0=t0, p=p,\
                                                   years=training_years, \
                                                   upper_bound=period_of_interest[0],\
                                                   normalize=True, verbose=False)

            if verbose: print('Reshaping data done')
            '''
            Each column of X represents one year of data in the order of years_before_prediction. If we want our classification at year Y
            we need Y-1 as out of sample input and Y-2, Y-3...1 as our training dataset. As we're trying to classify every Y with previous year data, we also assign
            the epidemic classification of year Y to the label for Y-1
            '''
            if X is not None:
                X_train = X[:, :-1]
                X_predict = X[:, -1]
                Y_train = []
                for year in training_years[:
                                           -1]:  # Can take out of loop but keeping for clear reading purposes
                    Y_train.append(epidemic_classification_dict[year + 1])

                Y_train = np.vstack(Y_train)
                Y_predict = epidemic_classification_dict[prediction_year]

                # Perform svd
                U, sigma, VT = svd(X_train,
                                   n_components=3,
                                   n_iter=15,
                                   random_state=None)
                projections = sigma.reshape([-1, 1]) * VT
                projections = projections.T
                projections = np.vstack([
                    projections[:, modes],
                    np.matmul(X_predict.reshape([1, -1]), U[:, modes])
                ])
                '''
                if not np.equal(projections[-1,:], np.matmul(X_predict.reshape([1,-1]),U[:,modes]).reshape(1,-1)).all():
                    print('WARNING! projections and prediction sample matmul are not equal')
                    print(projections[-1,:], np.matmul(X_predict.reshape([1,-1]),U[:,modes]))
                    time.sleep(10)
                if verbose:
                    print('Verifying predict_projection is correct = {0},{1}, {2}'.format(projections,projection_predict, np.matmul(X_predict.reshape([1,-1]),U[:,modes])))
                '''
                '''
                Merging SVD projections average_peak_frequencies for each year. They should have the same length
                '''

                if add_peaks:
                    # This function returns the delta value stated in Stolerman's paper
                    average_peak_frequencies = SVDC_get_apfs(df=df, t0=t0, p=p,\
                                                           years=training_years, \
                                                           upper_bound=period_of_interest[0],\
                                                           normalize=True, verbose=False)
                    classifier_dataset = np.hstack(
                        [projections, average_peak_frequencies])
                else:
                    classifier_dataset = projections

                if add_runoff_binary:
                    # This function returns the delta value stated in Stolerman's paper
                    average_runoff = SVDC_get_runoffbinary(df=df, t0=t0, p=p,\
                                                           years=training_years, \
                                                           upper_bound=period_of_interest[0],\
                                                           normalize=True, verbose=False)
                    classifier_dataset = np.hstack(
                        [projections, average_runoff])


                classifier_dataset_train = classifier_dataset[:-1, :]
                classifier_dataset_predict = classifier_dataset[-1, :]

                if verbose:
                    print(classifier_dataset_train, classifier_dataset_predict)
                if classifier == 'svm':
                    mod = svm.SVC(kernel='rbf',
                                  gamma=1,
                                  C=1,
                                  cache_size=400,
                                  max_iter=100000)
                elif classifier == 'forest':
                    mod = RandomForestClassifier(n_estimators=10,
                                                 max_depth=2,
                                                 random_state=0)
                if verbose:
                    print('Fitting with projections shape {0} and target shape {1}'.
                          format(classifier_dataset_train.shape, Y_predict))

                mod.fit(classifier_dataset_train, Y_train.ravel())
                pred = mod.predict(classifier_dataset_predict.reshape(1, -1))
                distance_grid[i, j] = (pred == Y_predict)
            else:
                distance_grid[i, j] = -1
    bar.finish()
    return distance_grid
Example #34
def step(split, epoch, opt, data_loader, model, optimizer=None):
    if split == 'train':  # mainly affects batch normalization and dropout behavior
        model.train()
    else:
        model.eval()

    crit = torch.nn.MSELoss()  # define the loss function

    acc_idxs = data_loader.dataset.acc_idxs
    edges = data_loader.dataset.edges
    shuffle_ref = data_loader.dataset.shuffle_ref
    mean = data_loader.dataset.mean
    std = data_loader.dataset.std
    convert_eval_format = data_loader.dataset.convert_eval_format

    Loss, Acc = AverageMeter(), AverageMeter()
    data_time, batch_time = AverageMeter(), AverageMeter()
    preds = []

    nIters = len(data_loader)
    bar = Bar('{}'.format(opt.exp_id), max=nIters)

    end = time.time()
    for i, batch in enumerate(data_loader):
        data_time.update(time.time() - end)
        input, target, meta = batch['input'], batch['target'], batch['meta']
        input_var = input.cuda()
        target_var = target.cuda()  # keypoint heatmaps of shape [16, 64, 64]
        output = model(input_var)
        # loss between the predicted and ground-truth heatmaps, shape: [batch, 16, 64, 64]
        loss = crit(output[-1]['hm'], target_var)  # scalar loss tensor (torch.cuda.FloatTensor)

        for k in range(opt.num_stacks - 1):
            loss += crit(output[k], target_var)

        if split == 'train':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            input_ = input.cpu().numpy().copy()
            input_[0] = flip(input_[0]).copy()[np.newaxis, ...]
            # input_flip_var = torch.from_numpy(input_).cuda(device=opt.device, non_blocking=True)
            input_flip_var = torch.from_numpy(input_).cuda()
            output_flip = model(input_flip_var)
            output_flip = shuffle_lr(
                flip(output_flip[-1]['hm'].detach().cpu().numpy()[0]), shuffle_ref)
            output_flip = output_flip.reshape(
                1, opt.num_output, opt.output_h, opt.output_w)
            ####### output_ = (output[-1].detach().cpu().numpy() + output_flip) / 2
            # output_flip = torch.from_numpy(output_flip).cuda(device=opt.device, non_blocking=True)
            output_flip = torch.from_numpy(output_flip).cuda()
            output[-1]['hm'] = (output[-1]['hm'] + output_flip) / 2
            pred, conf = get_preds(output[-1]['hm'].detach().cpu().numpy(), True)  # take the peak of each heatmap as the predicted joint location
            preds.append(convert_eval_format(pred, conf, meta)[0])

        Loss.update(loss.detach()[0], input.size(0))
        Acc.update(accuracy(output[-1]['hm'].detach().cpu().numpy(),
                            target_var.detach().cpu().numpy(),
                            acc_idxs))  #### acc_idxs = [0, 1, 2, 3, 4, 5, 10, 11, 14, 15]

        batch_time.update(time.time() - end)
        end = time.time()
        if not opt.hide_data_time:
            time_str = ' |Data {dt.avg:.3f}s({dt.val:.3f}s)' \
                       ' |Net {bt.avg:.3f}s'.format(dt=data_time,
                                                    bt=batch_time)
        else:
            time_str = ''
        Bar.suffix = '{split}: [{0}][{1}/{2}] |Total {total:} |ETA {eta:}' \
                     '|Loss {loss.avg:.5f} |Acc {Acc.avg:.4f}' \
                     '{time_str}'.format(epoch, i, nIters, total=bar.elapsed_td,
                                         eta=bar.eta_td, loss=Loss, Acc=Acc,
                                         split=split, time_str=time_str)
        if opt.print_iter > 0:
            if i % opt.print_iter == 0:
                print('{}| {}'.format(opt.exp_id, Bar.suffix))
        else:
            bar.next()
        if opt.debug >= 2:
            gt = get_preds(target.cpu().numpy()) * 4
            pred = get_preds(output[-1]['hm'].detach().cpu().numpy()) * 4
            debugger = Debugger(ipynb=opt.print_iter > 0, edges=edges)
            img = (input[0].numpy().transpose(1, 2, 0) * std + mean) * 256
            img = img.astype(np.uint8).copy()
            debugger.add_img(img)
            debugger.add_mask(
                cv2.resize(target[0].numpy().max(axis=0),
                           (opt.input_w, opt.input_h)), img, 'target')
            debugger.add_mask(
                cv2.resize(output[-1]['hm'][0].detach().cpu().numpy().max(axis=0),
                           (opt.input_w, opt.input_h)), img, 'pred')
            debugger.add_point_2d(pred[0], (255, 0, 0))
            debugger.add_point_2d(gt[0], (0, 0, 255))
            debugger.show_all_imgs(pause=True)

    bar.finish()
    return {'loss': Loss.avg,
            'acc': Acc.avg,
            'time': bar.elapsed_td.total_seconds() / 60.}, preds
class MoveFilesFromStorageController:
    """Class that executes file moves from a direct ingest Google Cloud Storage bucket to the appropriate ingest
    bucket.
    """

    FILE_TO_MOVE_RE = \
        re.compile(r'^(processed_|unprocessed_|un)?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}:\d{6}(raw|ingest_view)?.*)')

    QUEUES_TO_PAUSE = {
        DIRECT_INGEST_SCHEDULER_QUEUE_V2,
        DIRECT_INGEST_STATE_PROCESS_JOB_QUEUE_V2,
        DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2
    }

    PAUSE_QUEUE_URL = 'https://cloudtasks.googleapis.com/v2/projects/{}/locations/us-east1/queues/{}:pause'

    PURGE_QUEUE_URL = 'https://cloudtasks.googleapis.com/v2/projects/{}/locations/us-east1/queues/{}:purge'

    CURL_POST_REQUEST_TEMPLATE = 'curl -X POST -H "Authorization: Bearer $(gcloud auth print-access-token)" {}'

    def __init__(self, project_id: str, region: str,
                 file_type_to_move: GcsfsDirectIngestFileType,
                 destination_file_type: GcsfsDirectIngestFileType,
                 start_date_bound: Optional[str],
                 end_date_bound: Optional[str], dry_run: bool,
                 file_filter: Optional[str]):

        self.project_id = project_id
        self.region = region
        self.file_type_to_move = file_type_to_move
        self.destination_file_type = destination_file_type

        if self.file_type_to_move != self.destination_file_type and \
                self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED:
            raise ValueError(
                'Args file_type_to_move and destination_file_type must match if type to move is UNSPECIFIED'
            )

        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))
        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))

        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_'
            f'{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )

    def run_move(self):
        """Main method of script - executes move, or runs a dry run of a move."""
        if self.dry_run:
            logging.info("Running in DRY RUN mode for region [%s]",
                         self.region)
        else:
            i = input(
                f"This will move [{self.region}] files in [{self.project_id}] that were uploaded starting on date"
                f"[{self.start_date_bound}] and ending on date [{self.end_date_bound}]. Type {self.project_id} "
                f"to continue: ")

            if i != self.project_id:
                return

        if self.dry_run:
            logging.info("DRY RUN: Would pause [%s] in project [%s]",
                         self.QUEUES_TO_PAUSE, self.project_id)
        else:
            i = input(f"Pausing queues {self.QUEUES_TO_PAUSE} in project "
                      f"[{self.project_id}] - continue? [y/n]: ")

            if i.upper() != 'Y':
                return

            self.pause_and_purge_queues()

        date_subdir_paths = self.get_date_subdir_paths()

        if self.dry_run:
            logging.info("DRY RUN: Found [%s] dates to move",
                         len(date_subdir_paths))
        else:
            i = input(f"Found [{len(date_subdir_paths)}] dates to move - "
                      f"continue? [y/n]: ")

            if i.upper() != 'Y':
                return

        thread_pool = ThreadPool(processes=12)
        files_to_move = self.collect_files_to_move(date_subdir_paths,
                                                   thread_pool)

        self.move_files(files_to_move, thread_pool)

        thread_pool.close()
        thread_pool.join()

        self.write_moves_to_log_file()

        if self.dry_run:
            logging.info(
                "DRY RUN: See results in [%s].\n"
                "Rerun with [--dry-run False] to execute move.",
                self.log_output_path)
        else:
            logging.info(
                "Move complete! See results in [%s].\n"
                "\nNext steps:"
                "\n1. (If doing a full re-ingest) Drop Google Cloud database for [%s]"
                "\n2. Resume queues here:", self.log_output_path,
                self.project_id)

            for queue_name in self.QUEUES_TO_PAUSE:
                logging.info("\t%s", self.queue_console_url(queue_name))

    def get_date_subdir_paths(self) -> List[str]:
        return gsutil_get_storage_subdirs_containing_file_types(
            storage_bucket_path=self.storage_bucket.abs_path(),
            file_type=self.file_type_to_move,
            upper_bound_date=self.end_date_bound,
            lower_bound_date=self.start_date_bound)

    def collect_files_to_move(self, date_subdir_paths: List[str],
                              thread_pool: ThreadPool) -> List[str]:
        """Searches the given list of directory paths for files directly in those directories that should be moved to
        the ingest directory and returns a list of string paths to those files.
        """
        msg_prefix = 'DRY_RUN: ' if self.dry_run else ''
        self.collect_progress = Bar(f"{msg_prefix}Gathering paths to move...",
                                    max=len(date_subdir_paths))
        collect_files_res = thread_pool.map(self.get_files_to_move_from_path,
                                            date_subdir_paths)

        if not self.collect_progress:
            raise ValueError('Progress bar should not be None')
        self.collect_progress.finish()

        return [f for sublist in collect_files_res for f in sublist]

    def move_files(self, files_to_move: List[str], thread_pool: ThreadPool):
        """Moves files at the given paths to the ingest directory, changing the prefix to 'unprocessed' as necessary.

        For the given list of file paths:

        files_to_move = [
            'storage_bucket/path/to/processed_2019-09-24T09:01:20:039807_elite_offendersentenceterms.csv'
        ]

        Will run:
        gsutil mv
            gs://storage_bucket/path/to/processed_2019-09-24T09:01:20:039807_elite_offendersentenceterms.csv \
            gs://ingest_bucket/unprocessed_2019-09-24T09:01:20:039807_elite_offendersentenceterms.csv

        Note: Move order is not guaranteed - file moves are parallelized.
        """
        msg_prefix = 'DRY_RUN: ' if self.dry_run else ''
        self.move_progress = Bar(f"{msg_prefix}Moving files...",
                                 max=len(files_to_move))
        thread_pool.map(self.move_file, files_to_move)

        if not self.move_progress:
            raise ValueError('Progress bar should not be None')
        self.move_progress.finish()

    def queue_console_url(self, queue_name: str):
        """Returns the url to the GAE console page for a queue with a given name."""
        return f'https://console.cloud.google.com/cloudtasks/queue/{queue_name}?project={self.project_id}'

    def do_post_request(self, url: str):
        """Executes a googleapis.com curl POST request with the given url. """
        res = subprocess.Popen(self.CURL_POST_REQUEST_TEMPLATE.format(url),
                               shell=True,
                               stdout=subprocess.PIPE)
        stdout, _stderr = res.communicate()
        response = json.loads(stdout)
        if 'error' in response:
            raise ValueError(response['error'])

    def pause_queue(self, queue_name: str):
        """Posts a request to pause the queue with the given name."""
        logging.info("Pausing [%s] in [%s]", queue_name, self.project_id)
        self.do_post_request(
            self.PAUSE_QUEUE_URL.format(self.project_id, queue_name))

    def purge_queue(self, queue_name: str):
        """Posts a request to purge the queue with the given name."""
        logging.info("Purging [%s] in [%s]", queue_name, self.project_id)
        self.do_post_request(
            self.PURGE_QUEUE_URL.format(self.project_id, queue_name))

    def pause_and_purge_queues(self):
        """Pauses and purges Direct Ingest queues for the specified project."""
        for queue_name in self.QUEUES_TO_PAUSE:
            self.pause_queue(queue_name)
            self.purge_queue(queue_name)

    def get_files_to_move_from_path(self, gs_dir_path: str) -> List[str]:
        """Returns files directly in the given directory that should be moved back into the ingest directory.
        """
        file_paths = gsutil_ls(gs_dir_path)

        result = []
        for file_path in file_paths:
            _, file_name = os.path.split(file_path)
            if re.match(self.FILE_TO_MOVE_RE, file_name):
                if not self.file_filter or re.search(self.file_filter,
                                                     file_name):
                    result.append(file_path)
        with self.mutex:
            if self.collect_progress:
                self.collect_progress.next()
        return result

    def move_file(self, original_file_path: str):
        """Moves a file at the given path into the ingest directory, updating the name to always have an prefix of
        'unprocessed'. Logs the file move, which will later be written to a log file.

        If in dry_run mode, merely logs the move, but does not execute it.
        """
        new_file_path = self.build_moved_file_path(original_file_path)

        if not self.dry_run:
            gsutil_mv(original_file_path, new_file_path)

        with self.mutex:
            self.moves_list.append((original_file_path, new_file_path))
            if self.move_progress:
                self.move_progress.next()

    def build_moved_file_path(self, original_file_path: str) -> str:
        """Builds the desired path for the given file in the ingest bucket, changing the prefix to 'unprocessed' as is
        necessary.
        """

        path_as_unprocessed = to_normalized_unprocessed_file_path_from_normalized_path(
            original_file_path, file_type_override=self.destination_file_type)

        _, file_name = os.path.split(path_as_unprocessed)

        if not re.match(self.FILE_TO_MOVE_RE, file_name):
            raise ValueError(f"Invalid file name {file_name}")

        return os.path.join('gs://', self.ingest_bucket.abs_path(), file_name)

    def write_moves_to_log_file(self):
        self.moves_list.sort()
        with open(self.log_output_path, 'w') as f:
            if self.dry_run:
                template = "DRY RUN: Would move {} -> {}\n"
            else:
                template = "Moved {} -> {}\n"

            f.writelines(
                template.format(original_path, new_path)
                for original_path, new_path in self.moves_list)
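
# The log file written above contains one "{original} -> {new}" line per move, using the
# "Moved {} -> {}" or "DRY RUN: Would move {} -> {}" templates. A minimal sketch (not part
# of the original script) for reading those pairs back, e.g. to audit or revert a run:
from typing import List, Tuple


def read_moves_from_log_file(log_output_path: str) -> List[Tuple[str, str]]:
    moves = []
    with open(log_output_path, 'r') as f:
        for line in f:
            # Strip either prefix, then split on the ' -> ' separator used by the templates.
            line = line.replace('DRY RUN: Would move ', '').replace('Moved ', '').strip()
            if ' -> ' in line:
                original_path, new_path = line.split(' -> ', 1)
                moves.append((original_path, new_path))
    return moves
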
def import_signal_mask(conn):
    """
    Export pictures of the syllable with fundamentals
    :param conn:
    :return:
    """
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    cur.execute('SELECT id, name, maxfreq, dy FROM songdata s')

    songs_data = cur.fetchall()
    song_info = {}

    for song in songs_data:
        song_name = song['name']
        song_info[song_name] = (song['id'], song['maxfreq'], song['dy'])

    segments_info = Segment.objects \
        .filter(audio_file__name__in=song_info.keys()) \
        .values_list('id', 'audio_file__name', 'start_time_ms', 'end_time_ms')

    n = len(segments_info)
    bar = Bar('Importing segments ...', max=n)

    for seg_id, song_name, start, end in segments_info:
        if song_name not in song_info:
            continue
        song_id, nyquist, fbin = song_info[song_name]

        cur.execute('select starttime, endtime, songid from syllable where songid={} and starttime<={} and endtime>={}'
                    ' order by starttime'.format(song_id, start, end))
        syl_rows = cur.fetchall()

        if len(syl_rows) == 0:
            warning('Song #{} {} doesn\'t have a syllable at position {}:{}'.format(song_id, song_name, start, end))
            continue

        if len(syl_rows) > 1:
            warning('Song #{} {} has more than one syllable at position {}:{}. Db Syllable #{}'
                    .format(song_id, song_name, start, end, seg_id))

        for syl_idx, syl_row in enumerate(syl_rows):
            syl_starttime = syl_row['starttime']
            syl_endtime = syl_row['endtime']

            cur.execute('select starttime, timelength, fundfreq, gapbefore, gapafter, maxf, dy,'
                        'overallpeakfreq1, overallpeakfreq2, signal '
                        'from element where songid={} and starttime >= {} and (starttime + timelength) <= {}'
                        .format(song_id, syl_starttime, syl_endtime))
            el_rows = cur.fetchall()

            if len(el_rows) == 0:
                warning('Syllable #{} starttime={} endtime={} of song: "{}" doesn\'t enclose any element.'
                        .format(seg_id, syl_starttime, syl_endtime, song_name))
                continue

            syl_starttime = el_rows[0]['starttime']
            syl_endtime = get_syllable_end_time(el_rows)

            if nyquist == 0:
                nyquist = el_rows[0]['maxf']
            if fbin == 0:
                fbin = el_rows[0]['dy']

            width = int(syl_endtime - syl_starttime) + 1
            height = int(nyquist / fbin)

            img_data_rgb = np.ones((height, width, 3), dtype=np.uint8) * 255

            syl_max_ff = 0
            syl_min_ff = 999999
            syl_combined_ff = None

            for el_idx, el in enumerate(el_rows):
                # The mask is stored as a flat, space-separated list of ints encoding, per time column, the row
                # ranges to paint; parse it here because the drawing loop below iterates over it.
                signal = list(map(int, el['signal'].strip().split(' ')))
                fundfreq = np.array(el['fundfreq'].strip().split(
                    ' '), dtype='|S32').astype(np.float)
                el_max_ff = fundfreq[0]
                el_min_ff = fundfreq[1]

                # the first 4 numbers of fundfreq are: max, min, ? (no idea) and ? (no idea), so we ignore them
                fundfreq = fundfreq[4:]
                if el_idx == 0:
                    syl_combined_ff = fundfreq
                else:
                    syl_combined_ff = np.concatenate(
                        (syl_combined_ff, fundfreq))

                fundfreq = (fundfreq / nyquist * height).astype(np.int)

                i = 0
                ff_row_idx = 0
                while i < len(signal):
                    num_data = signal[i]
                    img_col_idx = signal[i + 1] - syl_starttime

                    # Draw the mask
                    for j in range(2, num_data, 2):
                        _signal_segment_end = signal[i + j]
                        _signal_segment_start = signal[i + j + 1]
                        img_data_rgb[_signal_segment_start:_signal_segment_end, img_col_idx, :] \
                            = COLOURS[el_idx % len(COLOURS)]

                    # Add the fundamental (red lines)
                    if ff_row_idx < len(fundfreq):
                        img_row_idx = height - fundfreq[ff_row_idx] - 1

                        img_row_idx_padded_low = max(0, img_row_idx - 2)
                        img_row_idx_padded_high = img_row_idx + 4 - (img_row_idx - img_row_idx_padded_low)
                        img_data_rgb[img_row_idx_padded_low:img_row_idx_padded_high, img_col_idx, :] = FF_COLOUR
                    ff_row_idx += 1
                    i += (num_data + 1)

                syl_max_ff = max(syl_max_ff, el_max_ff)
                syl_min_ff = min(syl_min_ff, el_min_ff)
            syl_mean_ff = np.mean(syl_combined_ff)

            Segment.objects.filter(id=seg_id).update(mean_ff=syl_mean_ff)
            Segment.objects.filter(id=seg_id).update(max_ff=syl_max_ff)
            Segment.objects.filter(id=seg_id).update(min_ff=syl_min_ff)

            img = Image.fromarray(img_data_rgb)
            thumbnail_width = int(img.size[0])
            thumbnail_height = int(img.size[1] * 0.3)

            img = img.resize((thumbnail_width, thumbnail_height))

            if syl_idx > 0:
                warning('Syl_idx > 0')
                file_path = spect_mask_path('{}_{}'.format(seg_id, syl_idx))
            else:
                file_path = spect_mask_path(seg_id)
            ensure_parent_folder_exists(file_path)

            img.save(file_path, format='PNG')
        bar.next()
    bar.finish()
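
# A small, self-contained sketch of the 'fundfreq' convention parsed above: the column is a
# space-separated string whose first two numbers are the element's max and min fundamental
# frequency, the next two are unidentified and skipped, and the remainder is the per-column
# series. The example string is made up for illustration.
import numpy as np


def parse_fundfreq(fundfreq_str):
    values = np.array(fundfreq_str.strip().split(' '), dtype='|S32').astype(float)
    el_max_ff, el_min_ff = values[0], values[1]
    series = values[4:]  # skip max, min and the two unidentified header values
    return el_max_ff, el_min_ff, series


# parse_fundfreq('4100.0 3800.0 0 0 3950.0 3900.0 3850.0')
# -> (4100.0, 3800.0, array([3950., 3900., 3850.]))
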
Beispiel #37
0
class MoveFilesToDeprecatedController:
    """Class with functionality to move files to deprecated folder with proper formatting."""
    def __init__(
        self,
        file_type: GcsfsDirectIngestFileType,
        region_code: str,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        project_id: str,
        file_filter: Optional[str],
    ):
        self.file_type = file_type
        self.region_code = region_code
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter
        self.project_id = project_id
        self.region_storage_dir_path_for_file_type = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_code,
                    SystemLevel.STATE,
                    self.file_type,
                    project_id=self.project_id,
                )))
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"move_storage_files_to_deprecated_start_bound_{self.region_code}_region_{self.start_date_bound}"
            f"_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )
        self.mutex = threading.Lock()
        self.move_list: List[Tuple[str, str]] = []
        self.move_progress: Optional[Bar] = None

    def run(self) -> None:
        """Main function that will execute the move to deprecated."""

        # TODO(#3666): Update this script to make updates to our Operations db and BigQuery (if necessary).
        #  For now we print these messages to check if appropriate data has been deleted from operations db.
        if self.dry_run:
            if self.file_type == GcsfsDirectIngestFileType.RAW_DATA:
                logging.info(
                    "[DRY RUN] All associated rows from our postgres table `direct_ingest_raw_file_metadata` "
                    "and BigQuery dataset `%s_raw_data` must be deleted before moving these "
                    "files to a deprecated location. Make sure you have done this before moving these files.",
                    self.region_code,
                )

            elif self.file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
                logging.info(
                    "[DRY RUN] All associated rows from our postgres table `direct_ingest_ingest_file_"
                    "metadata` must be deleted before moving these files to a deprecated location. "
                    "Make sure you have done this before moving these files.")

        else:
            if self.file_type == GcsfsDirectIngestFileType.RAW_DATA:
                i = input(
                    "All associated rows from our postgres table `direct_ingest_raw_file_metadata` "
                    f"and BigQuery dataset `{self.region_code}_raw_data` must be deleted before moving these "
                    "files to a deprecated location.\n Have you already done so? [y/n]: "
                )

                if i.upper() != "Y":
                    return

            elif self.file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
                i = input(
                    "All associated rows from our postgres table `direct_ingest_ingest_file_metadata` "
                    "must be deleted before moving these files to a deprecated location.\n"
                    "Have you already done so? [y/n]: ")

                if i.upper() != "Y":
                    return

        destination_dir_path = os.path.join(
            self.region_storage_dir_path_for_file_type.abs_path(),
            "deprecated",
            f"deprecated_on_{date.today()}",
            f"{str(self.file_type.value)}/",
        )

        if self.dry_run:
            logging.info(
                "[DRY RUN] Moving files from [%s] to [%s]",
                self.region_storage_dir_path_for_file_type.abs_path(),
                destination_dir_path,
            )

        else:

            i = input(
                f"Moving files from [{self.region_storage_dir_path_for_file_type.abs_path()}] to "
                f"[{destination_dir_path}] - continue? [y/n]: ")

            if i.upper() != "Y":
                return

        files_to_move = self._get_files_to_move()

        if self.dry_run:
            logging.info("[DRY RUN] Found [%d] files to move",
                         len(files_to_move))

        else:
            i = input(f"Found [{len(files_to_move)}] files to move - "
                      f"continue? [y/n]: ")

            if i.upper() != "Y":
                return

        self._execute_move(files_to_move)
        self._write_move_to_log_file()

        if self.dry_run:
            logging.info(
                "DRY RUN: See results in [%s].\n"
                "Rerun with [--dry-run False] to execute move.",
                self.log_output_path,
            )
        else:
            logging.info("Move complete! See results in [%s].\n",
                         self.log_output_path)

    def _get_files_to_move(self) -> List[str]:
        """Function that gets the files to move to deprecated based on the file_filter and end/start dates specified"""
        subdirs = gsutil_get_storage_subdirs_containing_file_types(
            storage_bucket_path=GcsfsDirectoryPath.from_bucket_and_blob_name(
                self.region_storage_dir_path_for_file_type.bucket_name,
                self.region_code).abs_path(),
            file_type=self.file_type,
            lower_bound_date=self.start_date_bound,
            upper_bound_date=self.end_date_bound,
        )
        result = []
        for subdir_path in subdirs:
            from_paths = gsutil_ls(f"{subdir_path}*.csv")
            for from_path in from_paths:
                _, file_name = os.path.split(from_path)
                if re.match(INGESTED_FILE_REGEX, file_name):
                    if not self.file_filter or re.search(
                            self.file_filter, file_name):
                        result.append(from_path)
        return result

    def _write_move_to_log_file(self) -> None:
        self.move_list.sort()
        with open(self.log_output_path, "w") as f:
            if self.dry_run:
                template = "DRY RUN: Would move {} -> {}\n"
            else:
                template = "Moved {} -> {}\n"

            f.writelines(
                template.format(original_path, new_path)
                for original_path, new_path in self.move_list)

    def _move_files_for_date(self, from_uri: str) -> None:
        """Function that loops through each list of files to move and moves them to the deprecated folder
        in accordance with the date they were received and the date they were deprecated."""
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")
        to_uri = os.path.join(
            "gs://",
            self.region_storage_dir_path_for_file_type.bucket_name,
            self.region_code,
            "deprecated",
            f"deprecated_on_{date.today()}",
            str(self.file_type.value),
            new_date_format,
            curr_gcsfs_file_path.file_name,
        )
        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()

    def _execute_move(self, files_to_move: List[str]) -> None:
        self.move_progress = Bar("Moving files to deprecated...",
                                 max=len(files_to_move))

        thread_pool = ThreadPool(processes=12)
        thread_pool.map(self._move_files_for_date, files_to_move)
        self.move_progress.finish()
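
# A hedged usage sketch for the MoveFilesToDeprecatedController defined above; the region
# code, project id and date bounds are made-up example values, and dry_run=True is shown
# first since run() only logs what it would do in that mode.
controller = MoveFilesToDeprecatedController(
    file_type=GcsfsDirectIngestFileType.RAW_DATA,
    region_code='us_xx',            # hypothetical region code
    start_date_bound='2020-01-01',  # hypothetical lower date bound
    end_date_bound='2020-01-31',    # hypothetical upper date bound
    dry_run=True,
    project_id='my-project-id',     # hypothetical project
    file_filter=None,               # or a regex to restrict which file names are moved
)
controller.run()
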
Beispiel #38
0
def run():
    # calculate class rank
    StudentExamRecord.objects.filter(sub_exam__course_id=60).delete()
    StudentExamRecord.objects.filter(class_rank__gt=0).update(class_rank=0)
    class_exam_records = ClassExamRecord.objects.filter(
        attend_count__gt=0,
        stu_class__isnull=False
    ).order_by(
        'stu_class_id',
        'sub_exam__exam_id'
    ).exclude()
    bar = Bar('Class Ranking', max=len(class_exam_records))
    exam_id = None
    total_score_counter = {}
    for class_exam_record in class_exam_records:
        bar.next()
        if not exam_id:
            exam_id = class_exam_record.sub_exam.exam_id

        if exam_id != class_exam_record.sub_exam.exam_id:
            sorted_records = sorted(total_score_counter.items(), key=lambda d: d[1], reverse=True)

            for index, record in enumerate(sorted_records):
                student_in_db = Student.objects.get(id=record[0])
                sub_exam_in_db = SubExam.objects.get(exam_id=exam_id, course_id=60)
                StudentExamRecord.objects.create(
                    student=student_in_db,
                    sub_exam=sub_exam_in_db,
                    score=record[1],
                    class_rank=index + 1
                )
            total_score_counter = {}
            exam_id = class_exam_record.sub_exam.exam_id

        stu_class = class_exam_record.stu_class
        students = stu_class.studentrecord_set.values(
            'student_id'
        ).distinct().values_list('student_id', flat=True)

        sub_exam_id = class_exam_record.sub_exam_id

        student_exam_records = StudentExamRecord.objects.filter(
            sub_exam_id=sub_exam_id,
            student_id__in=students,
            score__gte=0
        ).order_by('deng_di')

        for index, student_record in enumerate(student_exam_records):
            student_record.class_rank = index + 1
            student_record.save()

            student_id = student_record.student_id
            if student_id not in total_score_counter:
                total_score_counter[student_id] = student_record.score
                continue
            total_score_counter[student_id] += student_record.score

    if exam_id:
        bar.max += 1
        sorted_records = sorted(total_score_counter.items(), key=lambda d: d[1], reverse=True)

        for index, record in enumerate(sorted_records):
            student_in_db = Student.objects.get(id=record[0])
            sub_exam_in_db = SubExam.objects.get(exam_id=exam_id, course_id=60)
            StudentExamRecord.objects.create(
                student=student_in_db,
                sub_exam=sub_exam_in_db,
                score=record[1],
                class_rank=index + 1
            )
    bar.next()
    bar.finish()
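
# A tiny standalone illustration of the ranking step above: total scores per student are
# sorted in descending order and the class rank is the 1-based position. Scores are made up.
total_score_counter = {'s1': 612.5, 's2': 598.0, 's3': 605.0}
sorted_records = sorted(total_score_counter.items(), key=lambda d: d[1], reverse=True)
for index, record in enumerate(sorted_records):
    print(record[0], record[1], 'class_rank =', index + 1)
# -> s1 612.5 class_rank = 1
#    s3 605.0 class_rank = 2
#    s2 598.0 class_rank = 3
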
Beispiel #39
0
def train(model,
          data,
          batch_size=128,
          learning_rate=FLAGS.learning_rate,
          log_dir='./log',
          checkpoint_dir='./checkpoint',
          num_epochs=-1):

    # tf Graph input
    with tf.device('/cpu:0'):
        with tf.name_scope('data'):
            x, yt = data.generate_batches(batch_size)

        global_step = tf.get_variable('global_step',
                                      shape=[],
                                      dtype=tf.int64,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
    if FLAGS.gpu:
        device_str = '/gpu:' + str(FLAGS.device)
    else:
        device_str = '/cpu:0'
    with tf.device(device_str):
        y = model(x, is_training=True)
        # Define loss and optimizer
        with tf.name_scope('objective'):
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=yt,
                                                               logits=y))
            accuracy = tf.reduce_mean(
                tf.cast(tf.nn.in_top_k(y, yt, 1), tf.float32))
        opt = tf.contrib.layers.optimize_loss(
            loss,
            global_step,
            learning_rate,
            'Adam',
            gradient_noise_scale=None,
            gradient_multipliers=None,
            clip_gradients=None,  #moving_average_decay=0.9,
            learning_rate_decay_fn=learning_rate_decay_fn,
            update_ops=None,
            variables=None,
            name=None)
        #grads = opt.compute_gradients(loss)
        #apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # loss_avg

    ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,
                                            global_step,
                                            name='average')
    ema_op = ema.apply([loss, accuracy] + tf.trainable_variables())
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)

    loss_avg = ema.average(loss)
    tf.summary.scalar('loss/training', loss_avg)
    accuracy_avg = ema.average(accuracy)
    tf.summary.scalar('accuracy/training', accuracy_avg)

    check_loss = tf.check_numerics(loss, 'model diverged: loss->nan')
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, check_loss)
    updates_collection = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies([opt]):
        train_op = tf.group(*updates_collection)

    if FLAGS.summary:
        add_summaries(scalar_list=[accuracy, accuracy_avg, loss, loss_avg],
                      activation_list=tf.get_collection(
                          tf.GraphKeys.ACTIVATIONS),
                      var_list=tf.trainable_variables())
        # grad_list=grads)

    summary_op = tf.summary.merge_all()

    # Configure options for session
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
        gpu_options=gpu_options,
    ))
    saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print('checkpoint is restored.')
    else:
        print('No checkpoint file found')
        sess.run(tf.global_variables_initializer())
    #sess.run(tf.global_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    num_batches = data.size[0] / batch_size
    summary_writer = tf.summary.FileWriter(log_dir, graph=sess.graph)
    epoch = 0

    print('num of trainable parameters: %d' %
          count_params(tf.trainable_variables()))
    while epoch != num_epochs:
        epoch += 1
        curr_step = 0
        # Initializing the variables

        #with tf.Session() as session:
        #    print(session.run(ww))

        print('Started epoch %d' % epoch)
        bar = Bar('Training',
                  max=num_batches,
                  suffix='%(percent)d%% eta: %(eta)ds')
        while curr_step < data.size[0]:
            _, loss_val = sess.run([train_op, loss])
            curr_step += FLAGS.batch_size
            bar.next()

        step, acc_value, loss_value, summary = sess.run(
            [global_step, accuracy_avg, loss_avg, summary_op])
        saver.save(sess,
                   save_path=checkpoint_dir + '/model.ckpt',
                   global_step=global_step)
        bar.finish()
        print('Finished epoch %d' % epoch)
        print('Training Accuracy: %.3f' % acc_value)
        print('Training Loss: %.3f' % loss_value)

        test_acc, test_loss = evaluate(model,
                                       FLAGS.dataset,
                                       batch_size=batch_size,
                                       checkpoint_dir=checkpoint_dir)  # ,
        # log_dir=log_dir)
        print('Test Accuracy: %.3f' % test_acc)
        print('Test Loss: %.3f' % test_loss)

        summary_out = tf.Summary()
        summary_out.ParseFromString(summary)
        summary_out.value.add(tag='accuracy/test', simple_value=test_acc)
        summary_out.value.add(tag='loss/test', simple_value=test_loss)
        summary_writer.add_summary(summary_out, step)
        summary_writer.flush()

    # When done, ask the threads to stop.
    coord.request_stop()
    coord.join(threads)
    coord.clear_stop()
    summary_writer.close()
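
# A minimal sketch of the progress.bar pattern used in the epoch loop above: Bar fills in
# the %(percent)d and %(eta)d fields of the suffix template, and max is the batch count.
# The sleep is just a stand-in for one sess.run() training step.
from progress.bar import Bar
import time

num_examples, demo_batch_size = 1000, 128
demo_num_batches = num_examples // demo_batch_size
demo_bar = Bar('Training', max=demo_num_batches, suffix='%(percent)d%% eta: %(eta)ds')
for _ in range(demo_num_batches):
    time.sleep(0.01)
    demo_bar.next()
demo_bar.finish()
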
Beispiel #40
0
def excel_creator(dic_list, new=True):
    row_enumerate = 2
    print("Start to create xlx file")

    list_of_items = []
    if new:
        book = Workbook()
        sheet = book.active
    else:
        # dic_list=dic_list[:-1]
        book = load_workbook("Med_list.xlsx")
        try:
            shl = (book.sheetnames)
            if len(shl) > 0:
                for h in shl:
                    if h != "Sheet":
                        book.remove(book[h])
        except:
            pass

        book.create_sheet("Sheet_row")
        sheet = book["Sheet_row"]

    if new:
        sheet.append([
            "номер п/п", "Номер РКИ", "Дата создания РКИ", "Наименование ЛП",
            "Организация, проводящая КИ", "Страна разраб-ка",
            "Организация, привлеченная разработчиком ЛП", "Начало (дата)",
            "Окончание (дата)", "№ протокола", "Протокол", "Фаза КИ", "Вид КИ",
            "Колич. мед. орг-й", "Колич. пациент.", "Области применения",
            "Состояние",
            "Перечень медицинских организаций, в которых предполагается проведение клинических исследований"
        ])

    for index, d in enumerate(dic_list):
        for i in range(d["cells"]):
            step = d["cells"]
            list_of_items.append((d["n_n"]))
            list_of_items.append(d["rki"])
            list_of_items.append(d["date"])
            list_of_items.append(d["name_lp"])
            list_of_items.append(d["organization"])
            list_of_items.append(d["country"])
            list_of_items.append(d["organization_lp"])
            list_of_items.append(d["date_start"])
            list_of_items.append(d["date_end"])
            list_of_items.append(d["n_protokol"])
            list_of_items.append(d["protokol"])
            list_of_items.append(d["phase"])
            list_of_items.append(d["view"])
            list_of_items.append(d["n_orgs"])
            list_of_items.append(d["n_patient"])
            list_of_items.append(d["type"])
            list_of_items.append(d["status"])
            # list_of_items.append(func(d["clinc_listsl"]))
            list_of_items.append((d["clinc_listsl"][i]))
            list_of_items.append(d["cells"])
            if d["rki"] != "":
                sheet.append(list_of_items)

            list_of_items = []

    r = len(sheet['A'])
    print("Create Sheet_row")

    if new == False:
        sheet_d = book["Sheet_row"]
        sheet = book["Sheet"]
        sheet.insert_rows(idx=2, amount=r)

        for ind, row in enumerate(sheet_d.rows):
            for col, k in enumerate(row):
                sheet.cell(row=ind + 2, column=col + 1).value = k.value

    book.create_sheet("KI")
    sheet_ki = book["KI"]

    print("Create KI")

    for inx, row in enumerate(sheet.rows):
        for cl, t in enumerate(row):
            sheet_ki.cell(row=inx + 1, column=cl + 1).value = t.value
        if sheet_ki.cell(row=inx + 1, column=cl + 1).value == 1:
            sheet_ki.row_dimensions[inx + 1].height = 60

        # if step == 2:
        #     sheet.row_dimensions[row_enumerate].height = 30
        #     sheet.row_dimensions[row_enumerate + 1].height = 30

    print("Alignment 1 iter KI")
    bar = Bar('Processing', max=len(sheet_ki['A']))

    checker = sheet_ki.cell(row=2, column=19).value
    checker2 = sheet_ki.cell(row=2, column=2).value
    for liner, i in enumerate(range(2, len(sheet_ki['A']) + 1)):
        if sheet_ki.cell(row=i, column=19).value == checker and sheet_ki.cell(
                row=i, column=2).value == checker2:
            pass
        else:

            for col in range(1, 18):
                sheet_ki.merge_cells(start_row=i - checker,
                                     start_column=col,
                                     end_row=i - 1,
                                     end_column=col)
            checker = sheet_ki.cell(row=i, column=19).value
            checker2 = sheet_ki.cell(row=2, column=2).value

        bar.next()

    bar.finish()

    for col in range(1, 18):
        sheet_ki.merge_cells(start_row=i - checker + 1,
                             start_column=col,
                             end_row=i,
                             end_column=col)

    print("Alignment2 iter KI")

    bar = Bar('Processing', max=len(sheet_ki['A']))

    for row in sheet_ki.rows:
        for k in row:
            # print(row[0:-1])
            k.alignment = Alignment(vertical="center",
                                    horizontal="center",
                                    wrapText=True)
            k.font = Font(size="9")
        bar.next()
    bar.finish()

    print("Alignment 3 iter KI")
    bar = Bar('Processing', max=len(sheet_ki['A']))
    for row in sheet_ki.rows:
        for k in row[-2:]:
            k.alignment = Alignment(vertical="center",
                                    horizontal="left",
                                    wrapText=True)

        bar.next()
    bar.finish()

    sheet_ki.column_dimensions[get_column_letter(4)].width = 25
    sheet_ki.column_dimensions[get_column_letter(5)].width = 35
    sheet_ki.column_dimensions[get_column_letter(7)].width = 50
    sheet_ki.column_dimensions[get_column_letter(11)].width = 60
    sheet_ki.column_dimensions[get_column_letter(16)].width = 11
    sheet_ki.column_dimensions[get_column_letter(17)].width = 11
    sheet_ki.column_dimensions[get_column_letter(18)].width = 1000

    book.save("Med_list.xlsx")
    print("file was done")
Beispiel #41
0
def upload_self(api_base_url='',
                token='',
                source_file='',
                dest_path='',
                chunksize=10247680):
    """str, str, str, int, int->Bool

    Upload a file via the API, instead of the SDK.

    Ref: https://dev.onedrive.com/items/upload_post.htm
    """
    ## get upload URL
    if not dest_path.endswith('/'):
        dest_path += '/'

    # Prepare API call
    dest_path = path_to_remote_path(dest_path) + '/' + path_to_name(
        source_file)
    info_json = json.dumps({
        'item': {
            '@name.conflictBehavior': 'rename',
            'name': path_to_name(source_file)
        }
    })

    api_url = api_base_url + 'drive/root:{dest_path}:/upload.createSession'.format(
        dest_path=dest_path)

    req = requests.post(api_url,
                        data=info_json,
                        headers={
                            'Authorization':
                            'bearer {access_token}'.format(access_token=token),
                            'content-type':
                            'application/json'
                        })

    if req.status_code > 201:
        print(req.json()['error']['message'])
        return False

    req = convert_utf8_dict_to_dict(req.json())

    uploadUrl = req['uploadUrl']

    # file size cannot be > 10 GiB
    file_size = os.path.getsize(source_file)

    # print(file_size)

    range_list = [[i, i + chunksize - 1]
                  for i in range(0, file_size, chunksize)]
    range_list[-1][-1] = file_size - 1

    # Upload with a progress bar
    bar = Bar('Uploading',
              max=len(range_list),
              suffix='%(percent).1f%% - %(eta)ds')
    bar.next()  # necessary to init the Bar

    # Reuse one requests Session across chunk uploads to reduce per-request overhead
    requests_session = requests.Session()

    for i in range_list:
        upload_one_piece(uploadUrl=uploadUrl,
                         token=token,
                         source_file=source_file,
                         range_this=i,
                         file_size=file_size,
                         requests_session=requests_session)
        bar.next()

    bar.finish()

    return True
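
# A quick worked example of the byte-range bookkeeping above: with a made-up 25-byte file
# and a 10-byte chunk size, range_list covers the file as inclusive [start, end] pairs and
# the final range is clamped to file_size - 1.
demo_file_size, demo_chunksize = 25, 10
demo_range_list = [[i, i + demo_chunksize - 1]
                   for i in range(0, demo_file_size, demo_chunksize)]
demo_range_list[-1][-1] = demo_file_size - 1
print(demo_range_list)  # [[0, 9], [10, 19], [20, 24]]
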
Beispiel #42
0
    def build_model(self):
        image_batch = layers.data(name='image_batch',
                                  shape=[-1, 1, 28, 28],
                                  dtype='float32')
        label_batch = layers.data(name='label_batch',
                                  shape=[-1, 1],
                                  dtype='int64')
        noise = layers.data(name='noise',
                            shape=[-1, self.cfg.latent_size],
                            dtype='float32')
        sampled_labels = layers.data(name='sampled_labels',
                                     shape=[-1, 1],
                                     dtype='int64')
        x = layers.data(name='x', shape=[-1, 1, 28, 28], dtype='float32')
        y = layers.data(name='y', shape=[-1, 1], dtype='float32')
        aux_y = layers.data(name='aux_y', shape=[-1, 1], dtype='int64')
        trick = layers.data(name='trick', shape=[-1, 1], dtype='float32')

        g_train = GTrain(sampled_labels, noise, trick, self.cfg)
        d_train = DTrain(x, y, aux_y, self.cfg)

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())

        g_train_prog = fluid.CompiledProgram(g_train.program)
        d_train_prog = fluid.CompiledProgram(d_train.program)

        train_history = defaultdict(list)
        test_history = defaultdict(list)

        for epoch in range(1, self.cfg.epochs + 1):
            print('Epoch {}/{}'.format(epoch, self.cfg.epochs))

            num_batches = int(np.ceil(60000 / float(self.cfg.batch_size)))
            progress_bar = Bar('Training', max=num_batches)

            epoch_gen_loss = []
            epoch_disc_loss = []

            train_reader = paddle.batch(paddle.reader.shuffle(mnist.train(),
                                                              buf_size=60000),
                                        batch_size=self.cfg.batch_size,
                                        drop_last=True)
            test_reader = mnist.test()

            step = 0
            for i, data in enumerate(train_reader()):

                image_batch = np.array([x[0].reshape(1, 28, 28)
                                        for x in data]).astype('float32')
                label_batch = np.array([[x[1]] for x in data]).astype('int64')

                if len(image_batch) != self.cfg.batch_size:
                    continue

                # generate a new batch of noise
                noise_np = np.random.uniform(
                    -1, 1, (self.cfg.batch_size,
                            self.cfg.latent_size)).astype('float32')

                # sample some labels from p_c
                sampled_labels_np = np.random.randint(
                    0, self.cfg.num_classes,
                    self.cfg.batch_size).astype('int64')
                sampled_labels_np = np.expand_dims(sampled_labels_np, axis=1)

                # generate a batch of fake images, using the generated labels as
                # a conditioner. We reshape the sampled labels to be
                # (self.cfg.batch_size, 1) so that we can feed them into the
                # embedding layer as a length one sequence

                generated_images = exe.run(g_train.infer_program,
                                           feed={
                                               'sampled_labels':
                                               sampled_labels_np,
                                               'noise': noise_np
                                           },
                                           fetch_list=[g_train.fake_img])[0]

                x_np = np.concatenate((image_batch, generated_images))

                # use one-sided soft real/fake labels
                # Salimans et al., 2016
                # https://arxiv.org/pdf/1606.03498.pdf (Section 3.4)
                soft_zero, soft_one = 0, 0.95
                y_np = np.array([[soft_one]] * len(image_batch) +
                                [[soft_zero]] *
                                len(image_batch)).astype('float32')
                aux_y_np = np.concatenate((label_batch, sampled_labels_np),
                                          axis=0)

                # see if the discriminator can figure itself out...
                epoch_disc_loss.append(
                    exe.run(d_train_prog,
                            feed={
                                'x': x_np,
                                'y': y_np,
                                'aux_y': aux_y_np
                            },
                            fetch_list=[d_train.loss])[0])

                # make new noise. we generate 2 * batch size here such that we have
                # the generator optimize over an identical number of images as the
                # discriminator

                noise_np = np.random.uniform(
                    -1, 1, (2 * self.cfg.batch_size,
                            self.cfg.latent_size)).astype('float32')
                sampled_labels_np = np.random.randint(
                    0, self.cfg.num_classes,
                    2 * self.cfg.batch_size).astype('int64')
                sampled_labels_np = np.expand_dims(sampled_labels_np, axis=1)

                # we want to train the generator to trick the discriminator
                # For the generator, we want all the {fake, not-fake} labels to say
                # not-fake
                trick_np = np.array([[soft_one]] * 2 *
                                    self.cfg.batch_size).astype('float32')

                epoch_gen_loss.append(
                    exe.run(g_train_prog,
                            feed={
                                'sampled_labels': sampled_labels_np,
                                'noise': noise_np,
                                'trick': trick_np
                            },
                            fetch_list=[g_train.loss])[0])

                step += 1
                progress_bar.next()
            progress_bar.finish()

            print('Testing for epoch {}'.format(epoch))

            # evaluate the testing loss here

            # generate a new batch of noise
            noise_np = np.random.uniform(
                -1, 1,
                (self.cfg.test_size, self.cfg.latent_size)).astype('float32')

            # sample some labels from p_c and generate images from them
            sampled_labels_np = np.random.randint(
                0, self.cfg.num_classes, self.cfg.test_size).astype('int64')
            sampled_labels_np = np.expand_dims(sampled_labels_np, axis=1)

            generated_images = exe.run(g_train.infer_program,
                                       feed={
                                           'sampled_labels': sampled_labels_np,
                                           'noise': noise_np
                                       },
                                       fetch_list=[g_train.fake_img])[0]

            x_test, y_test = [], []
            for data in test_reader():
                x_test.append(np.reshape(data[0], [1, 28, 28]))
                y_test.append([data[1]])
                if len(x_test) >= self.cfg.test_size:
                    break
            x_test = np.array(x_test).astype('float32')
            y_test = np.array(y_test).astype('int64')

            x_np = np.concatenate((x_test, generated_images))
            y_np = np.array([[1]] * self.cfg.test_size +
                            [[0]] * self.cfg.test_size).astype('float32')
            aux_y_np = np.concatenate((y_test, sampled_labels_np), axis=0)

            # see if the discriminator can figure itself out...
            discriminator_test_loss = exe.run(
                d_train.infer_program,
                feed={
                    'x': x_np,
                    'y': y_np,
                    'aux_y': aux_y_np
                },
                fetch_list=[d_train.unweighted_loss])[0][0]

            discriminator_train_loss = np.mean(np.array(epoch_disc_loss))

            # make new noise
            noise_np = np.random.uniform(
                -1, 1, (2 * self.cfg.test_size,
                        self.cfg.latent_size)).astype('float32')
            sampled_labels_np = np.random.randint(
                0, self.cfg.num_classes,
                2 * self.cfg.test_size).astype('int64')
            sampled_labels_np = np.expand_dims(sampled_labels_np, axis=1)

            trick_np = np.array([[1]] * 2 *
                                self.cfg.test_size).astype('float32')

            generated_images = exe.run(g_train.infer_program,
                                       feed={
                                           'sampled_labels': sampled_labels_np,
                                           'noise': noise_np
                                       },
                                       fetch_list=[g_train.fake_img])[0]
            generator_test_loss = exe.run(d_train.infer_program,
                                          feed={
                                              'x': generated_images,
                                              'y': trick_np,
                                              'aux_y': sampled_labels_np
                                          },
                                          fetch_list=[d_train.unweighted_loss
                                                      ])[0][0]

            generator_train_loss = np.mean(np.array(epoch_gen_loss))

            # generate an epoch report on performance
            train_history['generator'].append(generator_train_loss)
            train_history['discriminator'].append(discriminator_train_loss)

            test_history['generator'].append(generator_test_loss)
            test_history['discriminator'].append(discriminator_test_loss)

            print('train g loss', generator_train_loss)
            print('train d loss', discriminator_train_loss)
            print('test g loss', generator_test_loss)
            print('test d loss', discriminator_test_loss)

            # generate some digits to display
            num_rows = 4
            noise_np = np.tile(
                np.random.uniform(-1, 1, (num_rows, self.cfg.latent_size)),
                (self.cfg.num_classes, 1)).astype('float32')

            sampled_labels_np = np.array([[i] * num_rows
                                          for i in range(self.cfg.num_classes)
                                          ]).reshape(-1, 1).astype('int64')

            generated_images = exe.run(g_train.infer_program,
                                       feed={
                                           'sampled_labels': sampled_labels_np,
                                           'noise': noise_np
                                       },
                                       fetch_list=[g_train.fake_img])[0]

            def save_images(generated_images, epoch):
                for i in range(len(generated_images)):
                    fname = './data/image_epoch_%d_%d.jpeg' % (epoch, i)
                    img = np.array(
                        generated_images[i]).astype('float32').reshape(
                            (28, 28))
                    img = img * 127.5 + 127.5
                    img = np.clip(img, 0, 255).astype('uint8')
                    img = Image.fromarray(img, 'L')
                    img.save(fname, format='JPEG')

            save_images(generated_images, epoch)

        with open('acgan-history.pkl', 'wb') as f:
            pickle.dump({'train': train_history, 'test': test_history}, f)
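
# A small standalone illustration of the one-sided label smoothing used for the
# discriminator targets above (Salimans et al., 2016): real images get a soft 0.95 label,
# fake images keep a hard 0, and the generator's "trick" targets are all soft ones.
# The batch size is made up.
import numpy as np

demo_batch_size = 4
soft_zero, soft_one = 0, 0.95
demo_y = np.array([[soft_one]] * demo_batch_size +
                  [[soft_zero]] * demo_batch_size).astype('float32')  # real half, then fake half
demo_trick = np.array([[soft_one]] * 2 * demo_batch_size).astype('float32')
print(demo_y.ravel())    # [0.95 0.95 0.95 0.95 0.   0.   0.   0.  ]
print(demo_trick.shape)  # (8, 1)
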
def verify_producer_performance(with_dr_cb=True):
    """ Time how long it takes to produce and delivery X messages """
    conf = {'bootstrap.servers': bootstrap_servers}

    p = confluent_kafka.Producer(**conf)

    topic = 'test'
    msgcnt = 1000000
    msgsize = 100
    msg_pattern = 'test.py performance'
    msg_payload = (msg_pattern * int(msgsize / len(msg_pattern)))[0:msgsize]

    dr = MyTestDr(silent=True)

    t_produce_start = time.time()
    msgs_produced = 0
    msgs_backpressure = 0
    print('# producing %d messages to topic %s' % (msgcnt, topic))

    if with_progress:
        bar = Bar('Producing', max=msgcnt)
    else:
        bar = None

    for i in range(0, msgcnt):
        try:
            if with_dr_cb:
                p.produce('test', value=msg_payload, callback=dr.delivery)
            else:
                p.produce('test', value=msg_payload)
        except BufferError as e:
            # Local queue is full (slow broker connection?)
            msgs_backpressure += 1
            if bar is not None and (msgs_backpressure % 1000) == 0:
                bar.next(n=0)
            p.poll(0)
            continue

        if bar is not None and (msgs_produced % 5000) == 0:
            bar.next(n=5000)
        msgs_produced += 1
        p.poll(0)

    t_produce_spent = time.time() - t_produce_start

    bytecnt = msgs_produced * msgsize

    if bar is not None:
        bar.finish()

    print('# producing %d messages (%.2fMb) took %.3fs: %d msgs/s, %.2f Mb/s' % \
          (msgs_produced, bytecnt / (1024*1024), t_produce_spent,
           msgs_produced / t_produce_spent,
           (bytecnt/t_produce_spent) / (1024*1024)))
    print('# %d messages not produce()d due to backpressure (local queue full)' % msgs_backpressure)

    print('waiting for %d/%d deliveries' % (len(p), msgs_produced))
    # Wait for deliveries
    p.flush()
    t_delivery_spent = time.time() - t_produce_start


    print('# producing %d messages (%.2fMb) took %.3fs: %d msgs/s, %.2f Mb/s' % \
          (msgs_produced, bytecnt / (1024*1024), t_produce_spent,
           msgs_produced / t_produce_spent,
           (bytecnt/t_produce_spent) / (1024*1024)))

    # Fake numbers if not using a dr_cb
    if not with_dr_cb:
        print('# not using dr_cb')
        dr.msgs_delivered = msgs_produced
        dr.bytes_delivered = bytecnt

    print('# delivering %d messages (%.2fMb) took %.3fs: %d msgs/s, %.2f Mb/s' % \
          (dr.msgs_delivered, dr.bytes_delivered / (1024*1024), t_delivery_spent,
           dr.msgs_delivered / t_delivery_spent,
           (dr.bytes_delivered/t_delivery_spent) / (1024*1024)))
    print('# post-produce delivery wait took %.3fs' % \
          (t_delivery_spent - t_produce_spent))
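
# MyTestDr is defined elsewhere in this test file; a hedged sketch of a delivery-report
# class exposing the attributes the code above relies on (msgs_delivered, bytes_delivered
# and a delivery(err, msg) callback), assuming the standard confluent_kafka callback
# signature of (error, message):
class SimpleTestDr(object):
    def __init__(self, silent=False):
        self.silent = silent
        self.msgs_delivered = 0
        self.bytes_delivered = 0

    def delivery(self, err, msg):
        if err is not None:
            if not self.silent:
                print('# delivery failed: %s' % err)
            return
        self.msgs_delivered += 1
        self.bytes_delivered += len(msg)
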
def verify_consumer_performance():
    """ Verify Consumer performance """

    conf = {'bootstrap.servers': bootstrap_servers,
            'group.id': uuid.uuid1(),
            'session.timeout.ms': 6000,
            'default.topic.config': {
                'auto.offset.reset': 'earliest'
            }}

    c = confluent_kafka.Consumer(**conf)

    def my_on_assign (consumer, partitions):
        print('on_assign:', len(partitions), 'partitions:')
        for p in partitions:
            print(' %s [%d] @ %d' % (p.topic, p.partition, p.offset))
        consumer.assign(partitions)

    def my_on_revoke (consumer, partitions):
        print('on_revoke:', len(partitions), 'partitions:')
        for p in partitions:
            print(' %s [%d] @ %d' % (p.topic, p.partition, p.offset))
        consumer.unassign()

    c.subscribe(["test"], on_assign=my_on_assign, on_revoke=my_on_revoke)

    max_msgcnt = 1000000
    bytecnt = 0
    msgcnt = 0

    print('Will now consume %d messages' % max_msgcnt)

    if with_progress:
        bar = Bar('Consuming', max=max_msgcnt,
                  suffix='%(index)d/%(max)d [%(eta_td)s]')
    else:
        bar = None

    while True:
        # Consume until EOF or error

        msg = c.poll(timeout=20.0)
        if msg is None:
            raise Exception('Stalled at %d/%d message, no new messages for 20s' %
                            (msgcnt, max_msgcnt))

        if msg.error():
            if msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF:
                # Reached EOF for a partition, ignore.
                continue
            else:
                raise confluent_kafka.KafkaException(msg.error())


        bytecnt += len(msg)
        msgcnt += 1

        if bar is not None and (msgcnt % 10000) == 0:
            bar.next(n=10000)

        if msgcnt == 1:
            t_first_msg = time.time()
        if msgcnt >= max_msgcnt:
            break

    if bar is not None:
        bar.finish()

    if msgcnt > 0:
        t_spent = time.time() - t_first_msg
        print('%d messages (%.2fMb) consumed in %.3fs: %d msgs/s, %.2f Mb/s' % \
              (msgcnt, bytecnt / (1024*1024), t_spent, msgcnt / t_spent,
               (bytecnt / t_spent) / (1024*1024)))

    print('closing consumer')
    c.close()
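
# The throughput report above reduces to this arithmetic; a tiny standalone example with
# made-up counts and timing:
demo_msgcnt, demo_bytecnt, demo_t_spent = 1000000, 100 * 1000000, 25.0
print('%d messages (%.2fMb) consumed in %.3fs: %d msgs/s, %.2f Mb/s' %
      (demo_msgcnt, demo_bytecnt / (1024 * 1024), demo_t_spent,
       demo_msgcnt / demo_t_spent, (demo_bytecnt / demo_t_spent) / (1024 * 1024)))
# -> 1000000 messages (95.37Mb) consumed in 25.000s: 40000 msgs/s, 3.81 Mb/s
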
Beispiel #45
0
def download_data(download_urls: list, area: str, driver_path: str, keys: dict,
                  outdir: str):
    """
    Function to instantiate the web driver, fill in credentials, and repeatedly hit the download urls
    """

    # Define options for web driver
    chrome_options = webdriver.ChromeOptions()

    # Define download directory as outdir
    prefs = {"download.default_directory": outdir}

    # Apply options to chrome driver
    chrome_options.add_experimental_option("prefs", prefs)

    # Instantiate web driver
    driver = webdriver.Chrome(executable_path=driver_path,
                              chrome_options=chrome_options)

    # Login url for Geoinsights platform
    geoinsights_url = "https://www.facebook.com/login/?next=https%3A%2F%2Fwww.facebook.com%2Fgeoinsights-portal%2F"

    # Access login url with webdriver
    driver.get(geoinsights_url)

    # Pause for page load (and cookie acceptance)
    time.sleep(2)

    # Try to accept cookies. On failure, pass
    try:

        driver.find_element_by_xpath('//*[@id="u_0_h"]').click()

    except Exception:

        pass

    # Add username in username form field
    driver.find_element_by_xpath('//*[@id="email"]').send_keys(keys["email"])

    # Add password in password form field
    driver.find_element_by_xpath('//*[@id="pass"]').send_keys(keys["password"])

    # Click login button
    driver.find_element_by_xpath('//*[@id="loginbutton"]').click()

    # Start download bar
    print("\n\n---------------------")
    bar = Bar("Downloading", max=len(download_urls))

    # For each download url, download dataset
    for i, url in enumerate(download_urls):

        # Get time of download start
        download_start = datetime.timestamp(datetime.now())

        # Access download url
        driver.get(url["url"])

        # Wait for file to be downloaded
        latest_file = wait_for_download(download_start, outdir)

        # Rename file with formatted file name
        rename_file(latest_file, outdir, area, url["date"])

        # Update progress bar
        bar.next()

    # Close progress bar
    bar.finish()
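
# wait_for_download() and rename_file() are helpers defined elsewhere in this script; a
# hedged sketch of what a wait_for_download-style helper could look like, assuming it polls
# the download directory until a file newer than download_start has finished downloading
# (i.e. Chrome's temporary .crdownload suffix is gone) and then returns its path:
import glob
import os
import time


def wait_for_download_sketch(download_start: float, outdir: str, timeout: float = 120.0) -> str:
    deadline = time.time() + timeout
    while time.time() < deadline:
        finished = [
            f for f in glob.glob(os.path.join(outdir, '*'))
            if not f.endswith('.crdownload') and os.path.getmtime(f) >= download_start
        ]
        if finished:
            return max(finished, key=os.path.getmtime)  # newest completed file
        time.sleep(1)
    raise TimeoutError('No completed download appeared in %s within %ds' % (outdir, timeout))
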
    def run(self):
        self.hdf5_group = self.hdf5_file  # TODO: split into train, val and test set

        prior_occ_dem = None
        progress_bar = None
        sample_idx = 0

        self.logger.info("Start loading first chunk of msgpack")

        for chunk_idx, chunk in enumerate(self.unpacker):
            self.logger.info(f"Msgpack chunk {chunk_idx} is loaded")

            occ_dem_msgs = chunk["/ga_slam.localElevationMapMean"]
            occ_data_um_msgs = chunk["/ga_slam.localElevationMapVariance"]
            gt_dem_msgs = chunk["/ga_slam.globalElevationMapMean"]
            gt_data_um_msgs = chunk["/ga_slam.globalElevationMapVariance"]

            for msg in zip(occ_dem_msgs, occ_data_um_msgs, gt_dem_msgs,
                           gt_data_um_msgs):
                occ_dem_msg, occ_data_um_msg, gt_dem_msg, gt_data_um_msg = msg
                time = occ_dem_msg["time"]
                h, w = occ_dem_msg["height"], occ_dem_msg["width"]

                occ_dem = np.array(occ_dem_msg["data"])
                occ_dem = occ_dem.reshape((-1, int(np.sqrt(occ_dem.shape[0]))),
                                          order="F")

                occ_data_um = np.array(occ_data_um_msg["data"])
                occ_data_um = occ_data_um.reshape(
                    (-1, int(np.sqrt(occ_data_um.shape[0]))), order="F")

                gt_dem = np.array(gt_dem_msg["data"])
                gt_dem = gt_dem.reshape((-1, int(np.sqrt(gt_dem.shape[0]))),
                                        order="F")

                gt_data_um = np.array(gt_data_um_msg["data"])
                gt_data_um = gt_data_um.reshape(
                    (-1, int(np.sqrt(gt_data_um.shape[0]))), order="F")

                res_grid = np.array([0.05, 0.05])
                rel_position_z = occ_dem[int(occ_dem.shape[0] // 2),
                                         int(occ_dem.shape[1] // 2)]
                rel_position = np.array([0, 0, rel_position_z])
                rel_attitude = Rotation.from_euler('zyx', [0, 0, 0]).as_quat()

                # self.visualize(sample_idx=sample_idx, res_grid=res_grid, rel_position=rel_position,
                #                occ_dem=occ_dem, gt_dem=gt_dem, occ_data_um=occ_data_um, gt_data_um=gt_data_um)

                target_size_x = self.config.get("size", occ_dem.shape[0])
                target_size_y = self.config.get("size", occ_dem.shape[1])
                num_subgrids_x = int(np.floor(occ_dem.shape[0] /
                                              target_size_x))
                num_subgrids_y = int(np.floor(occ_dem.shape[1] /
                                              target_size_y))

                assert num_subgrids_x >= 1 and num_subgrids_y >= 1

                if progress_bar is None:
                    # we extrapolate the total maximum number of samples
                    # by comparing the number of messages and size of the current chunk
                    # TODO: I am not sure if this code is correct (for multiple chunks)
                    file_size = os.path.getsize(
                        self.config["msgpack_path"])  # in bytes
                    self.total_num_samples = int(
                        len(occ_dem_msgs) / self.unpacker.tell() * file_size)
                    self.total_num_samples *= num_subgrids_x * num_subgrids_y  # multiply with the number of subgrids

                    progress_bar = Bar(
                        f"Processing msgspack from {self.config['msgpack_path']}",
                        max=self.total_num_samples)

                start_x = 0
                for i in range(num_subgrids_x):
                    stop_x = start_x + target_size_x
                    start_y = 0
                    for j in range(num_subgrids_y):
                        stop_y = start_y + target_size_y

                        occ_dem_subgrid = occ_dem[start_x:stop_x,
                                                  start_y:stop_y]
                        occ_data_um_subgrid = occ_data_um[start_x:stop_x,
                                                          start_y:stop_y]
                        gt_dem_subgrid = gt_dem[start_x:stop_x, start_y:stop_y]
                        gt_data_um_subgrid = gt_data_um[start_x:stop_x,
                                                        start_y:stop_y]

                        subgrid_delta_x = res_grid[0] * (
                            -occ_dem.shape[0] / 2 + start_x +
                            target_size_x / 2)
                        subgrid_delta_y = res_grid[1] * (
                            -occ_dem.shape[1] / 2 + start_y +
                            target_size_y / 2)
                        rel_position_subgrid_z = occ_dem_subgrid[
                            int(target_size_x // 2),
                            int(target_size_y // 2)]
                        rel_position_subgrid = np.array([
                            rel_position[0] + subgrid_delta_x,
                            rel_position[1] + subgrid_delta_y,
                            rel_position_subgrid_z
                        ])

                        if np.isnan(occ_dem_subgrid).all():
                            # we skip because the DEM only contains occlusion (NaNs)
                            start_y = stop_y
                            progress_bar.next()
                            continue

                        if np.isnan(gt_dem_subgrid).all():
                            # we skip because the DEM only contains missing values (NaNs)
                            pass
                            # start_y = stop_y
                            # progress_bar.next()
                            # continue

                        max_occ_ratio_thresh = self.config.get(
                            "max_occlusion_ratio_threshold", 0.5)
                        # we do not want to include the subgrid in the dataset if it's occluded by more than the configured ratio (default 50%)
                        if np.isnan(occ_dem_subgrid).sum() > (
                                target_size_x * target_size_y *
                                max_occ_ratio_thresh):
                            start_y = stop_y
                            progress_bar.next()
                            continue

                        if prior_occ_dem is not None:
                            # we compute MSE and PSNR between the current occluded dem and the occluded dem from the prior timestamp
                            prior_occ_dem_subgrid = prior_occ_dem[
                                start_x:stop_x, start_y:stop_y]

                            occ_dem_subgrid_no_nan = np.nan_to_num(
                                occ_dem_subgrid, copy=True, nan=0.0)
                            prior_occ_dem_subgrid_no_nan = np.nan_to_num(
                                prior_occ_dem_subgrid, copy=True, nan=0.0)

                            mse = mse_loss_fct(
                                input=torch.tensor(occ_dem_subgrid_no_nan),
                                target=torch.tensor(
                                    prior_occ_dem_subgrid_no_nan))

                            data_min = np.min([
                                occ_dem_subgrid_no_nan,
                                prior_occ_dem_subgrid_no_nan
                            ]).item()
                            data_max = np.max([
                                occ_dem_subgrid_no_nan,
                                prior_occ_dem_subgrid_no_nan
                            ]).item()
                            psnr = psnr_from_mse_loss_fct(mse=mse,
                                                          data_min=data_min,
                                                          data_max=data_max)

                            # we want to exclude dems which are too similar
                            if psnr > self.config.get(
                                    "psnr_similarity_threshold", 50):
                                start_y = stop_y
                                progress_bar.next()
                                continue

                        self.res_grid.append(res_grid)
                        self.rel_positions.append(rel_position_subgrid)
                        self.rel_attitudes.append(rel_attitude)
                        self.occ_dems.append(occ_dem_subgrid)
                        self.occ_data_ums.append(occ_data_um_subgrid)
                        self.gt_dems.append(gt_dem_subgrid)

                        if self.initialized_datasets is False:
                            super().create_base_datasets(
                                self.hdf5_group, self.total_num_samples)

                            self.hdf5_group.create_dataset(
                                name=ChannelEnum.OCC_DEM.value,
                                shape=(0, occ_dem_subgrid.shape[0],
                                       occ_dem_subgrid.shape[1]),
                                maxshape=(self.total_num_samples,
                                          occ_dem_subgrid.shape[0],
                                          occ_dem_subgrid.shape[1]))
                            self.hdf5_group.create_dataset(
                                name=ChannelEnum.OCC_DATA_UM.value,
                                shape=(0, occ_data_um_subgrid.shape[0],
                                       occ_data_um_subgrid.shape[1]),
                                maxshape=(self.total_num_samples,
                                          occ_data_um_subgrid.shape[0],
                                          occ_data_um_subgrid.shape[1]))
                            self.hdf5_group.create_dataset(
                                name=ChannelEnum.GT_DEM.value,
                                shape=(0, gt_dem_subgrid.shape[0],
                                       gt_dem_subgrid.shape[1]),
                                maxshape=(self.total_num_samples,
                                          gt_dem_subgrid.shape[0],
                                          gt_dem_subgrid.shape[1]))

                        if self.sample_idx % self.config.get(
                                "save_frequency", 50) == 0:
                            self.save_cache()

                        self.visualize(sample_idx=sample_idx,
                                       res_grid=res_grid,
                                       rel_position=rel_position_subgrid,
                                       occ_dem=occ_dem_subgrid,
                                       gt_dem=gt_dem_subgrid,
                                       occ_data_um=occ_data_um_subgrid,
                                       gt_data_um=gt_data_um_subgrid)

                        prior_occ_dem = occ_dem
                        sample_idx += 1
                        start_y = stop_y
                        progress_bar.next()

                    start_x = stop_x

        self.save_cache()
        progress_bar.finish()
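# psnr_from_mse_loss_fct() is assumed to be defined elsewhere in this project; a
# minimal sketch consistent with how it is called above (an MSE tensor plus the data
# range of the two compared DEMs) would compute the usual peak signal-to-noise ratio.
# The implementation below is an assumption, not the project's actual helper.
import torch


def psnr_from_mse_loss_fct(mse: torch.Tensor, data_min: float, data_max: float) -> torch.Tensor:
    """PSNR in dB from an MSE value and the dynamic range of the compared data."""
    data_range = data_max - data_min
    if data_range == 0 or mse.item() == 0:
        # identical (or constant) inputs: treat similarity as infinite
        return torch.tensor(float("inf"))
    return 10.0 * torch.log10(torch.tensor(data_range ** 2) / mse)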
def extract_spectrogram():
    """
    Extract raw spectrograms for all segments (not the masked spectrograms from Luscinia)
    :return:
    """
    audio_to_segs = {}
    for segment in Segment.objects.all():
        audio_file = segment.audio_file
        if audio_file not in audio_to_segs:
            audio_to_segs[audio_file] = [(segment.id, segment.start_time_ms, segment.end_time_ms)]
        else:
            audio_to_segs[audio_file].append((segment.id, segment.start_time_ms, segment.end_time_ms))

    n = len(audio_to_segs)
    bar = Bar('Exporting spects ...', max=n)

    for audio_file, seg_list in audio_to_segs.items():
        count = 0
        for seg_id, start, end in seg_list:
            seg_spect_path = spect_fft_path(seg_id, 'syllable')
            if os.path.isfile(seg_spect_path):
                count += 1
        if count == len(seg_list):
            bar.next()
            continue

        filepath = wav_path(audio_file)

        fs, sig = wav_2_mono(filepath)
        duration_ms = len(sig) * 1000 / fs

        _, _, s = signal.stft(sig, fs=fs, window=window,
                              noverlap=noverlap, nfft=window_size, return_onesided=True)
        file_spect = np.abs(s * scale)

        height, width = np.shape(file_spect)
        file_spect = np.flipud(file_spect)

        try:

            file_spect = np.log10(file_spect)
            file_spect = ((file_spect - global_min_spect_pixel) / interval64)
            file_spect[np.isinf(file_spect)] = 0
            file_spect = file_spect.astype(np.int64)  # np.int was removed in newer NumPy versions

            file_spect = file_spect.reshape((width * height,), order='C')
            file_spect[file_spect >= 64] = 63
            file_spect_rgb = np.empty((height, width, 3), dtype=np.uint8)
            file_spect_rgb[:, :, 0] = cm_red[file_spect].reshape(
                (height, width)) * 255
            file_spect_rgb[:, :, 1] = cm_green[file_spect].reshape(
                (height, width)) * 255
            file_spect_rgb[:, :, 2] = cm_blue[file_spect].reshape(
                (height, width)) * 255

            file_spect_img = Image.fromarray(file_spect_rgb)
            file_spect_path = spect_fft_path(audio_file.id, 'song')
            ensure_parent_folder_exists(file_spect_path)
            if not os.path.isfile(file_spect_path):
                file_spect_img.save(file_spect_path, format='PNG')

            for seg_id, start, end in seg_list:
                roi_start = int(start / duration_ms * width)
                roi_end = int(np.ceil(end / duration_ms * width))

                seg_spect_rgb = file_spect_rgb[:, roi_start:roi_end, :]
                seg_spect_img = Image.fromarray(seg_spect_rgb)
                seg_spect_path = spect_fft_path(seg_id, 'syllable')
                ensure_parent_folder_exists(seg_spect_path)

                if not os.path.isfile(seg_spect_path):
                    seg_spect_img.save(seg_spect_path, format='PNG')

        except Exception as e:
            warning('Error occurred at song id: {}'.format(audio_file.id))
            raise e

        bar.next()
    bar.finish()
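# wav_2_mono(), spect_fft_path() and the STFT parameters (window, noverlap,
# window_size, scale, the colour maps and global_min_spect_pixel/interval64) are
# project-level helpers and globals defined elsewhere. A plausible sketch of
# wav_2_mono() -- reading a wav file and averaging the channels down to mono -- is
# shown below as an assumption only.
import numpy as np
from scipy.io import wavfile


def wav_2_mono(filepath):
    """Return (sample_rate, mono_signal) for a wav file, averaging stereo channels."""
    fs, sig = wavfile.read(filepath)
    if sig.ndim > 1:
        sig = sig.astype(np.float64).mean(axis=1)
    return fs, sig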
def verify_stats_cb():
    """ Verify stats_cb """

    def stats_cb(stats_json_str):
        global good_stats_cb_result
        stats_json = json.loads(stats_json_str)
        if topic in stats_json['topics']:
            app_offset = stats_json['topics'][topic]['partitions']['0']['app_offset']
            if app_offset > 0:
                print("# app_offset stats for topic %s partition 0: %d" %
                      (topic, app_offset))
                good_stats_cb_result = True

    conf = {'bootstrap.servers': bootstrap_servers,
            'group.id': uuid.uuid1(),
            'session.timeout.ms': 6000,
            'error_cb': error_cb,
            'stats_cb': stats_cb,
            'statistics.interval.ms': 200,
            'default.topic.config': {
                'auto.offset.reset': 'earliest'
            }}

    c = confluent_kafka.Consumer(**conf)
    c.subscribe([topic])

    max_msgcnt = 1000000
    bytecnt = 0
    msgcnt = 0

    print('Will now consume %d messages' % max_msgcnt)

    if with_progress:
        bar = Bar('Consuming', max=max_msgcnt,
                  suffix='%(index)d/%(max)d [%(eta_td)s]')
    else:
        bar = None

    while not good_stats_cb_result:
        # Consume until EOF or error

        msg = c.poll(timeout=20.0)
        if msg is None:
            raise Exception('Stalled at %d/%d message, no new messages for 20s' %
                            (msgcnt, max_msgcnt))

        if msg.error():
            if msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF:
                # Reached EOF for a partition, ignore.
                continue
            else:
                raise confluent_kafka.KafkaException(msg.error())

        bytecnt += len(msg)
        msgcnt += 1

        if bar is not None and (msgcnt % 10000) == 0:
            bar.next(n=10000)

        if msgcnt == 1:
            t_first_msg = time.time()
        if msgcnt >= max_msgcnt:
            break

    if bar is not None:
        bar.finish()

    if msgcnt > 0:
        t_spent = time.time() - t_first_msg
        print('%d messages (%.2fMb) consumed in %.3fs: %d msgs/s, %.2f Mb/s' %
              (msgcnt, bytecnt / (1024*1024), t_spent, msgcnt / t_spent,
               (bytecnt / t_spent) / (1024*1024)))

    print('closing consumer')
    c.close()
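# error_cb, bootstrap_servers, topic and with_progress are defined elsewhere in the
# test module; a minimal error_cb consistent with confluent-kafka's callback
# signature might look like the sketch below (an assumption, not the module's
# actual callback).
import confluent_kafka


def error_cb(err):
    """Log client-level errors reported by librdkafka and abort the test."""
    print('error_cb: %s' % err)
    raise confluent_kafka.KafkaException(err)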
def import_syllables(conn):
    """
    :param conn: the database connection
    :return:
    """
    cur = conn.cursor()
    el_cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    # Import syllables for all songs
    cur.execute('SELECT sg.name, s.starttime, s.endtime, w.songid FROM syllable s '
                'JOIN wavs w ON s.songid=w.songid '
                'JOIN songdata sg ON w.songid=sg.id ORDER BY w.filename, s.starttime')
    song_syllable_rows = cur.fetchall()
    songs_2_syllables = {}

    # Song #79 PKI_2017_02_25_WHW028_01_M.EX..PipeClicksGrowlcough.wav has more than one syllable at position 1124:1136.
    # Db Syllable #2924

    for row in song_syllable_rows:
        song_name = row[0]
        syl_starttime = row[1]
        syl_endtime = row[2]
        song_id = row[3]

        el_cur.execute('select starttime, timelength from element where songid={} and starttime >= {} '
                       'and (starttime + timelength) <= {} order by starttime'.format(song_id,
                                                                                      syl_starttime,
                                                                                      syl_endtime))
        el_rows = el_cur.fetchall()
        if len(el_rows) == 0:
            warning('Syllable with starttime={} endtime={} of song: "{}" doesn\'t enclose any element.'
                    .format(syl_starttime, syl_endtime, song_name))
            continue

        real_syl_starttime = el_rows[0]['starttime']
        real_syl_endtime = utils.get_syllable_end_time(el_rows)

        syllable = (real_syl_starttime, real_syl_endtime)

        # Look up (or create) the syllable list for this song, so appends always go to the right song
        syllables = songs_2_syllables.setdefault(song_name, [])
        syllables.append(syllable)

    # delete all existing manual segmentation:
    Segment.objects.filter(audio_file__name__in=songs_2_syllables.keys()).delete()

    bar = Bar('Importing syllables ...', max=len(songs_2_syllables))
    for song in songs_2_syllables:
        syllables = songs_2_syllables[song]
        audio_file = AudioFile.objects.filter(name=song).first()
        if audio_file is None:
            warning('File {} has not been imported. Please run import_luscinia_songs again.'
                    ' Ignore for now'.format(song))
            continue

        for syllable in syllables:
            segment = Segment()
            segment.start_time_ms = syllable[0]
            segment.end_time_ms = syllable[1]
            segment.audio_file = audio_file
            segment.save()
            segment.tid = segment.id
            segment.save()

        # print('Processed song {}'.format(song))
        bar.next()
    bar.finish()
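# utils.get_syllable_end_time() is imported from the project's utils module; a
# plausible sketch, given that el_rows are dictionaries with 'starttime' and
# 'timelength' ordered by starttime, is shown below (an assumption only).
def get_syllable_end_time(el_rows):
    """End time of a syllable = the latest (starttime + timelength) of its elements."""
    return max(row['starttime'] + row['timelength'] for row in el_rows)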
def query_to_tables(query,
                    results_limit,
                    output_path,
                    result_order=None,
                    input_csv=None):
    """
    Takes in a search query as a string,
    the number of results you want returned as a string,
    the path you want to save to as a string,
    and optionally, the order of your results as a string.

    Generates a folder within the folder you specify and
    populates it with 4 spreadsheets containing the docket data
    from your search.
    """

    # We convert the number of results the user wants to an integer so we can work with it.
    if input_csv is None:
        results_limit = int(results_limit)

    def fill_docketInformation(result, docket):
        """
        This nested function populates the docketInformation dataframe.
        """
        if 'info' not in docket:
            return
        # We loop through all the keys present in the dockets info dictionary.
        for key in docket['info']:

            # We create the new row we want to add as a dictionary.
            # Using .get() allows us to specify the key that we want, and specify a default value as the second argument in
            # case the key doesn't exist.
            new_docketInformation_row = {
                'Docket Number':
                result['docket'],
                'Court Name':
                result['court'],
                'Case Title':
                docket['info'].get('title', result.get("title", None)),
                'Case Info Field':
                key,
                'Case Info Values':
                docket['info'][key],
            }

            # We append the new row (represented as a dictionary) to the global dataframe.
            # ignore_index=True resets the index of the resulting dataframe instead of preserving row labels.
            global docketInformation
            appender = docketInformation.append(new_docketInformation_row,
                                                ignore_index=True)

            # When we append a dataframe, the original is not changed, rather a new version of the dataframe with the added row is generated.
            # We replace the original with the new version so our changes get saved.
            docketInformation = appender

    def fill_docketEntries(result, docket):
        """
        This nested function populates the docketEntries dataframe.
        """

        # The docket_report key is not always present in our response.
        if 'docket_report' not in docket:
            print(docket)
            return

        # We loop through each dictionary within the docket_report list.
        for document in docket['docket_report']:

            # We create the new row we want to add as a dictionary.
            # Using .get() allows us to specify the key that we want, and specify a default value as the second argument in
            # case the key doesn't exist.
            new_docketEntries_row = {
                'Docket Number':
                result['docket'],
                'Court Name':
                result['court'],
                'Case Title':
                docket['info'].get('title', result.get("title", None)),
                'Docket Entry Date':
                document.get('entry_date', None),
                'Docket Entry Numbers':
                document.get('number', None),
                'Docket Entry Contents':
                removehtml(document.get('contents', None)),
            }

            # We append the new row (represented as a dictionary) to the global dataframe.
            # ignore_index=True resets the index of the resulting dataframe instead of preserving row labels.
            global docketEntries
            appender = docketEntries.append(new_docketEntries_row,
                                            ignore_index=True)

            # When we append a dataframe, the original is not changed, rather a new version of the dataframe with the added row is generated.
            # We replace the original with the new version so our changes get saved.
            docketEntries = appender

    def fill_parties(result, docket):
        """
        This nested function populates the parties dataframe.
        """

        # The parties key is not always present in our response.
        if 'parties' not in docket:
            # If it's not present, we don't add to the dataframe and we exit the function.
            print(docket)
            return

        for party in docket.get('parties', None):

            # We create the new row we want to add as a dictionary.
            # Using .get() allows us to specify the key that we want, and specify a default value as the second argument in
            # case the key doesn't exist.
            new_parties_row = {
                'Docket Number':
                result.get('docket', None),
                'Court Name':
                result.get('court', None),
                'Case Title':
                docket['info'].get('title', result.get("title", None)),
                'Party Name':
                party.get('name_normalized', party.get('name')),
                'Party Type':
                party.get('type', None),
            }

            # We append the new row (represented as a dictionary) to the global dataframe.
            # ignore_index=True resets the index of the resulting dataframe instead of preserving row labels.
            global parties
            appender = parties.append(new_parties_row, ignore_index=True)

            # When we append a dataframe, the original is not changed, rather a new version of the dataframe with the added row is generated.
            # We replace the original with the new version so our changes get saved.
            parties = appender

    def fill_attorneysAndFirms(result, docket):
        """
        This nested function populates the attorneysAndFirms dataframe.
        """

        # The parties key is not always present in our response.
        if 'parties' not in docket:
            # If it's not present, we don't add to the dataframe and we exit the function.
            return

        # We loop through each dictionary within the parties list of dictionaries.
        for party in docket['parties']:

            # The counsel key will not always be present in the dictionary.
            if 'counsel' not in party:
                # If it's not, we skip this party and move on to the next one.
                continue
            for counsel in party['counsel']:

                # We create the new row we want to add as a dictionary.
                # Using .get() allows us to specify the key that we want, and specify a default value as the second argument in
                # case the key doesn't exist.
                new_attorneysAndFirms_row = {
                    'Docket Number': result.get('docket', None),
                    'Court Name': result.get('court', None),
                    'Attorney Name': counsel.get("name", None),
                    'Attorney Firm': counsel.get("firm", None),
                    'Attorney Email': counsel.get("email", None),
                    'Attorney Phone': counsel.get("phone", None),
                }

                # We append the new row (represented as a dictionary) to the global dataframe.
                # ignore_index=True resets the index of the resulting dataframe instead of preserving row labels.
                global attorneysAndFirms
                appender = attorneysAndFirms.append(new_attorneysAndFirms_row,
                                                    ignore_index=True)

                # When we append a dataframe, the original is not changed, rather a new version of the dataframe with the added row is generated.
                # We replace the original with the new version so our changes get saved.
                attorneysAndFirms = appender

    if input_csv is not None:
        # The path to the input spreadsheet is the path that the user specified in the main menu.

        # The path where the JSON files will be downloaded to is the path that the user specified in the main menu.
        JSON_INPUT_OUTPUT_PATH = global_variables.JSON_INPUT_OUTPUT_PATH

        # The client matter is the string that the user specified in the main menu.
        CLIENT_MATTER = global_variables.CLIENT_MATTER

        IS_CACHED = global_variables.IS_CACHED

        # This list starts out empty, gets a tuple appended to it with every iteration of the loop below, and will eventually
        # be the value returned by this function.
        output_list_of_tuples = []

        try:
            # We try to open the csv as a pandas dataframe. Pandas dataframes make working with tabular data in python faster and easier.
            df = pd.read_csv(input_csv)

        except Exception as e:
            # If there are any errors with opening the dataframe, we print the data to the console to alert the user.
            print(f"{e}")
            input()

        searchResults = []
        # We loop through every row of the input spreadsheet, the row value allows us to access each value in each row through indexing.
        searching_from_csv_bar = Bar("Reading CSV, Querying Docket Alarm...",
                                     max=df.shape[0])
        for index, row in df.iterrows():
            # We use indexing to store each value in the appropriate variables so they are more human-readable.
            caseName = row[0]
            caseNo = row[1]
            caseCourt = row[2]
            # We place the values into a tuple that will serve as parameters for download_json_from_list_of_tuples()
            # when we call it inside the thread_download_json() wrapper.

            query = f"is:docket court:({caseCourt}) docket:({caseNo})"
            user = login.Credentials()
            searchResult = user_tools.search_docket_alarm(
                (user.username, user.password),
                query,
                limit=1,
                result_order=result_order)
            searchResults += searchResult
            searching_from_csv_bar.next()
        searching_from_csv_bar.finish()

    else:

        # After defining all of our nested functions, this is where the main body of query_to_tables() begins.

        # First we let the user know to wait, so they don't press keys that would be captured by the upcoming input prompt.
        print("\n")
        print("Querying, please wait...")
        # We create our user object to log in. We can use attributes and methods to access the username, password, and authentication token of our currently signed in user.
        user = login.Credentials()

        # We run our search, using the query, the number of results, and the order that the user specified in the menu.
        searchResults = user_tools.search_docket_alarm(
            (user.username, user.password),
            query,
            limit=results_limit,
            result_order=result_order)

        searchResults = searchResults[0:results_limit]

    # We let the user know how many results were returned for their search and ask them to confirm to proceed.
    print(
        f"\nThis search query resulted in {len(searchResults)} results. Proceed? [Y/n]"
    )

    # We store their answer in a variable.
    user_proceed_choice = input()

    # If the user says no...
    if user_proceed_choice.lower() == "n":
        # We do not proceed. The user is returned to the menu.
        menus.spreadsheet_generator_menu()
    # If they answer something other than y or n (yes or no)...
    elif user_proceed_choice.lower() != "y" and user_proceed_choice.lower(
    ) != "n":
        # We let them know their response was invalid...
        print("Invalid response. Returning to menu.")
        # We pause the script until they press enter, so we know they're aware of what's happening...
        input()
        # And we return them to the menu.
        menus.spreadsheet_generator_menu()
    # If the user answers Y (yes), then the script continues.
    menus.clear()

    # We clear the menu and display ascii art in red.
    print(Fore.RED + menus.msg2)

    # We are about to initialize our progress bar. When we do this, we need to specify the maximum number of loops that the
    # progress bar is tracking. This gets passed as an argument.
    progressbar_maximum = len(searchResults)

    # We initialize our progress bar, specifying the text that will be displayed alongside the bar, and the maximum amount of loops
    # the bar will track.
    bar = Bar('Generating CSVs', max=progressbar_maximum)

    # The search results that are returned are a list of dictionaries. We begin to iterate through them.
    for result in searchResults:

        # We use the get_docket() function to return the docket data for every result in our search query.
        # To pull the docket, we specify the docket number and the court. We specify if the data is cached or uncached, and what the client matter is.
        docket = user_tools.get_docket(
            user.authenticate(),
            result['docket'],
            result['court'],
            cached=global_variables.IS_CACHED,
            client_matter=global_variables.CLIENT_MATTER)

        # Through every iteration over our results, we pass the result data and the docket data for each result to the
        # nested functions we defined at the beginning of this function. The dataframes that are declared as global variables at the
        # top of this module are appended with new data with each iteration.
        fill_docketInformation(result, docket)
        fill_docketEntries(result, docket)
        fill_parties(result, docket)
        fill_attorneysAndFirms(result, docket)

        # With each iteration, we move our progress bar forward until it hits its maximum.
        bar.next()

    # We get the current date and time to use in the name of the output folder we will generate. This helps us generate
    # unique folder names each time we run the script.
    timeNow = datetime.datetime.now().strftime("%I%M%p %B %d %Y")

    # The complete name of the folder will be the search entered, followed by the current time.
    # We use the cleanhtml function to remove any characters that are not allowed in file or folder names.
    # cleanhtml() is imported from get_pdfs.py.
    if input_csv is None:
        containing_folder_name = f"{cleanhtml(query)} - {timeNow}"
    else:
        containing_folder_name = f"{timeNow}"

    # We put together the absolute path to the folder we want to create and populate it.
    output_directory = os.path.join(output_path, containing_folder_name)

    # We check to see if the folder already exists...
    if not os.path.exists(output_directory):
        # If it doesn't, we create it.
        os.makedirs(output_directory)

    # We create strings for the absolute paths to each individual csv file we will be creating, with the .csv extension included.
    docketInformation_outputFile = os.path.join(output_directory,
                                                "docketInformation.csv")
    docketEntries_outputFile = os.path.join(output_directory,
                                            "docketEntries.csv")
    parties_outputFile = os.path.join(output_directory, "parties.csv")
    attorneysAndFirms_outputFile = os.path.join(output_directory,
                                                "attorneysAndFirms.csv")

    # We use the .to_csv() method on our dataframe object to save the filled out dataframes to csv files at the paths we specified above.
    # index=False specifies that we do not want to generate a numerical index column.
    docketInformation.to_csv(docketInformation_outputFile, index=False)
    docketEntries.to_csv(docketEntries_outputFile, index=False)
    parties.to_csv(parties_outputFile, index=False)
    attorneysAndFirms.to_csv(attorneysAndFirms_outputFile, index=False)

    # We set the progress bar to its completed state.
    bar.finish()
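# removehtml() and cleanhtml() are imported from get_pdfs.py in the real project;
# the sketches below only illustrate the behaviour the comments above describe
# (stripping HTML tags from docket entry text, and removing characters that are not
# allowed in file or folder names). They are assumptions, not the project's actual
# implementations.
import re


def removehtml(raw_html):
    """Strip HTML tags from docket entry contents; pass None through unchanged."""
    if raw_html is None:
        return None
    return re.sub(r'<[^>]+>', '', raw_html)


def cleanhtml(text):
    """Drop characters that are not allowed in Windows/Unix file or folder names."""
    return re.sub(r'[\\/:*?"<>|]', '', text)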
Beispiel #51
0
def formatOTUtableData(OTU_table, max_level=14, tax_reassign_list=[]):
    '''This function reads in and formats an imported raw OTU (or ASV) table by adding \
        taxonomy data.

    Parameters
    ----------
    OTU_table : pandas.DataFrame
        This is the raw imported OTU (or ASV) table with
            index:  OTU IDs
            header: a list of sample names followed by 'taxonomy' at the end
    max_level : int (optional)
        This is the maximum taxonomic level present in the dataset. \
            The default is 14 (i.e., 'D_14__').
    tax_reassign_list : dict (optional)
        Maps taxonomic names in the dataset to the values they should \
            be reassigned to. The default is empty (no reassignment).

    Returns
    -------
    OTU_table : pandas.DataFrame
        Formatted data as a DataFrame.
        New headers include a full taxonomic breakdown
    samples : list
        List of samples in the dataset.

    '''
    OTU_table = OTU_table.copy()
    # Get sample list
    samples = list(OTU_table.columns)[0:-1]

    # Reclassify any values with assignments in the tax_reassign_list
    if tax_reassign_list:
        for val in list(tax_reassign_list):
            OTU_table.loc[OTU_table['taxonomy'] == val,
                          'taxonomy'] = tax_reassign_list[val]

    # Format taxonomy list to read better
    print('Formatting taxonomy...')
    for i in np.arange(max_level + 1):
        delstr = 'D_' + str(i) + '__'
        OTU_table['taxonomy'] = OTU_table['taxonomy'].str.replace(delstr, '')

    # Break taxmap into levels
    taxlist = getUnique(OTU_table['taxonomy'])
    bar = Bar('', max=len(taxlist))
    for value in taxlist:
        splitlist = [value]

        # Get list of levels
        if '; __' in value:
            splitlist = value.split('; __')
        elif '; ' in value:
            splitlist = value.split('; ')
        # Fix last level if needed
        if splitlist[-1]:
            if splitlist[-1][-1] == ';':
                splitlist[-1] = splitlist[-1][0:-1]
        else:
            splitlist = splitlist[0:-1]

        for L in range(1, min(len(splitlist) + 1, max_level + 1)):
            OTU_table.loc[OTU_table['taxonomy'] == value,
                          'L' + str(L)] = splitlist[L - 1]

        bar.next()
    bar.finish()

    # Get rid of nans in taxonomy levels
    end_level = len(OTU_table.columns) - len(samples) - 1
    cols = levelCols(end_level)
    OTU_table[cols] = OTU_table[cols].replace(np.nan, '')

    # Convert values to float
    OTU_table[samples] = OTU_table[samples].astype(float)

    return OTU_table, samples
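# getUnique() and levelCols() are helper functions defined elsewhere in this module;
# plausible sketches consistent with how they are used above (unique taxonomy
# strings, and the list of level column names 'L1'..'Ln') are shown below as
# assumptions.
import pandas as pd


def getUnique(series):
    """Unique values of a pandas Series as a plain Python list."""
    return list(pd.unique(series))


def levelCols(end_level):
    """Column names for taxonomic levels 1..end_level, e.g. ['L1', 'L2', ...]."""
    return ['L' + str(i) for i in range(1, end_level + 1)]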
Beispiel #52
0
def train(train_loader, model, criterion, optimizer, epoch, use_cuda):
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    bar = Bar('Processing', max=len(train_loader))
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = torch.autograd.Variable(
            inputs), torch.autograd.Variable(targets)

        # compute output
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))

        # compute gradient
        optimizer.zero_grad()
        if args.half:
            with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
            #     scaled_loss.backward()
        else:
            loss.backward()
        # do SGD step
        optimizer.step()

        if not args.linear_quantization:
            kmeans_update_model(model,
                                quantizable_idx,
                                centroid_label_dict,
                                free_high_bit=args.free_high_bit)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # plot progress
        if batch_idx % 1 == 0:
            bar.suffix = \
                '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | ' \
                'Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'.format(
                    batch=batch_idx + 1,
                    size=len(train_loader),
                    data=data_time.val,
                    bt=batch_time.val,
                    total=bar.elapsed_td,
                    eta=bar.eta_td,
                    loss=losses.avg,
                    top1=top1.avg,
                    top5=top5.avg,
                )
            bar.next()
    bar.finish()
    return losses.avg, top1.avg
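# AverageMeter, accuracy(), kmeans_update_model() and the args/quantizable_idx
# globals come from the surrounding training script. AverageMeter is the usual
# running-average helper; a minimal sketch (an assumption, not the project's exact
# class) is shown below.
class AverageMeter(object):
    """Tracks the current value, running sum, count and average of a metric."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count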
#!/usr/bin/python
# encoding: utf-8
# -*- coding: utf8 -*-
"""
Created by PyCharm.
File:               LinuxBashShellScriptForOps:progressOps.py
User:               Guodong
Create Date:        2016/12/7
Create Time:        0:13
 """
# https://pypi.python.org/pypi/progress/1.2
# pip used
from progress.bar import Bar
import time

bar = Bar('Processing', max=20)
for i in range(20):
    # Do some work
    time.sleep(1)
    bar.next()
bar.finish()
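# A slightly richer variant of the same demo: progress.bar.Bar also accepts a suffix
# format string (as used in other snippets in this collection), so the bar can show
# progress counts and an ETA. The shorter sleep time here is purely for illustration.
from progress.bar import Bar
import time

bar = Bar('Processing', max=20, suffix='%(index)d/%(max)d [%(eta_td)s]')
for i in range(20):
    time.sleep(0.1)
    bar.next()
bar.finish()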
Beispiel #54
0
def fastphot(SC_MAP, PSF_MAP, NOISE_MAP, Catalog, nb_process=4):
    """
    Return flux of sources associated to given positions 

    Parameters
    ----------
    SC_MAP : numpy masked array.
        The SCientific MAP.
    PSF_MAP : numpy array
        The Point Spread Function MAP.
    NOISE_MAP : numpy masked array
        The Signal/Noise MAP.
    Catalog : numpy structured and masked array
        The source catalog.
        It must contain at least the source positions as input.
        The function completes it by filling in the measured source fluxes.
    nb_process : integer
        number of independent CPU(s) used to build the A matrix and B vector
        by default we assume nb_process = 4
        
    Returns
    -------
    Catalog : numpy structured and masked array
        The input catalog updated with the measured source fluxes and uncertainties.
    bkg : float
        The background level
    RESIDUAL_MAP : numpy masked array
        The residual map (SC_MAP - MODEL_MAP)
    """
    #
    print('> PHOT')
    #
    # extract some information about maps and sources
    SC_MAP_npix_x, SC_MAP_npix_y = SC_MAP.shape
    PSF_MAP_npix_x, PSF_MAP_npix_y = PSF_MAP.shape
    #
    # Compress the input catalog to remove masked sources
    N_src = len(npy.ma.compressed(Catalog['ID']))
    #
    # SC_MAP and NOISE_MAP have to be immersed in a "full" MAP
    # taking into account a half PSF size on the edges
    edge_x = int(math.floor(PSF_MAP_npix_x / 2))
    edge_y = int(math.floor(PSF_MAP_npix_y / 2))
    x_i = edge_x
    x_f = x_i + SC_MAP_npix_x
    y_i = edge_y
    y_f = y_i + SC_MAP_npix_y
    # SC_MAP
    SC_full_MAP = npy.zeros(
        [SC_MAP_npix_x + 2 * edge_x, SC_MAP_npix_y + 2 * edge_y])  # create
    SC_full_MAP[x_i:x_f, y_i:y_f] = SC_MAP  # immerse
    # NOISE_MAP
    NOISE_full_MAP = npy.zeros(
        [SC_MAP_npix_x + 2 * edge_x, SC_MAP_npix_y + 2 * edge_y])  # create
    NOISE_full_MAP[x_i:x_f, y_i:y_f] = NOISE_MAP  # immerse
    #
    # Create the mask
    MASK = (NOISE_full_MAP <= 0.e0)
    #
    # Convert SC_MAP and NOISE_MAP in masked array
    SC_full_MAP = npy.ma.array(SC_full_MAP, mask=MASK)
    NOISE_full_MAP = npy.ma.array(NOISE_full_MAP, mask=MASK)
    #
    # Init B and F vectors and A matrix
    B = npy.zeros(N_src + 1)
    A = npy.zeros([N_src + 1, N_src + 1])
    F = npy.zeros(N_src + 1)
    #
    # Build Vectors and Matrix
    t_start = time()
    print(' > Build Vectors and Matrix')
    pool = mp.Pool(processes=nb_process)
    # (i, Bi, Ai_, A_)
    X_pos = npy.ma.compressed(Catalog['x_pos'])
    Y_pos = npy.ma.compressed(Catalog['y_pos'])
    R = [
        pool.apply_async(Coef_i,
                         args=(SC_full_MAP, NOISE_full_MAP, PSF_MAP, X_pos,
                               Y_pos, si)) for si in range(N_src)
    ]
    # Reformat result, build A and B
    bar = Bar(' >', max=N_src)
    for ri in R:
        bar.next()
        r_i = ri.get()
        B[r_i[0]] = r_i[1]
        A[r_i[0], r_i[0]:N_src] = r_i[2]
        A[r_i[0]:N_src, r_i[0]] = r_i[2]
        A[r_i[0]][N_src] = r_i[3]
        A[N_src][r_i[0]] = r_i[3]
    bar.finish()
    # Complete
    B[N_src] = npy.nansum(SC_full_MAP / NOISE_full_MAP**2.)
    A[N_src][N_src] = npy.nansum(NOISE_full_MAP**(-2.))
    #
    # Solve system
    print(' > Solve system')
    F = npy.linalg.solve(A, B)
    print(' > Compute uncertainties')
    dF = npy.diag(npy.linalg.inv(A[:N_src, :N_src]))
    t_end = time()
    #
    # Update FLux field in the catalog
    Catalog['flux'][~Catalog['ID'].mask] = F[:N_src] - npy.ones(len(
        F[:N_src])) * F[N_src]
    Catalog['dflux'][~Catalog['ID'].mask] = npy.sqrt(dF)
    #
    # Build residual MAP
    print(' > Build Residual Map')
    RESIDUAL_MAP = SC_MAP - model_MAP(SC_MAP, PSF_MAP, Catalog)
    #
    analysis_time = t_end - t_start
    m = int(math.floor(analysis_time / 60.))
    s = analysis_time - m * 60
    print(
        ' > %4.4i source(s) analysed in %3.3i min %3.1f sec [%5.3f sec / src]'
        % (N_src, m, s, analysis_time / float(N_src)))
    print('> DONE')
    return Catalog, F[N_src], RESIDUAL_MAP
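# Coef_i() and model_MAP() are defined elsewhere in this module. Coef_i builds one
# row of the normal equations of the simultaneous PSF-fitting problem (A.F = B).
# The sketch below only illustrates that idea, with a hypothetical stamp_psf()
# helper that immerses the PSF at a source position in the full frame; it is an
# assumption, not the module's actual implementation.
import numpy as npy


def stamp_psf(PSF_MAP, full_shape, x, y):
    """Immerse the PSF in a zero map of full_shape; with the half-PSF edges added
    by fastphot(), slicing from (x, y) centres it on pixel (x, y) of the original map."""
    psf_full = npy.zeros(full_shape)
    psf_full[int(x):int(x) + PSF_MAP.shape[0],
             int(y):int(y) + PSF_MAP.shape[1]] = PSF_MAP
    return psf_full


def Coef_i(SC_full_MAP, NOISE_full_MAP, PSF_MAP, X_pos, Y_pos, si):
    """Row si of the chi^2 normal equations A.F = B for simultaneous PSF photometry."""
    w = NOISE_full_MAP ** (-2.)                       # inverse-variance weights
    psf_i = stamp_psf(PSF_MAP, SC_full_MAP.shape, X_pos[si], Y_pos[si])
    B_i = npy.nansum(SC_full_MAP * psf_i * w)         # data . model_i
    A_row = npy.array([
        npy.nansum(psf_i * stamp_psf(PSF_MAP, SC_full_MAP.shape,
                                     X_pos[sj], Y_pos[sj]) * w)
        for sj in range(si, len(X_pos))               # model_i . model_j, j >= si
    ])
    A_bkg = npy.nansum(psf_i * w)                     # coupling with the flat background
    return si, B_i, A_row, A_bkg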
Beispiel #55
0
    def run_epoch(self, phase, epoch, data_loader):
        model_with_loss = self.model_with_loss
        if phase == 'train':
            model_with_loss.train()
        else:
            if len(self.opt.gpus) > 1:
                model_with_loss = self.model_with_loss.module
            model_with_loss.eval()
            torch.cuda.empty_cache()

        opt = self.opt
        results = {}
        data_time, batch_time = AverageMeter(), AverageMeter()
        avg_loss_stats = {l: AverageMeter() for l in self.loss_stats}
        num_iters = len(data_loader) if opt.num_iters < 0 else opt.num_iters
        bar = Bar('{}/{}'.format(opt.task, opt.exp_id), max=num_iters)
        end = time.time()
        for iter_id, batch in enumerate(data_loader):
            if iter_id >= num_iters:
                break
            data_time.update(time.time() - end)

            for k in batch:
                if k != 'meta':
                    batch[k] = batch[k].to(device=opt.device,
                                           non_blocking=True)
            output, loss, loss_stats = model_with_loss(batch)
            loss = loss.mean()
            # print("orignal code loss is: ", loss)
            if phase == 'train':
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            batch_time.update(time.time() - end)
            end = time.time()

            Bar.suffix = '{phase}: [{0}][{1}/{2}]|Tot: {total:} |ETA: {eta:} '.format(
                epoch,
                iter_id,
                num_iters,
                phase=phase,
                total=bar.elapsed_td,
                eta=bar.eta_td)
            for l in avg_loss_stats:
                avg_loss_stats[l].update(loss_stats[l].mean().item(),
                                         batch['input'].size(0))
                Bar.suffix = Bar.suffix + '|{} {:.4f} '.format(
                    l, avg_loss_stats[l].avg)
            if not opt.hide_data_time:
                Bar.suffix = Bar.suffix + '|Data {dt.val:.3f}s({dt.avg:.3f}s) ' \
                  '|Net {bt.avg:.3f}s'.format(dt=data_time, bt=batch_time)
            if opt.print_iter > 0:
                if iter_id % opt.print_iter == 0:
                    print('{}/{}| {}'.format(opt.task, opt.exp_id, Bar.suffix))
            else:
                bar.next()

            if opt.debug > 0:
                self.debug(batch, output, iter_id)

            if opt.test:
                self.save_result(output, batch, results)
            del output, loss, loss_stats

        bar.finish()
        ret = {k: v.avg for k, v in avg_loss_stats.items()}
        ret['time'] = bar.elapsed_td.total_seconds() / 60.
        return ret, results
def SVDC_heatmap_generatorv1(df,
                             period_of_interest,
                             prediction_year=2012,
                             epidemic_classification_dict=None,
                             training_year_window='ALL',
                             t0_vector=None,
                             p_vector=None,
                             classifier='SVM',
                             modes=[0],
                             verbose=False):
    '''
    - p_vector, t0_vector: candidate period lengths and start dates that define the heatmap grid
    - period_of_interest: tuple with the initial and final dates that contain the period of interest (poi).
      The period of interest defines the starting and finishing dates for the SVD classifier.
      e.g. if the poi is 01-02-YYYY through 28-02-YYYY, the SVD classifier's heatmap will start on 28-02 of the
      previous year and end on 01-02 of the next year.
    - prediction_year: the year to classify
    - epidemic_classification_dict: dictionary with integer year keys, e.g. {2001: 1, 2002: 0, 2003: 1}
    '''

    #Generate grid based on p and t0 vectors
    distance_grid = np.zeros([len(p_vector), len(t0_vector)])

    years = []
    for i in range(df.index.shape[0]):
        years.append(df.index[i].year)
    years = sorted(list(set(years)))

    years_before_prediction = years.index(prediction_year)

    if training_year_window == 'ALL':
        training_years = years[0:years_before_prediction]
        n_years = years_before_prediction
    elif training_year_window < years_before_prediction:
        training_years = years[years_before_prediction -
                               training_year_window:years_before_prediction]
        n_years = training_year_window
    else:
        print(
            "Can't retrieve training window: {0}. Please make sure the training window is 'ALL' or an integer "
            "smaller than the number of available training years.".format(training_year_window))

    if verbose:
        print('{0} years detected within dataframe: {1}.'.format(
            len(years), years))
        print('{0} Years before prediction: {1}'.format(
            n_years, training_years))

    # check if t0 dates are within
    dates_within_poi = []
    for d in t0_vector:
        if '{0}'.format(prediction_year) + d[4:] in df[
                period_of_interest[0]:period_of_interest[1]].index:
            dates_within_poi.append(d)

    if len(dates_within_poi) > 0:
        print(
            '{0} dates from t0_vector are inside period_of_interest range: {1}'
            .format(len(dates_within_poi), dates_within_poi))

    #Enter main loop
    print('Initiating heatmap loop.')
    bar = Bar('Processing', max=len(p_vector))
    for i, p in enumerate(p_vector):
        bar.next()
        for j, t0 in enumerate(t0_vector):

            if verbose: print('Reshaping data')
            X = SVDC_reshape_yearly_data_stolerman(df=df, t0=t0, p=p,\
                                                   years=training_years, \
                                                   upper_bound=period_of_interest[0],\
                                                   normalize=True, verbose=False)

            if verbose: print('Reshaping data done')
            '''
            Each column of X represents one year of data in the order of years_before_prediction. If we want our classification at year Y
            we need Y-1 as the out-of-sample input and Y-2, Y-3...1 as our training dataset. As we're trying to classify every Y with previous year data, we also assign
            the epidemic classification of year Y to the label for Y-1
            '''
            if X is not None:

                X_train = X[:, :-1]
                X_predict = X[:, -1]
                Y_train = []
                # Could be computed outside the loop; kept here for readability
                for year in training_years[:-1]:
                    Y_train.append(epidemic_classification_dict[year + 1])

                Y_train = np.vstack(Y_train)
                Y_predict = epidemic_classification_dict[prediction_year]

                # Perform svd
                U, sigma, VT = svd(X_train,
                                   n_components=3,
                                   n_iter=15,
                                   random_state=None)
                projections = sigma.reshape([-1, 1]) * VT
                projections = projections.T
                projections = projections[:, modes]
                '''
                Now that we have our projections from the SVD, we can create the classifier
                '''
                mod = svm.SVC(kernel='rbf',
                              gamma=1,
                              C=1,
                              cache_size=400,
                              max_iter=100000)
                if verbose:
                    print('Fitting with projections shape {0} and target shape {1}'.format(
                        projections.shape, Y_predict))
                mod.fit(projections, Y_train.ravel())
                pred = mod.predict(
                    np.matmul(X_predict.reshape([1, -1]), U[:, modes]))

                distance_grid[i, j] = (pred == Y_predict)
            else:
                distance_grid[i, j] = -1
    bar.finish()
    return distance_grid
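# The svd() call above, with its n_components/n_iter/random_state keywords and the
# (U, sigma, VT) return triple, matches scikit-learn's randomized_svd; the imports
# below are therefore assumptions about what this module uses, alongside the other
# referenced helpers (SVDC_reshape_yearly_data_stolerman, the svm module).
from sklearn.utils.extmath import randomized_svd as svd
from sklearn import svm
import numpy as np

# Example of the projection convention used above: for a matrix whose columns are
# training years, sigma.reshape([-1, 1]) * VT gives the per-year projections onto
# the leading modes. X_demo is a purely illustrative random matrix.
X_demo = np.random.rand(52, 10)                  # 52 weekly features x 10 training years
U, sigma, VT = svd(X_demo, n_components=3, n_iter=15, random_state=0)
projections = (sigma.reshape([-1, 1]) * VT).T    # shape: (10 years, 3 modes)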
Beispiel #57
0
        # compute for next stop
        herbie.recompute(mgr.city)
        herbie.plutocracy()
        if mgr.verbose: print()

    # first loop only
    if herbie.director is False:
        herbie.recompute(mgr.city)
        herbie.plutocracy()
        if not mgr.auto:
            mode = input("enter 'auto' to disable prompts: ")
            mgr.auto = True if mode == 'auto' else False

    # status
    if mgr.verbose:
        herbie.status()
        print()

    # travel in time and relative dimensions in space
    if len(herbie.requests) > 0:
        mgr.advance(herbie.requests)
        herbie.move(mgr.city, mgr.step)
        if not mgr.auto:
            nt = input("next turn? [enter 'auto' to disable prompts] ")
            mgr.auto = True if nt == 'auto' else False

if not mgr.verbose: bar.finish()
print('FINISHED! OFF DUTY!\n')
herbie.queue(herbie.complete)
### END
def validation(model, val_loader, epoch, writer):
    # set evaluate mode
    model.eval()

    total_correct, total_label = 0, 0
    total_correct_hb, total_label_hb = 0, 0
    total_correct_fb, total_label_fb = 0, 0
    hist = np.zeros((args.num_classes, args.num_classes))
    hist_hb = np.zeros((args.hbody_cls, args.hbody_cls))
    hist_fb = np.zeros((args.fbody_cls, args.fbody_cls))

    # Iterate over data.
    bar = Bar('Processing {}'.format('val'), max=len(val_loader))
    bar.check_tty = False
    for idx, batch in enumerate(val_loader):
        image, target, hlabel, flabel, _ = batch
        image, target, hlabel, flabel = image.cuda(), target.cuda(
        ), hlabel.cuda(), flabel.cuda()
        with torch.no_grad():
            h, w = target.size(1), target.size(2)
            outputs = model(image)
            outputs = gather(outputs, 0, dim=0)
            preds = F.interpolate(input=outputs[0][-1],
                                  size=(h, w),
                                  mode='bilinear',
                                  align_corners=True)
            preds_hb = F.interpolate(input=outputs[1][-1],
                                     size=(h, w),
                                     mode='bilinear',
                                     align_corners=True)
            preds_fb = F.interpolate(input=outputs[2][-1],
                                     size=(h, w),
                                     mode='bilinear',
                                     align_corners=True)
            if idx % 50 == 0:
                img_vis = inv_preprocess(image, num_images=args.save_num)
                label_vis = decode_predictions(target.int(),
                                               num_images=args.save_num,
                                               num_classes=args.num_classes)
                pred_vis = decode_predictions(torch.argmax(preds, dim=1),
                                              num_images=args.save_num,
                                              num_classes=args.num_classes)

                # visual grids
                img_grid = torchvision.utils.make_grid(
                    torch.from_numpy(img_vis.transpose(0, 3, 1, 2)))
                label_grid = torchvision.utils.make_grid(
                    torch.from_numpy(label_vis.transpose(0, 3, 1, 2)))
                pred_grid = torchvision.utils.make_grid(
                    torch.from_numpy(pred_vis.transpose(0, 3, 1, 2)))
                writer.add_image('val_images', img_grid,
                                 epoch * len(val_loader) + idx + 1)
                writer.add_image('val_labels', label_grid,
                                 epoch * len(val_loader) + idx + 1)
                writer.add_image('val_preds', pred_grid,
                                 epoch * len(val_loader) + idx + 1)

            # pixelAcc
            correct, labeled = batch_pix_accuracy(preds.data, target)
            correct_hb, labeled_hb = batch_pix_accuracy(preds_hb.data, hlabel)
            correct_fb, labeled_fb = batch_pix_accuracy(preds_fb.data, flabel)
            # mIoU
            hist += fast_hist(preds, target, args.num_classes)
            hist_hb += fast_hist(preds_hb, hlabel, args.hbody_cls)
            hist_fb += fast_hist(preds_fb, flabel, args.fbody_cls)

            total_correct += correct
            total_correct_hb += correct_hb
            total_correct_fb += correct_fb
            total_label += labeled
            total_label_hb += labeled_hb
            total_label_fb += labeled_fb
            pixAcc = 1.0 * total_correct / (np.spacing(1) + total_label)
            IoU = round(np.nanmean(per_class_iu(hist)) * 100, 2)
            pixAcc_hb = 1.0 * total_correct_hb / (np.spacing(1) +
                                                  total_label_hb)
            IoU_hb = round(np.nanmean(per_class_iu(hist_hb)) * 100, 2)
            pixAcc_fb = 1.0 * total_correct_fb / (np.spacing(1) +
                                                  total_label_fb)
            IoU_fb = round(np.nanmean(per_class_iu(hist_fb)) * 100, 2)
            # plot progress
            bar.suffix = '{} / {} | pixAcc: {pixAcc:.4f}, mIoU: {IoU:.4f} |' \
                         'pixAcc_hb: {pixAcc_hb:.4f}, mIoU_hb: {IoU_hb:.4f} |' \
                         'pixAcc_fb: {pixAcc_fb:.4f}, mIoU_fb: {IoU_fb:.4f}'.format(idx + 1, len(val_loader),
                                                                                    pixAcc=pixAcc, IoU=IoU,
                                                                                    pixAcc_hb=pixAcc_hb, IoU_hb=IoU_hb,
                                                                                    pixAcc_fb=pixAcc_fb, IoU_fb=IoU_fb)
            bar.next()

    print('\n per class iou part: {}'.format(per_class_iu(hist) * 100))
    print('per class iou hb: {}'.format(per_class_iu(hist_hb) * 100))
    print('per class iou fb: {}'.format(per_class_iu(hist_fb) * 100))

    mIoU = round(np.nanmean(per_class_iu(hist)) * 100, 2)
    mIoU_hb = round(np.nanmean(per_class_iu(hist_hb)) * 100, 2)
    mIoU_fb = round(np.nanmean(per_class_iu(hist_fb)) * 100, 2)

    writer.add_scalar('val_pixAcc', pixAcc, epoch)
    writer.add_scalar('val_mIoU', mIoU, epoch)
    writer.add_scalar('val_pixAcc_hb', pixAcc_hb, epoch)
    writer.add_scalar('val_mIoU_hb', mIoU_hb, epoch)
    writer.add_scalar('val_pixAcc_fb', pixAcc_fb, epoch)
    writer.add_scalar('val_mIoU_fb', mIoU_fb, epoch)
    bar.finish()

    return pixAcc, mIoU
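# fast_hist() and per_class_iu() are the usual segmentation-evaluation helpers
# (confusion-matrix accumulation and per-class IoU). The versions below follow a
# common formulation and are shown as an assumption about what this script imports,
# not its exact code; note that the script passes logits to fast_hist(), so this
# sketch takes the argmax internally.
import numpy as np
import torch


def fast_hist(preds, labels, num_classes):
    """Accumulate a num_classes x num_classes confusion matrix from one batch."""
    pred_labels = torch.argmax(preds, dim=1).cpu().numpy().flatten()
    true_labels = labels.cpu().numpy().flatten()
    mask = (true_labels >= 0) & (true_labels < num_classes)
    hist = np.bincount(
        num_classes * true_labels[mask].astype(int) + pred_labels[mask],
        minlength=num_classes ** 2,
    ).reshape(num_classes, num_classes)
    return hist


def per_class_iu(hist):
    """Per-class intersection-over-union from a confusion matrix."""
    return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))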