def __exit__(self, *args):
    self.end = self.timer()
    if self.disable_gc and self.gc_state:
        gc.enable()
    self.interval = self.end - self.start
    if self.verbose:
        print('time taken: %f seconds' % self.interval)
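# For context: a minimal sketch of the timing context manager this __exit__
# appears to belong to. The class name, constructor arguments, and the use of
# timeit.default_timer are illustrative assumptions, not taken from the source.
import gc
import timeit

class GCTimer(object):
    """Times a block of code, optionally with the cyclic GC switched off."""

    def __init__(self, disable_gc=False, verbose=True):
        self.disable_gc = disable_gc
        self.verbose = verbose
        self.timer = timeit.default_timer

    def __enter__(self):
        # Remember whether GC was enabled so __exit__ can restore it.
        self.gc_state = gc.isenabled()
        if self.disable_gc:
            gc.disable()
        self.start = self.timer()
        return self

    def __exit__(self, *args):
        self.end = self.timer()
        if self.disable_gc and self.gc_state:
            gc.enable()
        self.interval = self.end - self.start
        if self.verbose:
            print('time taken: %f seconds' % self.interval)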
def test(self, view):
    """
    Calls the given view and measures the time for it to return. The
    garbage collector is disabled during execution.
    """
    gc_old = gc.isenabled()
    gc.disable()
    try:
        start = timeit.default_timer()
        if view.method == 'GET':
            response = self.client.get(view.url, view.data)
        elif view.method == 'POST':
            response = self.client.post(view.url, view.data)
        else:
            raise ValueError('Unknown view method: %s' % view.method)
        end = timeit.default_timer()
        # Return result in milliseconds
        time_ms = (end - start) * 1000
        # Try to get version information
        version = subprocess.check_output(['git', 'describe'])
        from .models import TestResult
        return TestResult(view=view, time=time_ms, result=response,
                          result_code=response.status_code, version=version)
    finally:
        if gc_old:
            gc.enable()
def getlines(a_filename):
    # it gives chunks
    fin = None
    if a_filename == '-':
        fin = sys.stdin
    else:
        fin = open(a_filename, 'r')
    header = dict()
    first = True
    while True:
        lines = fin.readlines(10**8)
        if not lines:
            break
        gc.disable()
        lines = [line.rstrip('\r\n').split('\t') for line in lines if line.rstrip('\r\n')]
        gc.enable()
        for line in lines:
            if line[0].startswith('@'):
                if line[0].startswith('@SQ') and line[1].startswith('SN:') and line[2].startswith('LN:'):
                    k = line[1][3:]
                    v = int(line[2][3:])
                    header[k] = v
                else:
                    pass
            else:
                if first:
                    first = False
                    yield header
                    header = None
                yield line
    if first and header:
        yield header
    fin.close()
def __exit__(self, exc_type, exc_value, tb):
    gc.collect()
    new_objects = len(gc.get_objects())
    if new_objects > self.old_objects:
        pytest.fail('Example code leaked')
    _gc_lock.release()
    gc.enable()
def test_trashcan(self):
    class Ouch:
        n = 0
        def __del__(self):
            Ouch.n = Ouch.n + 1
            if Ouch.n % 17 == 0:
                gc.collect()

    # "trashcan" is a hack to prevent stack overflow when deallocating
    # very deeply nested tuples etc. It works in part by abusing the
    # type pointer and refcount fields, and that can yield horrible
    # problems when gc tries to traverse the structures.
    # If this test fails (as it does in 2.0, 2.1 and 2.2), it will
    # most likely die via segfault.

    # Note: In 2.3 the possibility for compiling without cyclic gc was
    # removed, and that in turn allows the trashcan mechanism to work
    # via much simpler means (e.g., it never abuses the type pointer or
    # refcount fields anymore). Since it's much less likely to cause a
    # problem now, the various constants in this expensive (we force a lot
    # of full collections) test are cut back from the 2.2 version.
    gc.enable()
    N = 150
    for count in range(2):
        t = []
        for i in range(N):
            t = [t, Ouch()]
        u = []
        for i in range(N):
            u = [u, Ouch()]
        v = {}
        for i in range(N):
            v = {1: v, 2: Ouch()}
    gc.disable()
def load(self):
    try:
        env = Environment.Environment(os.path.join(self.cachedir, "build.config.py"))
    except (IOError, OSError):
        pass
    else:
        if env["version"] < HEXVERSION:
            raise Utils.WafError("Version mismatch! reconfigure the project")
        for t in env["tools"]:
            self.setup(**t)
    try:
        gc.disable()
        f = data = None
        Node.Nodu = self.node_class
        try:
            f = open(os.path.join(self.bdir, DBFILE), "rb")
        except (IOError, EOFError):
            pass
        try:
            if f:
                data = cPickle.load(f)
        except AttributeError:
            if Logs.verbose > 1:
                raise
        if data:
            for x in SAVED_ATTRS:
                setattr(self, x, data[x])
        else:
            debug("build: Build cache loading failed")
    finally:
        if f:
            f.close()
        gc.enable()
def main_measure(data, dataname, agentClassGenerator, params):
    X_POINTS = params[scp.X_POINTS]
    STEP = params[scp.STEP]
    NUM_FOLDS = params[scp.NUM_FOLDS]
    CLASSIFY_TIME = params[scp.CLASSIFY_TIME]
    LEARN_TIME = params[scp.LEARN_TIME]
    SEED = params[scp.SEED]
    num_features_arr = [i * STEP for i in range(1, X_POINTS + 1)]
    print "\n============= Learning Curve ==================="
    print "Evaluating", dataname
    print "num_features,", "accuracy%,"
    results = []
    for num_features in num_features_arr:
        agentClass = agentClassGenerator(num_features)
        try:
            gc.disable()
            confusion = AgentAnalyzer().run_one(data, agentClass, CLASSIFY_TIME, LEARN_TIME,
                                                num_folds=NUM_FOLDS, seed=SEED)
            gc.enable()
            gc.collect()
            idf = s_common.idf(NUM_FOLDS, data, num_features)
            results.append((num_features, confusion, idf))
            print num_features, ',', confusion.getAccuracyStr()
        except Exception, e:
            print "Error:", e
            print " Possible Timeout for", num_features
def scrape_links(delay=1, savelimit=100):
    gc.enable()
    movies, full_movies, count = load_json('links.json'), [], 1
    for movie in movies:
        if count < 4601:
            print count
            count += 1
            continue
        time.sleep(delay)
        full_data = scrape_movie_page(movie['link'])
        movie.update(full_data)
        full_movies.append(movie)
        print count, movie['name'], movie['year'], movie['revenue']
        count += 1
        if count % savelimit == 0:
            rank = [str(count - savelimit), '-', str(count)]
            path = 'data/movies' + ' '.join(rank) + '.json'
            save_json(full_movies, path)
            full_movies = []
            gc.collect()
            print '%s ranked movies saved' % ' '.join(rank)
def _exitfunc(cls):
    # At shutdown invoke finalizers for which atexit is true.
    # This is called once all other non-daemonic threads have been
    # joined.
    reenable_gc = False
    try:
        if cls._registry:
            import gc
            if gc.isenabled():
                reenable_gc = True
                gc.disable()
            pending = None
            while True:
                if pending is None or finalize._dirty:
                    pending = cls._select_for_exit()
                    finalize._dirty = False
                if not pending:
                    break
                f = pending.pop()
                try:
                    # gc is disabled, so (assuming no daemonic
                    # threads) the following is the only line in
                    # this function which might trigger creation
                    # of a new finalizer
                    f()
                except Exception:
                    sys.excepthook(*sys.exc_info())
                assert f not in cls._registry
    finally:
        # prevent any more finalizers from executing during shutdown
        finalize._shutdown = True
        if reenable_gc:
            gc.enable()
def main():
    from optparse import OptionParser
    import Zope
    gc.enable()
    app = Zope.app()
    parser = OptionParser()
    parser.add_option('-u', '--user', dest='username', default='admin')
    parser.add_option('-p', '--path', dest='path', default='')
    parser.add_option('-o', '--output', dest='output', default='')
    parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
                      default=False)
    parser.add_option('-i', '--ignore', dest='ignored_types', action='store',
                      default=IGNORED_TYPES,
                      help="Provide comma separated List of Portal Types "
                           "to ignore")
    parser.add_option('-b', '--batch_size', dest='batch_size', default=0)
    parser.add_option('-s', '--batch_start', dest='batch_start', default=0)
    options, args = parser.parse_args()
    options.app = app
    if isinstance(options.ignored_types, basestring):
        options.ignored_types = options.ignored_types.split(',')
    options.batch_start = int(options.batch_start)
    options.batch_size = int(options.batch_size)
    export_site(app, options)
    transaction.commit()
def testNoReferenceCyclesAfterCall(self):

    class ChildNetwork(network.Network):

        def __init__(self, name=None):
            super(ChildNetwork, self).__init__(name=name)

        def call(self, x):
            return x * 2.

    class ParentNetwork(network.Network):

        def __init__(self, name=None):
            super(ParentNetwork, self).__init__(name=name)
            self.l1 = self.track_layer(ChildNetwork())

        def call(self, x):
            return self.l1(x)

    one = constant_op.constant([[1.0]])
    gc.disable()
    gc.collect()
    previous_gc_debug_flags = gc.get_debug()
    gc.set_debug(gc.DEBUG_SAVEALL)
    preexisting = len(gc.garbage)
    net = ParentNetwork()
    net(one)
    del net
    gc.collect()
    # There should be no additional garbage requiring collection.
    self.assertEqual(preexisting, len(gc.garbage))
    gc.set_debug(previous_gc_debug_flags)
    gc.enable()
def get_data(n, s, e, metadata=False):
    gc.enable()  # turn on garbage collection
    pop, comments = tools.date_range_sample(n, s, e)
    print 'Loaded %d comments' % pop
    print 'Random sample of %d from date range' % n
    gc.collect()
    print 'Garbage Collection complete'
    features, metafeatures, labels = [], [], []
    for c in comments:
        text = c['commentBody']  # text is the feature data
        features.append(text.encode('ascii', 'ignore'))
        labels.append(discretize_r(c['recommendationCount']))
        if metadata:
            c_sec = s_codes[c['section']]
            c_wc = discretize(int(c['wordcount']), wrdcnt)
            c_rnk = discretize(int(c['timeRank']), trnk)
            c_elp = discretize(int(c['elapsedTime']), eTime)
            c_pol = discretize(get_polarity(c['sentiment']), plrty)
            metafeatures.append([c_wc, c_rnk, c_elp, c_sec])
    print 'Extracted text (features) and class labels'
    if not metadata:
        return (features, labels)
    else:
        return (features, metafeatures, labels)
def reads_from_fastq_file(file_name, size_read_buffer=10**8):
    fid = None
    if file_name == '-':
        fid = sys.stdin
    elif file_name.lower().endswith('.gz'):
        fid = gzip.open(file_name, 'r')
    else:
        fid = open(file_name, 'r')
    piece = [None, None, None, None]
    ij = 0
    while True:
        gc.disable()
        lines = fid.readlines(size_read_buffer)
        gc.enable()
        if not lines:
            break
        for line in lines:
            ij = ij + 1
            piece[ij - 1] = line
            if ij == 4:
                bucket = (piece[0].rstrip('\r\n')[1:],
                          piece[1].rstrip('\r\n'),
                          piece[3].rstrip('\r\n'))
                yield bucket
                piece = [None, None, None, None]
                ij = 0
    fid.close()
def reads_from_fastq_file(f_name, size_read_buffer=10**8):
    fid = None
    if f_name == '-':
        fid = sys.stdin
    elif f_name.lower().endswith('.gz'):
        fid = gzip.open(f_name, 'r')
    else:
        fid = open(f_name, 'r')
    j = 0
    p1 = None
    p2 = None
    while True:
        gc.disable()
        lines = fid.readlines(size_read_buffer)
        gc.enable()
        if not lines:
            break
        for a_line in lines:
            j = j + 1
            if j == 1:
                p1 = a_line
            elif j == 2:
                p2 = a_line
            elif j == 4:
                yield (p1, p2, a_line)
                p1 = None
                p2 = None
                j = 0
    fid.close()
def add_line(self, line):
    gc.disable()
    self.data.append(line)
    gc.enable()
    self.size = self.size + len(line)
    if self.size > self.size_buffer:
        self.__write_buffer()
def main():
    for key_file in glob.glob("*.pem"):
        with open(key_file, "r") as f:
            private_key = f.read()

        # Check that the signatures match.
        results = []
        for name, func in get_signature.items():
            if not available[name]:
                continue
            results.append((name, func(private_key, "foo bar")))
        print "{} using {}:".format(key_file, ", ".join(r[0] for r in results))
        signatures = dict((r[1], True) for r in results).keys()
        if len(signatures) == 1:
            print " EQUAL"
        else:
            print " NOT EQUAL"

        # Simple benchmark.
        iters = 500
        s = get_random_string(500)
        for name, func in get_signature.items():
            if not available[name]:
                continue
            print "running {} iterations of {}".format(iters, name)
            gc.disable()
            tic = time.time()
            for i in range(iters):
                func(private_key, s)
            toc = time.time()
            gc.enable()
            print " took {:.3f}s".format(toc - tic)
        print
def _main():
    """ main loop """
    user = None
    GAME_SELECT_DELAY = .4
    while True:
        gc.disable()
        hardware.reset()
        if user is None:
            user = persistence.get_anonymous()
        # do game selection by good/bad light
        hardware.write_message("Waiting for a game selection",
                               " Choose 1 - %d" % len(games)).\
            display_characters('H', 'I')
        select = hardware.select_by_lights(len(games), 9)
        if select == 9:
            for i in xrange(5):
                hardware.display_characters('B', 'Y')\
                    .wait(.3)\
                    .display_characters(' ', ' ')\
                    .wait(.2)
            hardware.wait(1)\
                .cleanup()
            exit()
        # game picked, construct it
        (name, description, levels, author, date, ver) = games[select - 1].GameInfo()
        game = games[select - 1]()
        level = 1
        if levels > 1:
            hardware.display_characters('L', 'E')
            level = hardware.select_by_lights(levels, 9)
            if level == 9:
                continue
        hardware.display_number(0)
        game.initialize(hardware, user, level)
        hardware.write_message("Playing game>", name)
        hardware.write_debug(description, 'by', author)
        score = Score().load_at_start(name, ver, level, user)
        persistence.save_score_start(score, user)
        start = time.time()
        score.score = game.play()
        score.duration_sec = time.time() - start
        persistence.save_score_end(score, user)
        hardware.beep(2, .5)
        hardware.blink_light_until_button(5)
        gc.enable()
        gc.collect()
def load(self, file):
    # "file" could also be a socket
    gc.disable()
    try:
        return pickle.load(file)
    finally:
        gc.enable()
def loadblk(self, blk, buf):
    # we are in sighandler - establish cycle which also references obj_4del and trigger full GC
    assert self.obj_4del is not None
    w = weakref.ref(self.obj_4del)
    assert w() is self.obj_4del

    # establish cycle with leaf ref to obj_4del
    a = C()
    b = C()
    a.b = b
    b.a = a
    a.obj_4del = self.obj_4del
    self.obj_4del = None
    assert w() is not None

    # del a=b cycle - it should stay alive, while gc is disabled
    gc_save = gc.isenabled()
    gc.disable()
    del a, b
    assert w() is not None

    # gc - a=b and obj_4del collected
    gc.collect()
    assert w() is None

    if gc_save:
        gc.enable()

    self.marker_list.append(2)
def std_filter(use_cols, nrows=5000, threshold=0.02):
    data1 = pd.read_csv('../output/tar/train_pre_agg_0-10000.csv', nrows=nrows, usecols=use_cols)
    data2 = pd.read_csv('../output/tar/train_pre_agg_10000-50000.csv', nrows=nrows, usecols=use_cols)
    data3 = pd.read_csv('../output/tar/train_pre_agg_6-10.csv', nrows=nrows, usecols=use_cols)
    data4 = pd.read_csv('../output/tar/train_pre_agg_10-20.csv', nrows=nrows, usecols=use_cols)
    data5 = pd.read_csv('../output/tar/train_pre_agg_20-30.csv', nrows=nrows, usecols=use_cols)
    data6 = pd.read_csv('../output/tar/train_pre_agg_30+.csv', nrows=nrows, usecols=use_cols)
    datas = [data1, data2, data3, data4, data5, data6]
    to_drop = []
    for data in datas:
        data_std = np.std(data, axis=0)
        cols_drop = data_std.index[data_std < threshold]
        print('\nmissing feature > {} {}'.format(threshold, len(cols_drop)))
        to_drop = list(set(set(cols_drop) | set(to_drop)))
    print('to drop length {}'.format(len(to_drop)))
    use_cols = [col for col in data1.columns if col not in to_drop]
    # pprint(use_cols)
    del data1, data2, data3, data4, data5, data6, datas
    gc.enable()
    return use_cols
def nan_filter(nrows=5000, missing_thread=0.9):
    data1 = pd.read_csv('../output/tar/train_pre_agg_0-10000.csv', nrows=nrows)
    data2 = pd.read_csv('../output/tar/train_pre_agg_10000-50000.csv', nrows=nrows)
    data3 = pd.read_csv('../output/tar/train_pre_agg_6-10.csv', nrows=nrows)
    data4 = pd.read_csv('../output/tar/train_pre_agg_10-20.csv', nrows=nrows)
    data5 = pd.read_csv('../output/tar/train_pre_agg_20-30.csv', nrows=nrows)
    data6 = pd.read_csv('../output/tar/train_pre_agg_30+.csv', nrows=nrows)
    datas = [data1, data2, data3, data4, data5, data6]
    to_drop = []
    for data in datas:
        data_missing = (data.isnull().sum() / len(data)).sort_values(ascending=False)
        data_missing = data_missing.index[data_missing > missing_thread]
        print('\nmissing feature > {} {}'.format(missing_thread, len(data_missing)))
        to_drop = list(set(set(data_missing) | set(to_drop)))
    print('to drop length {}'.format(len(to_drop)))
    use_cols = [col for col in data1.columns if col not in to_drop]
    # pprint(use_cols)
    del data1, data2, data3, data4, data5, data6, datas
    gc.enable()
    return use_cols
def group_edges(cs):
    plus = []
    minus = []
    pairs = []
    gc.disable()
    interval = 1000
    for current, cl1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")
        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in Bib_matrix.special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus.append((bib1, bib2))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus.append((bib1, bib2))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"
    update_status_final("Finished with the edge grouping.")
    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d." %
                    (len(plus), len(minus), len(pairs)))
    gc.enable()
    return plus, minus, pairs
def f(*k, **kw):
    try:
        gc.disable()
        ret = fun(*k, **kw)
    finally:
        gc.enable()
    return ret
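# For context: a minimal sketch of the decorator such an inner wrapper usually
# lives in. The decorator name (gc_disabled), the functools.wraps usage, and
# the example workload are illustrative assumptions, not from the source.
import functools
import gc

def gc_disabled(fun):
    """Run the wrapped callable with the cyclic garbage collector off."""
    @functools.wraps(fun)
    def f(*k, **kw):
        try:
            gc.disable()
            ret = fun(*k, **kw)
        finally:
            # Re-enable GC even if the wrapped call raises.
            gc.enable()
        return ret
    return f

@gc_disabled
def build_big_list(n):
    # Allocating many small objects is faster with the collector paused.
    return [str(i) for i in range(n)]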
def checkMemory():
    """as the name says"""
    # pylint: disable=too-many-branches
    if not Debug.gc:
        return
    gc.set_threshold(0)
    gc.set_debug(gc.DEBUG_LEAK)
    gc.enable()
    print('collecting {{{')
    gc.collect()  # we want to eliminate all output
    print('}}} done')

    # code like this may help to find specific things
    if True:  # pylint: disable=using-constant-test
        interesting = ('Client', 'Player', 'Game')
        for obj in gc.garbage:
            if hasattr(obj, 'cell_contents'):
                obj = obj.cell_contents
            if not any(x in repr(obj) for x in interesting):
                continue
            for referrer in gc.get_referrers(obj):
                if referrer is gc.garbage:
                    continue
                if hasattr(referrer, 'cell_contents'):
                    referrer = referrer.cell_contents
                if referrer.__class__.__name__ in interesting:
                    for referent in gc.get_referents(referrer):
                        print('%s refers to %s' % (referrer, referent))
                else:
                    print('referrer of %s/%s is: id=%s type=%s %s' %
                          (type(obj), obj, id(referrer), type(referrer), referrer))
    print('unreachable:%s' % gc.collect())
    gc.set_debug(0)
def timer(fxn, args):
    gc.disable()
    t1 = time.time()
    R = fxn(*args)
    t2 = time.time()
    gc.enable()
    return R, (t2 - t1)
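# Usage sketch for the timer helper above. slow_sum is a made-up example
# workload; note the helper re-enables GC unconditionally after the call.
def slow_sum(n):
    return sum(range(n))

result, elapsed = timer(slow_sum, (10**6,))
print('result=%d, elapsed=%.4f seconds' % (result, elapsed))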
def newfunc(*args, **kargs):
    try:
        gc.disable()
        result = func(*args, **kargs)
    finally:
        gc.enable()
    return result
def next_seed(self):
    """ Load next seed from disk """
    seed = next(self._all_seeds)
    folder = os.path.join(self._root, str(seed), self._subset)
    self.data = []
    silence = None
    gc.disable()
    for filename in os.listdir(folder):
        command = os.path.splitext(os.path.basename(filename))[0]
        with open(os.path.join(folder, filename), "r") as pkl_file:
            audio = pickle.load(pkl_file)
        # Check for 'silence'
        if command == "silence":
            silence = audio
        else:
            target = self.classes.index(os.path.basename(command))
            self.data.extend(itertools.product(audio, [target]))
    gc.enable()
    target = self.classes.index("silence")
    self.data += [(silence, target)] * int(len(self.data) * self._silence_percentage)
    return seed
def compare_algos(fil):
    img = nb.load(fil)
    mask = img.get_data() > 0
    print('Parameter initialization')
    S = init_classical(img, mask)
    map0 = S.map()
    mu0 = S.mu.copy()
    sigma0 = S.sigma.copy()
    print('Running classical VEM')
    e0, f0 = run_ve(S, niters=NITERS)
    jac0 = jaccard(S.map(), map0)
    map0 = S.map()
    tmp = e0[-1][0] + e0[-1][1]
    print('Final energy: %f' % tmp)
    print('Running Laplace relaxed VEM')
    S = init_laplace(img, mask, mu0, sigma0)
    jac1 = jaccard(S.map(), map0)
    e, f = run_ve(S, niters=NITERS)
    jac2 = jaccard(S.map(), map0)
    tmp = e[-1][0] + e[-1][1]
    print('Final energy: %f' % tmp)
    del img
    del mask
    del S
    del map0
    gc.enable()
    gc.collect()
    return {'e0': e0, 'f0': f0, 'e': e, 'f': f,
            'jac0': jac0, 'jac1': jac1, 'jac2': jac2}
def grab_frame(self):
    now = time.time()
    right_local_frame = None
    left_local_frame = None
    try:
        for i in range(4):
            ret1, right_local_frame = self.right_cam.read()
        time.sleep(.10)
        for i in range(4):
            ret2, left_local_frame = self.left_cam.read()
    except:
        pass
    self.capture_time = (time.time() - now)
    print "capture time:", self.capture_time
    if self.capture_time > 3 or right_local_frame == None or left_local_frame == None:
        #time.sleep(1)
        print "camera fault: recovering...", self.recovery_count
        self.recovery_count += 1
        try:
            if self.right_cam != None or self.left_cam != None:
                # release() must be called, not just referenced
                self.right_cam.release()
                self.left_cam.release()
            gc.enable()
            gc.collect()
            self.initialize_camera()
        except:
            #time.sleep(.1)
            pass
        self.grab_frame()
    else:
        self.frame_count += 1
        self.right_frame = right_local_frame
        self.left_frame = left_local_frame
        print 'frame count:', self.frame_count
def run(self):
    self.connect()
    self.initialize_camera(self.camera_num, self.x, self.y)
    while True:
        time.sleep(0.0001)  # dont hog resources
        self.frame = None
        self.frame_count += 1
        now = time.time()
        try:
            ret, self.frame = self.camera.read()
        except:
            pass
        self.capture_time = (time.time() - now)
        print 'frames:', self.frame_count, " capture time:", self.capture_time, " recovery_count:", self.recovery_count
        if self.capture_time > 0.9 or self.frame == None:
            self.frame = None
            while self.frame == None:
                self.recovery_count += 1
                try:
                    if self.camera != None:
                        # release() must be called, not just referenced
                        self.camera.release()
                    gc.enable()
                    gc.collect()
                    self.initialize_camera(self.camera_num, self.x, self.y)
                    try:
                        ret, self.frame = self.camera.read()
                    except:
                        pass
                except:
                    time.sleep(.5)
                    pass
        pickled_frame = pickle.dumps(self.frame, -1)
        self.publish(pickled_frame)
import gc


class Human:
    def __init__(self, name):
        self.name = name
        self.head = self.Head()
        self.brain = self.head.Brain()
        self.display()

    def display(self):
        print("hello ", self.name)
        self.head.talk()
        self.brain.think()

    class Head:
        def talk(self):
            print("talking")

        class Brain:
            def think(self):
                print("Thinking")


h = Human("anil")
#print(gc.isenabled())
print(gc.disable())
print(gc.enable())
print(gc.isenabled())
def __init__(self, fname=None, fdata=None, decompress=False, decrypt=False, password='', disable_gc=True, verbose=True): self.private.verbose = verbose # Runs a lot faster with GC off. disable_gc = disable_gc and gc.isenabled() if disable_gc: gc.disable() try: if fname is not None: assert fdata is None # Allow reading preexisting streams like pyPdf if hasattr(fname, 'read'): fdata = fname.read() else: try: f = open(fname, 'rb') fdata = f.read() f.close() except IOError: raise PdfParseError('Could not read PDF file %s' % fname) assert fdata is not None fdata = convert_load(fdata) if not fdata.startswith('%PDF-'): startloc = fdata.find('%PDF-') if startloc >= 0: log.warning('PDF header not at beginning of file') else: lines = fdata.lstrip().splitlines() if not lines: raise PdfParseError('Empty PDF file!') raise PdfParseError('Invalid PDF header: %s' % repr(lines[0])) self.private.version = fdata[5:8] endloc = fdata.rfind('%EOF') if endloc < 0: raise PdfParseError('EOF mark not found: %s' % repr(fdata[-20:])) endloc += 6 junk = fdata[endloc:] fdata = fdata[:endloc] if junk.rstrip('\00').strip(): log.warning('Extra data at end of file') private = self.private private.indirect_objects = {} private.deferred_objects = set() private.special = {'<<': self.readdict, '[': self.readarray, 'endobj': self.empty_obj, } for tok in r'\ ( ) < > { } ] >> %'.split(): self.special[tok] = self.badtoken startloc, source = self.findxref(fdata) private.source = source # Find all the xref tables/streams, and # then deal with them backwards. xref_list = [] while 1: source.obj_offsets = {} trailer, is_stream = self.parsexref(source) prev = trailer.Prev if prev is None: token = source.next() if token != 'startxref' and not xref_list: source.warning('Expected "startxref" ' 'at end of xref table') break xref_list.append((source.obj_offsets, trailer, is_stream)) source.floc = int(prev) # Handle document encryption private.crypt_filters = None if decrypt and PdfName.Encrypt in trailer: identity_filter = crypt.IdentityCryptFilter() crypt_filters = { PdfName.Identity: identity_filter } private.crypt_filters = crypt_filters private.stream_crypt_filter = identity_filter private.string_crypt_filter = identity_filter if not crypt.HAS_CRYPTO: raise PdfParseError( 'Install PyCrypto to enable encryption support') self._parse_encrypt_info(source, password, trailer) if is_stream: self.load_stream_objects(trailer.object_streams) while xref_list: later_offsets, later_trailer, is_stream = xref_list.pop() source.obj_offsets.update(later_offsets) if is_stream: trailer.update(later_trailer) self.load_stream_objects(later_trailer.object_streams) else: trailer = later_trailer trailer.Prev = None if (trailer.Version and float(trailer.Version) > float(self.version)): self.private.version = trailer.Version if decrypt: self.decrypt_all() trailer.Encrypt = None if is_stream: self.Root = trailer.Root self.Info = trailer.Info self.ID = trailer.ID self.Size = trailer.Size self.Encrypt = trailer.Encrypt else: self.update(trailer) # self.read_all_indirect(source) private.pages = self.readpages(self.Root) if decompress: self.uncompress() # For compatibility with pyPdf private.numPages = len(self.pages) finally: if disable_gc: gc.enable()
def setUp(self):
    gc.enable()
def split_reads(f_in, f_list, f_out_1, f_out_2, wiggle=0, gap=0, anchor=15, anchor_max=500, replace_solexa_ids="", rc=False, size_buffer=2 * (10**9)): data1 = lines_to_file(f_out_1) data2 = lines_to_file(f_out_2) fid = open(f_list, 'r') reads = [] while True: p = fid.tell() err = True sb = size_buffer if sb == 0: sb = 2 * (10**9) while err: gc.disable() try: lines = fid.readlines(sb) except MemoryError: print >> sys.stderr, "Warning: Not enough free memory (it needed %d)!!! Trying again with a 50% smaller buffer..." % ( sb, ) sb = int(sb / 2) if sb < 10000000: print >> sys.stderr, "Error: Not enough free memory (it needed %d)!!! Giving up..." % ( sb, ) os.system("free -m") sys.exit(1) err = True fid.seek(p) else: err = False gc.enable() if not lines: break gc.disable() reads = [line.rstrip('\r\n').partition("\t") for line in lines] gc.enable() gc.disable() r = dict() wiggle_range = range(-wiggle, wiggle + 1) for line in reads: k = line[0] if not r.has_key(k): r[k] = set() w = int(line[2]) for wig in wiggle_range: r[k].add(w + wig) reads = r gc.enable() am1 = anchor - 1 am2 = anchor - 2 for read in reads_from_fastq_file(f_in): v = reads.get(read[0][1:].rstrip('\r\n'), None) if not v: continue v = list(v) i = 0 unique = set() if gap != 0: for agap in xrange(1, gap + 1): for cut in v: if cut + 1 - agap > anchor - 1: k1 = cut + 1 - agap k2 = cut + 1 if (k1, k2) not in unique: w = givemeid(replace_solexa_ids, read[0][:-1], i) # if replace_solexa_ids: # w = read[0][:-1].replace("/",replace_solexa_ids,1)+'__'+int2str(i) # else: # w = read[0][:-1]+'__'+int2str(i) r1a = read[1][0:cut + 1 - agap] r2a = read[2][0:cut + 1 - agap] r1b = read[1][cut + 1:] r2b = read[2][cut + 1:] lr1a = len(r1a) lr1b = len(r1b) if lr1a > am1 and lr1b > am2: data1.add_line("%sa\n%s\n+\n%s\n" % (w, r1a, r2a)) if rc: data2.add_line( "%sb\n%s\n+\n%s\n" % (w, reversecomplement(r1b), reverse(r2b))) else: data2.add_line("%sb\n%s+\n%s" % (w, r1b, r2b)) i = i + 1 unique.add((k1, k2)) flag = True # trim only one end and not both ends if lr1a > anchor_max: r1a = r1a[-anchor_max:] r2a = r2a[-anchor_max:] flag = False if lr1b > anchor_max and flag: r1b = r1b[:anchor_max] r2b = r2b[:anchor_max] flag = False if flag == False: w = givemeid(replace_solexa_ids, read[0][:-1], i) data1.add_line("%sa\n%s\n+\n%s\n" % (w, r1a, r2a)) if rc: data2.add_line( "%sb\n%s\n+\n%s\n" % (w, reversecomplement(r1b), reverse(r2b))) else: data2.add_line("%sb\n%s+\n%s" % (w, r1b, r2b)) i = i + 1 if len(read[1]) - (cut + 1 + agap) > anchor - 1: k1 = cut + 1 k2 = cut + 1 + agap if (k1, k2) not in unique: w = givemeid(replace_solexa_ids, read[0][:-1], i) # if replace_solexa_ids: # w = read[0][:-1].replace("/",replace_solexa_ids,1)+'__'+int2str(i) # else: # w = read[0][:-1]+'__'+int2str(i) r1a = read[1][0:cut + 1] r2a = read[2][0:cut + 1] r1b = read[1][cut + 1 + agap:] r2b = read[2][cut + 1 + agap:] lr1a = len(r1a) lr1b = len(r1b) if lr1a > am1 and lr1b > am2: data1.add_line("%sa\n%s\n+\n%s\n" % (w, r1a, r2a)) if rc: data2.add_line( "%sb\n%s\n+\n%s\n" % (w, reversecomplement(r1b), reverse(r2b))) else: data2.add_line("%sb\n%s+\n%s" % (w, r1b, r2b)) i = i + 1 unique.add((k1, k2)) flag = True # trim only one end and not both ends if lr1a > anchor_max: r1a = r1a[-anchor_max:] r2a = r2a[-anchor_max:] flag = False if lr1b > anchor_max and flag: r1b = r1b[:anchor_max] r2b = r2b[:anchor_max] flag = False if flag == False: w = givemeid(replace_solexa_ids, read[0][:-1], i) data1.add_line("%sa\n%s\n+\n%s\n" % (w, r1a, r2a)) if rc: data2.add_line( "%sb\n%s\n+\n%s\n" % (w, 
reversecomplement(r1b), reverse(r2b))) else: data2.add_line("%sb\n%s+\n%s" % (w, r1b, r2b)) i = i + 1 else: for cut in v: w = givemeid(replace_solexa_ids, read[0][:-1], i) # if replace_solexa_ids: # w = read[0][:-1].replace("/",replace_solexa_ids,1)+'__'+int2str(i) # else: # w = read[0][:-1]+'__'+int2str(i) r1a = read[1][0:cut + 1] r2a = read[2][0:cut + 1] r1b = read[1][cut + 1:] r2b = read[2][cut + 1:] lr1a = len(r1a) lr1b = len(r1b) if lr1a > am1 and lr1b > am2: data1.add_line("%sa\n%s\n+\n%s\n" % (w, r1a, r2a)) if rc: data2.add_line( "%sb\n%s\n+\n%s\n" % (w, reversecomplement(r1b), reverse(r2b))) else: data2.add_line("%sb\n%s+\n%s" % (w, r1b, r2b)) i = i + 1 flag = True # trim only one end and not both ends if lr1a > anchor_max: r1a = r1a[-anchor_max:] r2a = r2a[-anchor_max:] flag = False if lr1b > anchor_max and flag: r1b = r1b[:anchor_max] r2b = r2b[:anchor_max] flag = False if flag == False: w = givemeid(replace_solexa_ids, read[0][:-1], i) data1.add_line("%sa\n%s\n+\n%s\n" % (w, r1a, r2a)) if rc: data2.add_line( "%sb\n%s\n+\n%s\n" % (w, reversecomplement(r1b), reverse(r2b))) else: data2.add_line("%sb\n%s+\n%s" % (w, r1b, r2b)) i = i + 1 data1.close() data2.close() fid.close()
def test_enable(self):
    gc.enable()
    result = gc.isenabled()
    self.assertTrue(result, "enable Method can't set gc.isenabled as true.")
def Network_config(class_num=4, epoch=200, initial_epoch=0, batch_size=32, train_data=None, train_label=None, test_data=None, test_label=None, fold=0): adam = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.000) sgd = SGD(lr=0.001, momentum=0.9, decay=0.0, nesterov=False) K.set_learning_phase(1) base_model = InceptionV3(input_tensor=Input(shape=(299, 299, 3)), weights='imagenet', include_top=False) x = base_model.output # K.set_learning_phase(1) x = GlobalAveragePooling2D()(x) x = Dense(512, activation='relu')(x) x = BatchNormalization()(x) x = Dense(512, activation='relu')(x) x = BatchNormalization()(x) predictions = Dense(class_num, activation='softmax')(x) # this is the model we will train model = Model(inputs=base_model.input, outputs=predictions) for layer in (base_model.layers): layer.trainable = False if layer.name.startswith('bn') or 'bn' in layer.name: layer.call(layer.input, training=False) model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=[keras.metrics.categorical_accuracy]) tools.create_directory('./tmpinception/') weights_file = './tmpinception/' + str(fold)+'-weights.{epoch:02d}-{categorical_accuracy:.4f}-{val_loss:.4f}-{val_categorical_accuracy:.4f}.h5' csv_file = './tmpinception/record.csv' lr_reducer = ReduceLROnPlateau(monitor='categorical_accuracy', factor=0.5, cooldown=0, patience=5, min_lr=0.5e-6) early_stopper = EarlyStopping(monitor='val_categorical_accuracy', min_delta=1e-4, patience=50) model_checkpoint = ModelCheckpoint(weights_file, monitor='val_categorical_accuracy', save_best_only=True, verbose=2, save_weights_only=True, mode='max') tensorboard = TensorBoard(log_dir='./logs/', histogram_freq=0, batch_size=8, write_graph=True, write_grads=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) CSV_record = CSVLogger(csv_file, separator=',', append=True) callbacks = [lr_reducer, early_stopper, model_checkpoint, tensorboard, CSV_record] gc.disable() model.fit_generator( generator=tools.batch_generator(np.array(train_data), np.array(train_label), batch_size, True, class_num), steps_per_epoch=int(len(train_label)/batch_size)-1, max_q_size=20, initial_epoch=initial_epoch, epochs=epoch, verbose=1, callbacks=callbacks, validation_data=tools.batch_generator(np.array(test_data), np.array(test_label), batch_size, True, class_num), validation_steps=int(len(test_label)/batch_size)-1, class_weight='auto') all_y_pred = [] all_y_true = [] for test_data_batch, test_label_batch in tools.batch_generator_confusion_matrix(np.array(test_data),np.array(test_label), batch_size, True, class_num): y_pred = model.predict(test_data_batch, batch_size) y_true = test_label_batch for y_p in y_pred: all_y_pred.append(np.where(y_p == max(y_p))[0][0]) for y_t in y_true: all_y_true.append(np.where(y_t == max(y_t))[0][0]) confusion = confusion_matrix(y_true=all_y_true,y_pred=all_y_pred) print(confusion) f = open('confusion_matrix.txt','a+') f.write(str(all_y_true)+"\n") f.write(str(all_y_pred)+"\n") f.write(str(confusion)+'\n') f.close() gc.enable()
def run_command(command=None, parser=None, args=None, name='unknown', data=None, options=None): """ Execute a function that processes command-line arguments and then calls a command-line driver. This function provides a generic facility for executing a command function is rather generic. This function is segregated from the driver to enable profiling of the command-line execution. Required: command: The name of a function that will be executed to perform process the command-line options with a parser object. parser: The parser object that is used by the command-line function. Optional: options: If this is not None, then ignore the args option and use this to specify command options. args: Command-line arguments that are parsed. If this value is `None`, then the arguments in `sys.argv` are used to parse the command-line. name: Specifying the name of the command-line (for error messages). data: A container of labeled data. Returned: retval: Return values from the command-line execution. errorcode: 0 if Pyomo ran successfully """ # # # Parse command-line options # # retval = None errorcode = 0 if options is None: try: if type(args) is argparse.Namespace: _options = args else: _options = parser.parse_args(args=args) # Replace the parser options object with a pyutilib.misc.Options object options = pyutilib.misc.Options() for key in dir(_options): if key[0] != '_': val = getattr(_options, key) if not isinstance(val, types.MethodType): options[key] = val except SystemExit: # the parser throws a system exit if "-h" is specified - catch # it to exit gracefully. return Container(retval=retval, errorcode=errorcode) # # Configure loggers # configure_loggers(options=options) # # Call the main Pyomo runner with profiling # TempfileManager.push() pcount = options.runtime.profile_count if pcount > 0: # Defer import of profiling packages until we know that they # are needed try: try: import cProfile as profile except ImportError: import profile import pstats except ImportError: configure_loggers(shutdown=True) raise ValueError( "Cannot use the 'profile' option: the Python " "'profile' or 'pstats' package cannot be imported!") tfile = TempfileManager.create_tempfile(suffix=".profile") tmp = profile.runctx( command.__name__ + '(options=options,parser=parser)', command.__globals__, locals(), tfile) p = pstats.Stats(tfile).strip_dirs() p.sort_stats('time', 'cumulative') p = p.print_stats(pcount) p.print_callers(pcount) p.print_callees(pcount) p = p.sort_stats('cumulative', 'calls') p.print_stats(pcount) p.print_callers(pcount) p.print_callees(pcount) p = p.sort_stats('calls') p.print_stats(pcount) p.print_callers(pcount) p.print_callees(pcount) retval = tmp else: # # Call the main Pyomo runner without profiling # TempfileManager.push() try: retval = command(options=options, parser=parser) except SystemExit: err = sys.exc_info()[1] # # If debugging is enabled or the 'catch' option is specified, then # exit. Otherwise, print an "Exiting..." message. 
# if __debug__ and (options.runtime.logging == 'debug' or options.runtime.catch_errors): configure_loggers(shutdown=True) sys.exit(0) print('Exiting %s: %s' % (name, str(err))) errorcode = err.code except Exception: err = sys.exc_info()[1] # # If debugging is enabled or the 'catch' option is specified, then # pass the exception up the chain (to pyomo_excepthook) # if __debug__ and (options.runtime.logging == 'debug' or options.runtime.catch_errors): configure_loggers(shutdown=True) TempfileManager.pop(remove=not options.runtime.keep_files) raise if not options.model is None and not options.model.save_file is None: model = "model " + options.model.save_file else: model = "model" global filter_excepthook if filter_excepthook: action = "loading" else: action = "running" msg = "Unexpected exception while %s %s:\n " % (action, model) # # This handles the case where the error is propagated by a KeyError. # KeyError likes to pass raw strings that don't handle newlines # (they translate "\n" to "\\n"), as well as tacking on single # quotes at either end of the error message. This undoes all that. # errStr = str(err) if type(err) == KeyError and errStr != "None": errStr = str(err).replace(r"\n", "\n")[1:-1] logger.error(msg + errStr) errorcode = 1 configure_loggers(shutdown=True) if options.runtime.disable_gc: gc.enable() TempfileManager.pop(remove=not options.runtime.keep_files) return Container(retval=retval, errorcode=errorcode)
def tearDown(self):
    self.graph.close()
    if self.gcold:
        gc.enable()
    # TODO: delete a_tmp_dir
    self.graph.close()
def _execute_child(self, args, executable, preexec_fn, close_fds, cwd, env, universal_newlines, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite): """Execute program (POSIX version)""" if isinstance(args, types.StringTypes): args = [args] else: args = list(args) if shell: args = ['/bin/sh', '-c'] + args if executable: args[0] = executable if executable is None: executable = args[0] errpipe_read, errpipe_write = os.pipe() try: try: self._set_cloexec_flag(errpipe_write) gc_was_enabled = gc.isenabled() gc.disable() try: self.pid = os.fork() except: if gc_was_enabled: gc.enable() raise self._child_created = True if self.pid == 0: try: if p2cwrite is not None: os.close(p2cwrite) if c2pread is not None: os.close(c2pread) if errread is not None: os.close(errread) os.close(errpipe_read) if p2cread is not None: os.dup2(p2cread, 0) if c2pwrite is not None: os.dup2(c2pwrite, 1) if errwrite is not None: os.dup2(errwrite, 2) if p2cread is not None and p2cread not in (0,): os.close(p2cread) if c2pwrite is not None and c2pwrite not in (p2cread, 1): os.close(c2pwrite) if errwrite is not None and errwrite not in (p2cread, c2pwrite, 2): os.close(errwrite) if close_fds: self._close_fds(but=errpipe_write) if cwd is not None: os.chdir(cwd) if preexec_fn: preexec_fn() if env is None: os.execvp(executable, args) else: os.execvpe(executable, args, env) except: exc_type, exc_value, tb = sys.exc_info() exc_lines = traceback.format_exception(exc_type, exc_value, tb) exc_value.child_traceback = ''.join(exc_lines) os.write(errpipe_write, pickle.dumps(exc_value)) os._exit(255) if gc_was_enabled: gc.enable() finally: os.close(errpipe_write) if p2cread is not None and p2cwrite is not None: os.close(p2cread) if c2pwrite is not None and c2pread is not None: os.close(c2pwrite) if errwrite is not None and errread is not None: os.close(errwrite) data = _eintr_retry_call(os.read, errpipe_read, 1048576) finally: os.close(errpipe_read) if data != '': _eintr_retry_call(os.waitpid, self.pid, 0) child_exception = pickle.loads(data) for fd in (p2cwrite, c2pread, errread): if fd is not None: os.close(fd) raise child_exception
def test_global_gc_when_full(shutdown_only):
    cluster = ray.cluster_utils.Cluster()
    for _ in range(2):
        cluster.add_node(num_cpus=1, num_gpus=0,
                         object_store_memory=100 * 1024 * 1024)
    ray.init(address=cluster.address)

    class LargeObjectWithCyclicRef:
        def __init__(self):
            self.loop = self
            self.large_object = ray.put(
                np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    @ray.remote(num_cpus=1)
    class GarbageHolder:
        def __init__(self):
            gc.disable()
            x = LargeObjectWithCyclicRef()
            self.garbage = weakref.ref(x)

        def has_garbage(self):
            return self.garbage() is not None

        def return_large_array(self):
            return np.zeros(80 * 1024 * 1024, dtype=np.uint8)

    try:
        gc.disable()

        # Local driver.
        local_ref = weakref.ref(LargeObjectWithCyclicRef())

        # Remote workers.
        actors = [GarbageHolder.remote() for _ in range(2)]

        assert local_ref() is not None
        assert all(ray.get([a.has_garbage.remote() for a in actors]))

        # GC should be triggered for all workers, including the local driver,
        # when the driver tries to ray.put a value that doesn't fit in the
        # object store. This should cause the captured ObjectRefs' numpy arrays
        # to be evicted.
        ray.put(np.zeros(80 * 1024 * 1024, dtype=np.uint8))

        def check_refs_gced():
            return (local_ref() is None and
                    not any(ray.get([a.has_garbage.remote() for a in actors])))

        wait_for_condition(check_refs_gced)

        # Local driver.
        local_ref = weakref.ref(LargeObjectWithCyclicRef())

        # Remote workers.
        actors = [GarbageHolder.remote() for _ in range(2)]

        assert all(ray.get([a.has_garbage.remote() for a in actors]))

        # GC should be triggered for all workers, including the local driver,
        # when a remote task tries to put a return value that doesn't fit in
        # the object store. This should cause the captured ObjectRefs' numpy
        # arrays to be evicted.
        ray.get(actors[0].return_large_array.remote())

        def check_refs_gced():
            return (local_ref() is None and
                    not any(ray.get([a.has_garbage.remote() for a in actors])))

        wait_for_condition(check_refs_gced)
    finally:
        gc.enable()
        from setuptools_scm.git import parse
        kwargs['describe_command'] = \
            "git describe --dirty --tags --long --match 'apache-arrow-[0-9].*'"
        return parse(root, **kwargs)

    __version__ = setuptools_scm.get_version('../', parse=parse_git)
except ImportError:
    __version__ = None

# ARROW-8684: Disable GC while initializing Cython extension module,
# to workaround Cython bug in https://github.com/cython/cython/issues/3603
_gc_enabled = _gc.isenabled()
_gc.disable()
import pyarrow.lib as _lib
if _gc_enabled:
    _gc.enable()

from pyarrow.lib import (BuildInfo, RuntimeInfo, VersionInfo,
                         cpp_build_info, cpp_version, cpp_version_info,
                         runtime_info, cpu_count, set_cpu_count,
                         enable_signal_handlers,
                         io_thread_count, set_io_thread_count)


def show_versions():
    """
    Print various version information, to help with error reporting.
    """
    # TODO: CPU information and flags
    print("pyarrow version info\n--------------------")
    print("Package kind: {}".format(cpp_build_info.package_kind
                                    if len(cpp_build_info.package_kind) > 0
                                    else "not indicated"))
def identify_zero_importance(self, task, eval_metric=None, n_iterations=10, early_stopping=True): """ Identify the features with zero importance according to a gradient boosting machine. The gbm can be trained with early stopping using a validation set to prevent overfitting. The feature importances are averaged over `n_iterations` to reduce variance. Uses the LightGBM implementation (http://lightgbm.readthedocs.io/en/latest/index.html) Parameters -------- eval_metric : string Evaluation metric to use for the gradient boosting machine for early stopping. Must be provided if `early_stopping` is True task : string The machine learning task, either 'classification' or 'regression' n_iterations : int, default = 10 Number of iterations to train the gradient boosting machine early_stopping : boolean, default = True Whether or not to use early stopping with a validation set when training Notes -------- - Features are one-hot encoded to handle the categorical variables before training. - The gbm is not optimized for any particular task and might need some hyperparameter tuning - Feature importances, including zero importance features, can change across runs """ if early_stopping and eval_metric is None: raise ValueError( """eval metric must be provided with early stopping. Examples include "auc" for classification or "l2" for regression.""") if self.labels is None: raise ValueError("No training labels provided.") # One hot encoding features = pd.get_dummies(self.data) self.one_hot_features = [ column for column in features.columns if column not in self.base_features ] # Add one hot encoded data to original data self.data_all = pd.concat([features[self.one_hot_features], self.data], axis=1) # Extract feature names feature_names = list(features.columns) # Convert to np array features = np.array(features) labels = np.array(self.labels).reshape((-1, )) # Empty array for feature importances feature_importance_values = np.zeros(len(feature_names)) print('Training Gradient Boosting Model\n') # Iterate through each fold for _ in range(n_iterations): if task == 'classification': model = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.05, verbose=0) elif task == 'regression': model = lgb.LGBMRegressor(n_estimators=2000, learning_rate=0.05, verbose=0) else: raise ValueError( 'Task must be either "classification" or "regression"') # If training using early stopping need a validation set if early_stopping: train_features, valid_features, train_labels, valid_labels = train_test_split( features, labels, test_size=0.3) # Train the model with early stopping model.fit(train_features, train_labels, eval_metric='rmse', eval_set=[(valid_features, valid_labels)], early_stopping_rounds=100, verbose=-1) # Clean up memory gc.enable() del train_features, train_labels, valid_features, valid_labels gc.collect() else: model.fit(features, labels) # Record the feature importances feature_importance_values += model.feature_importances_ / n_iterations feature_importances = pd.DataFrame({ 'feature': feature_names, 'importance': feature_importance_values }) # Sort features according to importance feature_importances = feature_importances.sort_values( 'importance', ascending=False).reset_index(drop=True) # Normalize the feature importances to add up to one feature_importances['normalized_importance'] = feature_importances[ 'importance'] / feature_importances['importance'].sum() feature_importances['cumulative_importance'] = np.cumsum( feature_importances['normalized_importance']) # Extract the features with zero importance 
record_zero_importance = feature_importances[ feature_importances['importance'] == 0.0] to_drop = list(record_zero_importance['feature']) self.feature_importances = feature_importances self.record_zero_importance = record_zero_importance self.ops['zero_importance'] = to_drop print('\n%d features with zero importance after one-hot encoding.\n' % len(self.ops['zero_importance']))
def train_model(): print('innn') import tensorflow as tf from tensorflow import keras from PIL import Image import numpy as np import gc global log print(request.json['folder']) print(request.json['lr']) print(request.json['epochs']) folder_name = request.json['folder'] log = r"Initializing variables...." gc.enable() base_dir = os.path.join(app.config['UPLOAD_FOLDER'],folder_name) train_dir = os.path.join(base_dir, 'train') validation_dir = os.path.join(base_dir, 'validation') image_size = 160 batch_size = 32 if(request.json['epochs']): epochs = int(request.json['epochs']) else: epochs = 1 if(request.json['lr']): lr = int(request.json['lr']) else: lr = 0.001 train_datagen = keras.preprocessing.image.ImageDataGenerator( rescale=1./255) validation_datagen = keras.preprocessing.image.ImageDataGenerator(rescale=1./255) train_generator = train_datagen.flow_from_directory( train_dir, target_size=(image_size, image_size), batch_size=batch_size, class_mode='sparse') validation_generator = validation_datagen.flow_from_directory( validation_dir, target_size=(image_size, image_size), batch_size=batch_size, class_mode='sparse') IMG_SHAPE = (image_size, image_size, 3) log = r'Creating Model Base to Train' base_model = keras.applications.MobileNetV2(input_shape=IMG_SHAPE, include_top=False, weights='imagenet',classes=2) base_model.trainable = False model = keras.Sequential([base_model, keras.layers.GlobalAveragePooling2D(), keras.layers.Dense(2,activation='sigmoid')]) log = r'Compiling the Model' model.compile(optimizer=keras.optimizers.RMSprop(lr=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy']) steps_per_epoch = train_generator.n // batch_size validation_steps = validation_generator.n // batch_size print("training started") log = r"Training the Model." model.fit_generator(train_generator, steps_per_epoch = steps_per_epoch, epochs=epochs, workers=4, validation_data=validation_generator, validation_steps=validation_steps) log = r"Dense Layer's Trainig done..." fine_tune_at = 100 # Freeze all the layers before the `fine_tune_at` layer for layer in base_model.layers[:fine_tune_at]: layer.trainable = False model.compile(optimizer = tf.keras.optimizers.RMSprop(lr=2e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy']) log = r'Fine Tuning the model, beginning to train.' hist = model.fit_generator(train_generator, steps_per_epoch = steps_per_epoch, epochs=epochs, workers=4, validation_data=validation_generator, validation_steps=validation_steps) log = r'Model is done training,Acc:{}, Loss:{}, Val_acc:{}, Val_loss:{}'.format(hist.history['acc'],hist.history['loss'],hist.history['val_acc'],hist.history['val_loss']) model_path = base_dir + '_model_without_fine_tune.h5' model.save(model_path) # gc.collect() print(model_path) print(folder_name + '_model_without_fine_tune.h5') return jsonify({"success":True,"modelLink": folder_name + '_model_without_fine_tune.h5'})
def sra2illumina(input_file, output_file, tag_read = None, tag='', phred_conversion = False, operation = 'change', tmp_dir = None, size_read_buffer = 10**8): """ It converts the FASTQ file (PHRED-33 qualities and SRA read names) downloaded from Short Read Archive (SRA) to Illumina FASTQ file (PHRED-64 Illumina v1.5 and Illumina read names). """ temp_file = None if phred_conversion: temp_file = give_me_temp_filename(tmp_dir) else: temp_file = output_file read_name = file(input_file,'r').readline().rstrip('\r\n') sra = False e = read_name.partition(" ")[0] if read_name.startswith('@') and ( not(e.endswith('/1') or e.endswith('/2'))): sra = True if operation == 'change' or sra: fid = open(input_file,'r') fod = open(temp_file,'w') i = 0 r = 0 while True: gc.disable() lines = fid.readlines(size_read_buffer) gc.enable() if not lines: break n = len(lines) for j in xrange(n): r = r + 1 i = i + 1 if i == 1: if tag_read: lines[j] = '@%s%s%s\n' % (tag_read ,int2str(r,12) , tag) else: # if there is no tag_read then the original SRA id is left lines[j] = '%s%s\n' % (lines[j][:-1].partition(" ")[0], tag) #lines[j] = lines[j].rstrip('\r\n').upper().split(' ')[1]+tag+'\n' elif i == 3: lines[j] = "+\n" elif i == 4: i = 0 fod.writelines(lines) fid.close() fod.close() if phred_conversion == '64': phred.fq2fq(temp_file,'sanger',output_file,'illumina-1.5',tmp_dir = tmp_dir) os.remove(temp_file) elif phred_conversion == '33': phred.fq2fq(temp_file,'auto-detect',output_file,'sanger',tmp_dir = tmp_dir) os.remove(temp_file) else: print "No changes are done!" if os.path.isfile(output_file): os.remove(output_file) if operation == 'soft': if os.path.islink(input_file): linkto = os.readlink(input_file) os.symlink(linkto,ooutput_file) else: os.symlink(input_file,output_file) elif operation == 'hard': linkto = input_file if os.path.islink(input_file): linkto = os.readlink(input_file) try: os.link(linkto,output_file) except OSError as er: print >>sys.stderr,"WARNING: Cannot do hard links ('%s' and '%s')!" % (linkto,output_file) shutil.copyfile(linkto,output_file) # if er.errno == errno.EXDEV: # # they are on different partitions # # [Errno 18] Invalid cross-device link # shutil.copyfile(linkto,output_file) # else: # print >>sys.stderr,"ERROR: Cannot do hard links ('%s' and '%s')!" % (linkto,output_file) # print >>sys.stderr,er # sys.exit(1) elif operation == 'copy': shutil.copyfile(input_file, output_file) else: print >>sys.stderr, "ERROR: unknown operation of linking!", operation sys.exit(1)
def merge_star_chimeric(psl_in, psl_ou): # psl = [] fou = None if psl_ou == '-': fou = sys.stdout else: fou = open(psl_ou, 'w') limit_psl = 10**5 for box in chunks(psl_in): if len(box) == 2: if box[0][psl_strand] != box[1][psl_strand]: continue merged = None temp = box[0][:] r1_start = int(box[0][psl_qStart]) r2_start = int(box[1][psl_qStart]) if r1_start > r2_start: box = (box[1], box[0]) r1_start = int(box[0][psl_qStart]) r1_end = int(box[0][psl_qEnd]) r2_start = int(box[1][psl_qStart]) r2_end = int(box[1][psl_qEnd]) t1_start = int(box[0][psl_tStart]) t1_end = int(box[0][psl_tEnd]) t2_start = int(box[1][psl_tStart]) t2_end = int(box[1][psl_tEnd]) if t1_start > t2_start: continue wiggle = 9 if r1_end + wiggle > r2_start and r1_end < r2_start: dif = r2_start - r1_end # extend the first #box[0][psl_matches] = str(int(box[0][psl_matches])) #box[0][psl_misMatches] = str(int(box[0][psl_misMatches]) + dif) box[0][psl_qEnd] = str(int(box[0][psl_qEnd]) + dif) box[0][psl_tEnd] = str(int(box[0][psl_tEnd]) + dif) t = box[0][psl_blockSizes].split(',') t[-2] = str(int(t[-2]) + dif) box[0][psl_blockSizes] = ','.join(t) # recompute r1_start = int(box[0][psl_qStart]) r1_end = int(box[0][psl_qEnd]) t1_start = int(box[0][psl_tStart]) t1_end = int(box[0][psl_tEnd]) elif r1_end > r2_start and r1_end < r2_start + wiggle: dif = r2_start - r1_end # cut the second box[1][psl_matches] = str(int(box[1][psl_matches]) - dif) box[1][psl_misMatches] = str(int(box[1][psl_misMatches]) + dif) box[1][psl_qStart] = str(int(box[1][psl_qStart]) + dif) box[1][psl_tStart] = str(int(box[1][psl_tStart]) + dif) t = box[1][psl_blockSizes].split(',') t[0] = str(int(t[0]) - dif) box[1][psl_blockSizes] = ','.join(t) t = box[1][psl_qStarts].split(',') t[0] = str(int(t[0]) + dif) box[1][psl_qStarts] = ','.join(t) t = box[1][psl_tStarts].split(',') t[0] = str(int(t[0]) + dif) box[1][psl_tStarts] = ','.join(t) # recompute r2_start = int(box[1][psl_qStart]) r2_end = int(box[1][psl_qEnd]) t2_start = int(box[1][psl_tStart]) t2_end = int(box[1][psl_tEnd]) if r1_end <= r2_start and t1_end <= t2_start: #and box[0][psl_strand] == "+" : temp[psl_matches] = int(box[0][psl_matches]) + int( box[1][psl_matches]) temp[psl_misMatches] = int(box[0][psl_misMatches]) - int( box[1][psl_matches]) temp[psl_qNumInsert] = int(box[0][psl_qNumInsert]) + int( box[1][psl_qNumInsert]) temp[psl_qBaseInsert] = int(box[0][psl_qBaseInsert]) + int( box[1][psl_qBaseInsert]) temp[psl_tNumInsert] = int(box[0][psl_tNumInsert]) + int( box[1][psl_tNumInsert]) temp[psl_tBaseInsert] = int(box[0][psl_tBaseInsert]) + int( box[1][psl_tBaseInsert]) temp[psl_qStart] = r1_start temp[psl_qEnd] = r2_end temp[psl_tStart] = t1_start temp[psl_tEnd] = t2_end temp[psl_blockCount] = int(box[0][psl_blockCount]) + int( box[1][psl_blockCount]) temp[psl_blockSizes] = box[0][psl_blockSizes] + box[1][ psl_blockSizes] temp[psl_qStarts] = box[0][psl_qStarts] + box[1][psl_qStarts] temp[psl_tStarts] = box[0][psl_tStarts] + box[1][psl_tStarts] temp[psl_tNumInsert] = '1' merged = temp if merged: gc.disable() psl.append(map(str, merged)) gc.enable() if len(psl) >= limit_psl: fou.writelines(['\t'.join(line) + '\n' for line in psl]) psl = [] # output PSL if psl: fou.writelines(['\t'.join(line) + '\n' for line in psl])
from skimage.io import imread
import matplotlib.pyplot as plt
from skimage.segmentation import mark_boundaries
# from skimage.util.montage import montage2d as montage
from skimage.morphology import binary_opening, disk
from sklearn.model_selection import train_test_split
from skimage.morphology import label
from keras.preprocessing.image import ImageDataGenerator
from keras import models, layers
import keras.backend as K
from keras.optimizers import Adam
from keras.losses import binary_crossentropy
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
from tqdm import tqdm
import gc; gc.enable()
# montage_rgb = lambda x: np.stack([montage(x[:, :, :, i]) for i in range(x.shape[3])], -1)

ship_dir = 'F:\\Shiga\\kaggle\\AirbusShipDetection'
train_image_dir = os.path.join(ship_dir, 'train')
test_image_dir = os.path.join(ship_dir, 'test')


def multi_rle_encode(img):
    labels = label(img)
    if img.ndim > 2:
        return [rle_encode(np.sum(labels == k, axis=2)) for k in np.unique(labels[labels > 0])]
    else:
        return [rle_encode(labels == k) for k in np.unique(labels[labels > 0])]

# ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
def post_hook():
    import gc
    gc.enable()
def tearDown(self):
    if self.using_gc:
        gc.enable()
def train(model, task, y_list, x_list, checkpoint_dir, checkpoint_prefix, device,
          batch_size=512, max_seq_len=100, lr=1e-3, resume_surfix=None, logger=None):
    """
    : model - torch.nn.module: model to be trained
    : task - list[tuple(int,list[int])]: epoch + file to train
    : y_list - list[str]: list of y variables
    : x_list - list[str]: list of x variables to generate embed sequence for
    : checkpoint_dir - str: path to checkpoint directory
    : checkpoint_prefix - str: prefix of checkpoint file
    : device - torch.device: device to train the model
    : batch_size - int: size of mini batch
    : max_seq_len - int: max length for sequence input, default 100
    : lr - float: learning rate for Adam, default 1e-3
    : resume_surfix - str: model to reload if not training from scratch
    """
    global input_split_path, embed_path

    if not gc.isenabled():
        gc.enable()

    # Check checkpoint directory
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    # Calculate number of batches
    div, mod = divmod(90000, batch_size)
    batch_per_file = div + min(1, mod)
    batch_per_epoch = 9 * batch_per_file

    # Loss, optimizer and learning-rate scheduler
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, amsgrad=True)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=0,
        threshold=1e-5, threshold_mode='abs')

    # Load model and optimizer state if not training from scratch
    if resume_surfix is not None:
        model_artifact_path = os.path.join(checkpoint_dir, '{}_{}.pth'.format(checkpoint_prefix, resume_surfix))
        model.load_state_dict(torch.load(model_artifact_path))
        if logger:
            logger.info('Model loaded from {}'.format(model_artifact_path))
        optimizer_artifact_path = os.path.join(checkpoint_dir, '{}_{}_opti.pth'.format(checkpoint_prefix, resume_surfix))
        # Actually restore the optimizer state (the original only logged the path)
        optimizer.load_state_dict(torch.load(optimizer_artifact_path))
        if logger:
            logger.info('Optimizer loaded from {}'.format(optimizer_artifact_path))

    model.to(device)

    # Initiate word vector host
    wv = wv_loader_v2(x_list, embed_path, max_seq_len=max_seq_len)
    if logger:
        logger.info('Word vector host ready')

    # Main loop
    for epoch, file_idx_list in task:
        if logger:
            logger.info('=========================')
            logger.info('Processing Epoch {}/{}'.format(epoch, task[-1][0]))
            logger.info('=========================')

        # Train model
        model.train()
        train_running_loss, train_n_batch = 0, 0
        for index, split_idx in enumerate(file_idx_list, start=1):
            dl = data_loader_v2(wv, y_list, x_list, input_split_path, split_idx,
                                batch_size=batch_size, shuffle=True)
            it = iter(dl)
            while True:
                try:
                    yl, xl, x_seq_len = next(it)
                    y = torch.add(yl[0], yl[1], alpha=10).to(device)
                    x = [i.to(device) for i in xl] + [x_seq_len - 1]

                    optimizer.zero_grad()
                    # CrossEntropyLoss expects raw logits (it applies log-softmax
                    # internally), so the model output is passed to it directly
                    logits = model(*x)
                    loss = loss_fn(logits, y)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=100)
                    optimizer.step()

                    train_running_loss += loss.item()
                    train_n_batch += 1
                except StopIteration:
                    break
                except Exception as e:
                    if logger:
                        logger.error(e)
                    return

            del dl, it
            _ = gc.collect()

            if logger:
                logger.info('Epoch {}/{} - File {}/8 Done - Train Loss: {:.6f}, Learning Rate {:.7f}'.format(
                    epoch, task[-1][0], index, train_running_loss / train_n_batch,
                    optimizer.param_groups[0]['lr']))

            # Save model & optimizer state dict
            ck_file_name = '{}_{}_{}.pth'.format(checkpoint_prefix, epoch, split_idx)
            ck_file_path = os.path.join(checkpoint_dir, ck_file_name)
            torch.save(model.state_dict(), ck_file_path)
            op_file_name = '{}_{}_{}_opti.pth'.format(checkpoint_prefix, epoch, split_idx)
            op_file_path = os.path.join(checkpoint_dir, op_file_name)
            torch.save(optimizer.state_dict(), op_file_path)

        torch.cuda.empty_cache()

        # Evaluate model
        model.eval()
        test_running_loss, test_n_batch = 0, 0
        true_y, pred_y = [], []
        with torch.no_grad():
            for split_idx in [9, 10]:
                dl = data_loader_v2(wv, y_list, x_list, input_split_path, split_idx,
                                    batch_size=batch_size, shuffle=True)
                it = iter(dl)
                while True:
                    try:
                        yl, xl, x_seq_len = next(it)
                        y = torch.add(yl[0], yl[1], alpha=10).to(device)
                        x = [i.to(device) for i in xl] + [x_seq_len - 1]

                        logits = model(*x)
                        loss = loss_fn(logits, y)
                        # Probabilities are only needed for the predictions
                        yp = F.softmax(logits, dim=1)

                        pred_y.extend(list(yp.cpu().detach().numpy()))
                        true_y.extend(list(y.cpu().detach().numpy()))
                        test_running_loss += loss.item()
                        test_n_batch += 1
                    except StopIteration:
                        break
                    except Exception as e:
                        if logger:
                            logger.error(e)
                        return

                del dl, it
                _ = gc.collect()

        pred = np.argmax(np.array(pred_y), 1)
        true = np.array(true_y).reshape((-1,))
        age_acc = accuracy_score(true % 10, pred % 10)
        gen_acc = accuracy_score(true // 10, pred // 10)
        del pred, true, pred_y, true_y
        _ = gc.collect()

        if logger:
            logger.info('Epoch {}/{} Done - Test Loss: {:.6f}, Age Accuracy: {:.6f}, Gender Accuracy: {:.6f}, Combined Accuracy: {:.6f}'.format(
                epoch, task[-1][0], test_running_loss / test_n_batch, age_acc, gen_acc, age_acc + gen_acc))

        scheduler.step(test_running_loss / test_n_batch)
        if logger:
            logger.info('Epoch {}/{} - Updated Learning Rate: {:.8f}'.format(
                epoch, task[-1][0], optimizer.param_groups[0]['lr']))
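# Hedged usage sketch for train() above (not from the original project): the model
# class, column names and directory layout below are placeholders chosen only to
# illustrate the call signature. The evaluation loop inside train() reads splits 9
# and 10, so the task list here references splits 1-8, matching the "File {}/8" log.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MySeqClassifier()  # hypothetical nn.Module producing one logit per (age, gender) class
task = [(1, list(range(1, 9))), (2, list(range(1, 9)))]  # (epoch, training split ids)
train(model, task,
      y_list=['age', 'gender'],          # placeholder target columns
      x_list=['creative_id', 'ad_id'],   # placeholder embedded-sequence columns
      checkpoint_dir='./checkpoints',
      checkpoint_prefix='seq_clf',
      device=device,
      batch_size=512, max_seq_len=100, lr=1e-3)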
def _garbageCollect(self, task=None):
    gc.enable()
    gct = GCTrigger()
    gc.disable()
    return Task.cont
# In[ ]:

import os
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from skimage.io import imread
import matplotlib.pyplot as plt
from skimage.segmentation import mark_boundaries
from skimage.util.montage import montage2d as montage

montage_rgb = lambda x: np.stack(
    [montage(x[:, :, :, i]) for i in range(x.shape[3])], -1)

ship_dir = '../input'
train_image_dir = os.path.join(ship_dir, 'train_v2')
test_image_dir = os.path.join(ship_dir, 'test_v2')

import gc
gc.enable()  # memory is tight

# In[ ]:

masks = pd.read_csv(
    os.path.join('../input/', 'train_ship_segmentations_v2.csv'))
print(masks.shape[0], 'masks found')
print(masks['ImageId'].value_counts().shape[0])
masks['path'] = masks['ImageId'].map(
    lambda x: os.path.join(train_image_dir, x))
masks.head()

# # Split into training and validation groups
# We stratify by the number of boats appearing so we have nice balances in each set

# In[ ]:
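# A hedged sketch of the stratified split described above (not the notebook's own
# cell): group the masks by image, count the rows per image as a proxy for the
# number of boats, and stratify the train/validation split on that count. Counts
# that occur only once may need to be clipped or grouped before stratifying.
from sklearn.model_selection import train_test_split

unique_img_ids = masks.groupby('ImageId').size().reset_index(name='ships')
train_ids, valid_ids = train_test_split(unique_img_ids, test_size=0.3,
                                        stratify=unique_img_ids['ships'])
train_df = pd.merge(masks, train_ids, on='ImageId')
valid_df = pd.merge(masks, valid_ids, on='ImageId')
print(train_df.shape[0], 'training masks;', valid_df.shape[0], 'validation masks')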
def buildIndex(self):
    '''main of the program, creates the index'''
    gc.enable()
    self.index = defaultdict(lambda: array('L'))  # main index
    lengths = {}  # for calculating and storing document (cosine) lengths
    for doc in open(wdir + 'documents.list', 'rt'):
        fname = doc.rstrip()  # documents/LN-20020102023.vert
        path = wdir + fname
        f = gzip.open(path + '.gz', 'rt')

        # Parse file into sections and append text
        # parsedDoc = self.parseDoc(f)  # returns a dictionary of parsed xml sections
        # text = ''.join([v for k, v in parsedDoc.items() if v is not None and k != "docid"])
        # docid = parsedDoc["docid"]
        # if docid[0] == 'L':
        #     docid = '1' + docid[7:]  # begins with LN
        # else:
        #     docid = '2' + docid[7:]  # begins with MF
        # # docid = int(docid)
        docid, text = self.parseDoc(f)
        docid = self.truncateDocid(docid)
        # print("processing doc " + str(docid))

        pattern = (r"^[0-9]+\s+"  # word number
                   "([a-zěščřžťďňńáéíýóůA-ZĚŠČŘŽŤĎŇŃÁÉÍÝÓŮ]+)[0-9]*\s+"  # form
                   "[a-zěščřžťďňńáéíýóůA-ZĚŠČŘŽŤĎŇŃÁÉÍÝÓŮ]+[0-9]*[-_]?.*\s+"  # lemma
                   "[A-ZĚŠČŘŽŤĎŇŃÁÉÍÝÓŮ0-9-=]+\s+"
                   "[a-zěščřžťďňńáéíýóůA-ZĚŠČŘŽŤĎŇŃÁÉÍÝÓŮ]+$")
        tokens = re.findall(pattern, text, re.MULTILINE)
        counts = Counter(tokens)
        # print(counts)

        length = 0
        for token, cnt in counts.items():
            idPlusTf = self.combineInts(docid, cnt)
            length += cnt * cnt  # add sqrd components
            # if token not in self.index:
            self.index[token].append(idPlusTf)  # append a new entry and postings list
            # self.lexicon[token].append(token)
            # else:
            #     self.index[token].append(idPlusTf)
            # if docid not in postings:
            #     postings.append(idPlusTf)
        lengths[docid] = math.sqrt(length)  # sqrt

        # del postings
        del tokens
        del counts
        gc.collect()

    self.writeIndex()
    self.writeOffsets()

    # length = 0
    # for token, cnt in counts.items():
    #     length += cnt * cnt
    # lengths[docid] = math.sqrt(length)
    # del tokens
    # del counts
    # gc.collect()

    with gzip.open(wdir + '/output/lengths.gz', 'wt') as f:
        print("writing doc length")
        for docid, length in lengths.items():
            # print(self.expandDocid(docid) + '\t' + str(length) + '\n')
            f.write(self.expandDocid(docid) + '\t' + str(length) + '\n')
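# Hedged sketch (an assumption for illustration only -- the real combineInts()/
# expandDocid() helpers are not shown here): one common way to pack a document id
# and a term frequency into a single integer, as the array('L') postings lists
# above require. The 12-bit term-frequency width is arbitrary.
def combine_ints(docid, tf, tf_bits=12):
    # low tf_bits bits hold the (capped) term frequency, the rest hold the docid
    return (docid << tf_bits) | min(tf, (1 << tf_bits) - 1)

def split_ints(packed, tf_bits=12):
    return packed >> tf_bits, packed & ((1 << tf_bits) - 1)

assert split_ints(combine_ints(123456789, 7)) == (123456789, 7)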
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--only_augmenters", type=str,
        help="Names of augmenters to measure, regexes, delimiter is ','.")
    parser.add_argument("--nosave", action="store_true",
                        help="Whether not to save any results")
    args = parser.parse_args()

    if args.only_augmenters is not None:
        args.only_augmenters = [
            name.strip() for name in args.only_augmenters.split(",")
        ]
    args.save = (args.nosave is not True)
    if not args.save:
        print("[NOTE] will not save data")

    iterations = 100
    batch_sizes = [1, 128]
    backgrounds = [False]

    print("---------------------------")
    print("Images")
    print("---------------------------")
    results_images = []
    base_image = skimage.data.astronaut()
    images = [
        ia.imresize_single_image(base_image, (64, 64)),
        ia.imresize_single_image(base_image, (224, 224))
    ]
    for image in images:
        print("")
        print("image size: %s" % (image.shape, ))
        augmenters = create_augmenters(height=image.shape[0],
                                       width=image.shape[1],
                                       height_augmentable=image.shape[0],
                                       width_augmentable=image.shape[1],
                                       only_augmenters=args.only_augmenters)
        for batch_size in batch_sizes:
            if batch_size != batch_sizes[0]:
                print("")
            print("batch_size: %d" % (batch_size, ))
            for background in backgrounds:
                for augmenter in augmenters:
                    images_batch = np.uint8([image] * batch_size)
                    ia.seed(1)
                    times = []
                    gc.disable()  # as done in timeit
                    if not background:
                        for _ in sm.xrange(iterations):
                            time_start = time.time()
                            _img_aug = augmenter.augment_images(images_batch)
                            time_end = time.time()
                            times.append(time_end - time_start)
                    else:
                        batches = [
                            ia.Batch(images=images_batch)
                            for _ in sm.xrange(iterations)
                        ]
                        for _ in sm.xrange(iterations):
                            time_start = time.time()
                            gen = augmenter.augment_batches(batches,
                                                            background=True)
                            for _batch_aug in gen:
                                pass
                            time_end = time.time()
                            times.append(time_end - time_start)
                    gc.enable()

                    results_images.append({
                        "augmentable": "images",
                        "background": background,
                        "image.shape": image.shape,
                        "batch_size": batch_size,
                        "augmenter.name": augmenter.name,
                        "times": times
                    })

                    items_per_sec = (1 / np.average(times)) * batch_size
                    mbit_per_img = (image.size * image.dtype.itemsize * 8) / 1024 / 1024
                    mbit_per_sec = items_per_sec * mbit_per_img
                    print("IMG | HxW=%s B=%d %s "
                          "| SUM %10.5fs "
                          "| ITER avg %10.5fs, min %10.5fs, max %10.5fs "
                          "| img/s %11.3f "
                          "| mbit/s %9.3f, mbyte/s %9.3f "
                          "| %s" % (image.shape[0:2], batch_size,
                                    "BG" if background else "FG",
                                    float(np.sum(times)), np.average(times),
                                    np.min(times), np.max(times),
                                    items_per_sec, mbit_per_sec,
                                    mbit_per_sec / 8, augmenter.name))

    if args.save:
        current_dir = os.path.dirname(__file__)
        target_dir = os.path.join(current_dir, "measure_performance_results")
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        with open(os.path.join(target_dir, "results_images.pickle"), "wb") as f:
            pickle.dump(results_images, f, protocol=-1)

    print("---------------------------")
    print("Heatmaps")
    print("---------------------------")
    results_heatmaps = []
    for nb_heatmaps in [1, 5]:  # per image
        base_image = skimage.data.astronaut()
        images = [
            ia.imresize_single_image(base_image, (64, 64)),
            ia.imresize_single_image(base_image, (224, 224))
        ]
        heatmaps = [
            np.tile(heatmap[..., 0:1], (1, 1, nb_heatmaps))
            for heatmap in iaa.Grayscale(1.0).augment_images(images)
        ]
        heatmaps_ois = [
            ia.HeatmapsOnImage(heatmap.astype(np.float32) / 255.0,
                               shape=(224, 224, 3))
            for heatmap in heatmaps
        ]
        for heatmaps_oi in heatmaps_ois:
            print("")
            print("heatmap size: %s (on image: %s)" % (
                heatmaps_oi.arr_0to1.shape,
                heatmaps_oi.shape,
            ))
            augmenters = create_augmenters(
                height=heatmaps_oi.shape[0],
                width=heatmaps_oi.shape[1],
                height_augmentable=heatmaps_oi.arr_0to1.shape[0],
                width_augmentable=heatmaps_oi.arr_0to1.shape[1],
                only_augmenters=args.only_augmenters)
            for batch_size in batch_sizes:
                if batch_size != batch_sizes[0]:
                    print("")
                print("batch_size: %d" % (batch_size, ))
                for background in backgrounds:
                    for augmenter in augmenters:
                        heatmaps_oi_batch = [heatmaps_oi] * batch_size
                        ia.seed(1)
                        times = []
                        gc.disable()  # as done in timeit
                        if not background:
                            for _ in sm.xrange(iterations):
                                time_start = time.time()
                                _hms_aug = augmenter.augment_heatmaps(
                                    heatmaps_oi_batch)
                                time_end = time.time()
                                times.append(time_end - time_start)
                                gc.collect()
                        else:
                            batches = [
                                ia.Batch(heatmaps=heatmaps_oi_batch)
                                for _ in sm.xrange(iterations)
                            ]
                            for _ in sm.xrange(iterations):
                                time_start = time.time()
                                gen = augmenter.augment_batches(
                                    batches, background=True)
                                for _batch_aug in gen:
                                    pass
                                time_end = time.time()
                                times.append(time_end - time_start)
                                gc.collect()
                        # Re-enable the collector here, mirroring the images and
                        # keypoints sections (the source text read gc.disable()
                        # at this point, which would leave GC switched off).
                        gc.enable()

                        results_heatmaps.append({
                            "augmentable": "heatmaps",
                            "background": background,
                            "nb_heatmaps": nb_heatmaps,
                            "heatmaps_oi.arr_0to1.shape": heatmaps_oi.arr_0to1.shape,
                            "heatmaps_oi.shape": heatmaps_oi.shape,
                            "batch_size": batch_size,
                            "augmenter.name": augmenter.name,
                            "times": times
                        })

                        h, w, c = heatmaps_oi.arr_0to1.shape
                        items_per_sec = (1 / np.average(times)) * batch_size * c
                        mbit_per_img = (h * w * heatmaps_oi.arr_0to1.dtype.itemsize * 8) / 1024 / 1024
                        mbit_per_sec = items_per_sec * mbit_per_img
                        print("HMs | HxWxN=%s (on %s) B=%d %s "
                              "| SUM %10.5fs "
                              "| ITER avg %10.5fs, min %10.5fs, max %10.5fs "
                              "| hms/s %11.3f "
                              "| mbit/s %9.3f, mbyte/s %9.3f "
                              "| %s" % (heatmaps_oi.arr_0to1.shape[0:3],
                                        heatmaps_oi.shape[0:2], batch_size,
                                        "BG" if background else "FG",
                                        float(np.sum(times)),
                                        np.average(times), np.min(times),
                                        np.max(times), items_per_sec,
                                        mbit_per_sec, mbit_per_sec / 8,
                                        augmenter.name))

    if args.save:
        current_dir = os.path.dirname(__file__)
        target_dir = os.path.join(current_dir, "measure_performance_results")
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        with open(os.path.join(target_dir, "results_heatmaps.pickle"), "wb") as f:
            pickle.dump(results_heatmaps, f, protocol=-1)

    print("---------------------------")
    print("Keypoints")
    print("---------------------------")
    results_keypoints = []
    for nb_points in [1, 10]:  # per image
        base_image = skimage.data.astronaut()
        h, w = base_image.shape[0:2]
        if nb_points == 1:
            keypoints = [
                ia.Keypoint(x=x * w, y=y * h) for y, x in [(0.4, 0.4)]
            ]
        else:
            keypoints = [
                ia.Keypoint(x=x * w, y=y * h)
                for y, x in [(0.2, 0.2), (0.3, 0.3), (0.4, 0.4),
                             (0.6, 0.6), (0.7, 0.7), (0.8, 0.8),
                             (0.5, 0.25), (0.5, 0.75), (0.25, 0.5),
                             (0.75, 0.5)]
            ]
        base_image_kpoi = ia.KeypointsOnImage(keypoints, shape=(224, 224, 3))
        images = [
            ia.imresize_single_image(base_image, (64, 64)),
            ia.imresize_single_image(base_image, (224, 224))
        ]
        keypoints_on_images = [
            base_image_kpoi.on(image.shape) for image in images
        ]
        for keypoints_on_image in keypoints_on_images:
            print("")
            print("#points: %d (on image: %s)" % (
                len(keypoints_on_image.keypoints),
                keypoints_on_image.shape,
            ))
            augmenters = create_augmenters(
                height=keypoints_on_image.shape[0],
                width=keypoints_on_image.shape[1],
                height_augmentable=keypoints_on_image.shape[0],
                width_augmentable=keypoints_on_image.shape[1],
                only_augmenters=args.only_augmenters)
            for batch_size in batch_sizes:
                if batch_size != batch_sizes[0]:
                    print("")
                print("batch_size: %d" % (batch_size, ))
                for background in backgrounds:
                    for augmenter in augmenters:
                        keypoints_on_image_batch = [keypoints_on_image] * batch_size
                        ia.seed(1)
                        times = []
                        gc.disable()  # as done in timeit
                        if not background:
                            for _ in sm.xrange(iterations):
                                time_start = time.time()
                                _kps_aug = augmenter.augment_keypoints(
                                    keypoints_on_image_batch)
                                time_end = time.time()
                                times.append(time_end - time_start)
                                gc.collect()
                        else:
                            batches = [
                                ia.Batch(keypoints=keypoints_on_image_batch)
                                for _ in sm.xrange(iterations)
                            ]
                            for _ in sm.xrange(iterations):
                                time_start = time.time()
                                gen = augmenter.augment_batches(
                                    batches, background=True)
                                for _batch_aug in gen:
                                    pass
                                time_end = time.time()
                                times.append(time_end - time_start)
                        gc.enable()

                        results_keypoints.append({
                            "augmentable": "keypoints",
                            "background": background,
                            "nb_points": len(keypoints_on_image.keypoints),
                            "keypoints_on_image.shape": keypoints_on_image.shape,
                            "batch_size": batch_size,
                            "augmenter.name": augmenter.name,
                            "times": times
                        })

                        items_per_sec = (1 / np.average(times)) * batch_size * len(keypoints_on_image.keypoints)
                        mbit_per_img = (len(keypoints_on_image.keypoints) * 2 * 32) / 1024 / 1024
                        mbit_per_sec = items_per_sec * mbit_per_img
                        print("KPs | #points=%d (on %s) B=%d %s "
                              "| SUM %10.5fs "
                              "| ITER avg %10.5fs, min %10.5fs, max %10.5fs "
                              "| kps/s %11.3f "
                              "| mbit/s %9.3f, mbyte/s %9.3f "
                              "| %s" % (len(keypoints_on_image.keypoints),
                                        keypoints_on_image.shape[0:2],
                                        batch_size,
                                        "BG" if background else "FG",
                                        float(np.sum(times)),
                                        np.average(times), np.min(times),
                                        np.max(times), items_per_sec,
                                        mbit_per_sec, mbit_per_sec / 8,
                                        augmenter.name))

    if args.save:
        current_dir = os.path.dirname(__file__)
        target_dir = os.path.join(current_dir, "measure_performance_results")
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        with open(os.path.join(target_dir, "results_keypoints.pickle"), "wb") as f:
            pickle.dump(results_keypoints, f, protocol=-1)
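# Example invocations (sketch; the script file name below is an assumption -- use
# whatever this module is saved as):
#
#   python measure_performance.py                                   # measure and pickle everything
#   python measure_performance.py --nosave                          # measure, keep nothing
#   python measure_performance.py --only_augmenters "Fliplr,Add.*" --nosave
#
# The usual entry-point guard, in case the module does not already have one:
if __name__ == "__main__":
    main()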
def __execute_child(self, args, executable, preexec_fn, close_fds, cwd, env,
                    universal_newlines, startupinfo, creationflags, shell,
                    p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite):
    """
    Executes the program using posix_spawn().

    This is based on the method from the superclass but the posix_spawn API
    forces a number of changes. In particular:

    * When using fork() FDs are manipulated in the child process after the
      fork, but before the program is exec()ed. With posix_spawn() this is
      done by passing a data-structure to the posix_spawn() call, which
      describes the FD manipulations to perform.

    * The fork() version waits until after the fork before unsetting the
      non-blocking flag on the FDs that the child has inherited. In the
      posix_spawn() version, we cannot do that after the fork so we dup the
      FDs in advance and unset the flag on the duped FD, which we then pass
      to the child.
    """
    if preexec_fn is not None:
        raise NotImplementedError("preexec_fn not supported")
    if close_fds:
        raise NotImplementedError("close_fds not implemented")
    if cwd:
        raise NotImplementedError("cwd not implemented")  # pragma: no cover
    if universal_newlines:
        raise NotImplementedError()  # pragma: no cover
    assert startupinfo is None and creationflags == 0

    _log.debug("Pipes: p2c %s, %s; c2p %s, %s; err %s, %s",
               p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite)

    if isinstance(args, types.StringTypes):
        args = [args]
    else:
        args = [a.encode("ascii") for a in args]

    if shell:
        args = ["/bin/sh", "-c"] + args
        if executable:
            args[0] = executable

    if executable is None:
        executable = args[0]

    self._loop.install_sigchld()

    # The FileActions object is an ordered list of FD operations for
    # posix_spawn to do in the child process before it execs the new
    # program.
    file_actions = FileActions()

    # In the child, close parent's pipe ends.
    if p2cwrite is not None:
        file_actions.add_close(p2cwrite)
    if c2pread is not None:
        file_actions.add_close(c2pread)
    if errread is not None:
        file_actions.add_close(errread)

    # When duping fds, if there arises a situation where one of the fds
    # is either 0, 1 or 2, it is possible that it is overwritten (#12607).
    fds_to_close_in_parent = []
    if c2pwrite == 0:
        c2pwrite = os.dup(c2pwrite)
        fds_to_close_in_parent.append(c2pwrite)
    if errwrite == 0 or errwrite == 1:
        errwrite = os.dup(errwrite)
        fds_to_close_in_parent.append(errwrite)

    # Dup stdin/out/err FDs in child.
    def _dup2(dup_from, dup_to):
        if dup_from is None:
            # Pass through the existing FD.
            dup_from = dup_to
        # Need to take a dup so we can remove the non-blocking flag
        a_dup = os.dup(dup_from)
        _log.debug("Duped %s as %s", dup_from, a_dup)
        fds_to_close_in_parent.append(a_dup)
        self._remove_nonblock_flag(a_dup)
        file_actions.add_dup2(a_dup, dup_to)
    _dup2(p2cread, 0)
    _dup2(c2pwrite, 1)
    _dup2(errwrite, 2)

    # Close pipe fds in the child. Make sure we don't close the same fd
    # more than once, or standard fds.
    for fd in set([p2cread, c2pwrite, errwrite]):
        if fd > 2:
            file_actions.add_close(fd)

    gc_was_enabled = gc.isenabled()
    # FIXME Does this bug apply to posix_spawn version?
    try:
        # Disable gc to avoid bug where gc -> file_dealloc ->
        # write to stderr -> hang. http://bugs.python.org/issue1336
        gc.disable()
        self.pid = posix_spawnp(
            executable, args,
            file_actions=file_actions,
            env=env,
        )
    except:
        if gc_was_enabled:
            gc.enable()
        raise
    finally:
        for fd in fds_to_close_in_parent:
            os.close(fd)

    # Capture the SIGCHILD.
    self._watcher = self._loop.child(self.pid)
    self._watcher.start(self._on_child, self._watcher)

    if gc_was_enabled:
        gc.enable()

    # Close the Child's pipe ends in the parent.
    if p2cread is not None and p2cwrite is not None:
        os.close(p2cread)
    if c2pwrite is not None and c2pread is not None:
        os.close(c2pwrite)
    if errwrite is not None and errread is not None:
        os.close(errwrite)
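# For comparison, a minimal sketch (not part of the module above): modern CPython
# (3.8+) exposes the same idea through the standard-library os.posix_spawn, where
# the FD manipulations are likewise declared up front instead of being performed
# in the child after a fork. POSIX only; the command and pipe wiring here are
# illustrative.
import os

r, w = os.pipe()
pid = os.posix_spawn(
    "/bin/sh", ["/bin/sh", "-c", "echo hello"], dict(os.environ),
    file_actions=[
        (os.POSIX_SPAWN_DUP2, w, 1),   # child's stdout -> write end of the pipe
        (os.POSIX_SPAWN_CLOSE, r),     # child does not need the read end
    ],
)
os.close(w)                 # parent keeps only the read end
print(os.read(r, 1024))     # b'hello\n'
os.waitpid(pid, 0)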
def sort_columns(input_filename='-', output_filename='-', columns=None,
                 header=False, ignore_case=False, unique=False, tmp_dir=None,
                 buffer_size='80%', parallel=multiprocessing.cpu_count(),
                 compress_program=None):
    """
    It sorts the input file (text tab separated file) based on the specified
    columns. It works like SELECT * ORDER BY in SQL.
    """
    import locale
    locale.setlocale(locale.LC_ALL, 'C')

    sh1 = give_me_temp_filename(tmp_dir)
    sh2 = give_me_temp_filename(tmp_dir)

    # check whether SORT supports --parallel
    sort_parallel = False
    r = os.system("sort --help | grep 'parallel' > '%s'" % (sh1, ))
    if (not r) and (not empty(sh1)) and len(file(sh1, 'r').readlines()) == 1:
        sort_parallel = True
    delete_file(sh1)

    # check whether SORT supports --buffer-size
    sort_buffer = False
    r = os.system("sort --help | grep 'buffer-size' > '%s'" % (sh1, ))
    if (not r) and (not empty(sh1)) and len(file(sh1, 'r').readlines()) == 1:
        sort_buffer = True
    delete_file(sh1)

    # check whether SORT supports --compress-program
    sort_compress = False
    if compress_program:
        r = os.system(
            "sort --help | grep 'compress-program' > '%s' ; %s --help 2>/dev/null | grep -i 'compress' > '%s'"
            % (sh1, compress_program, sh2))
        if (not r) and ((not empty(sh1)) and len(file(sh1, 'r').readlines()) == 1
                        and (not empty(sh2)) and len(file(sh2, 'r').readlines()) >= 1):
            sort_compress = True
    delete_file(sh1)
    delete_file(sh2)

    # treat the case when the input file is coming from the standard input
    fin = input_filename.strip('"').strip("'")
    if fin == '-':
        fin = give_me_temp_filename(tmp_dir)
        fod = open(fin, 'w')
        fid = sys.stdin
        while True:
            lines = fid.readlines(10**8)
            if not lines:
                break
            fod.writelines(lines)
        fod.close()

    fon = output_filename.strip('"').strip("'")
    if fon == '-':
        fon = give_me_temp_filename(tmp_dir)

    if header:
        header_saved = file(fin, 'r').readline()
        file(output_filename, 'w').write(header_saved)
    else:
        file(output_filename, 'w').write('')

    # process the type of the columns, numeric or string
    first_line = file(fin, 'r').readline()
    if first_line:
        # read the first line in order to find out the number of columns
        nc = len(file(fin, 'r').readline().rstrip('\r\n').split('\t'))
        if columns:
            columns = columns.strip().lower()
            if columns == 'd':
                columns = ','.join([str(i + 1) + 'd' for i in range(nc)])
            elif columns == 'n':
                columns = ','.join([str(i + 1) + 'n' for i in range(nc)])
            elif columns == 'nd' or columns == 'dn':
                columns = ','.join([str(i + 1) + 'nd' for i in range(nc)])
        else:
            columns = ','.join([str(i + 1) for i in range(nc)])

        # extra parameters
        extra = ""
        if sort_buffer and buffer_size and buffer_size != 'no' and buffer_size != 'none':
            extra = extra + ' --buffer-size=' + str(buffer_size) + ' '
        if sort_parallel and parallel and parallel > 1:
            extra = extra + ' --parallel=' + str(parallel) + ' '
        if sort_compress and compress_program and compress_program.lower() != 'no' and compress_program.lower() != 'none':
            extra = extra + ' --compress-program=' + compress_program + ' '

        # processing the input columns
        columns = [
            '-k ' + el + ',' + el.replace('n', '').replace('r', '')
            for el in columns.replace('d', 'r').split(',')
        ]
        comd = "-s -t '\t' " + " ".join(columns)
        if ignore_case:
            comd = "-f " + comd
        if unique:
            comd = "-u " + comd
        if tmp_dir:
            comd = "-T '" + tmp_dir + "' " + comd
        if header:
            comd = "LC_ALL=C sed 1d '" + fin + "' | LC_ALL=C sort " + extra + comd + " >> '" + output_filename + "'"
        else:
            comd = "LC_ALL=C sort " + extra + comd + " '" + fin + "' >> '" + output_filename + "'"

        r = os.system(comd)
        if r != 0:
            print >>sys.stderr, "ERROR (sort_ttdb.py) while running:"
            print >>sys.stderr, comd
            sys.exit(1)

    if input_filename == '-':
        os.remove(fin)

    if output_filename == '-':
        fod = sys.stdout
        fid = open(fon, 'r')
        while True:
            gc.disable()
            lines = fid.readlines(10**8)
            gc.enable()
            if not lines:
                break
            fod.writelines(lines)
        fid.close()
        os.remove(fon)
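# Hedged usage sketch for sort_columns() above (the file names are placeholders):
# sort a tab-separated file on column 1 as text and on column 3 numerically in
# descending order, keeping the header line at the top of the output.
sort_columns(input_filename='input.tsv',
             output_filename='sorted.tsv',
             columns='1,3nd',
             header=True,
             tmp_dir='/tmp/')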
# # 1. Import and Reshape Data
# First we load the necessary Python packages and then we import the CSV files that were provided by Instacart.
#
# ## 1.1 Import the required packages
# The garbage collector (package gc) attempts to reclaim garbage, i.e. memory occupied by objects (e.g., DataFrames) that are no longer in use by Python ([ref1](https://www.techopedia.com/definition/1083/garbage-collection-gc-general-programming), [ref2](https://en.wikipedia.org/wiki/Garbage_collection_(computer_science))). This package helps us avoid exceeding the 16GB of RAM that Kaggle offers (a short illustration of the pattern follows the loading cells below).
#
# The **"as"** reserved word defines an alias for a package. The alias makes it easier to refer to the package in our code.

# In[2]:

# For data manipulation
import pandas as pd
# Garbage Collector to free up memory
import gc
gc.enable()  # Activate

# ## 1.2 Load data from the CSV files
# Instacart provides 6 CSV files, which we have to load into Python. To this end, we use the .read_csv() function, which is included in the Pandas package. Reading in data with .read_csv() returns a DataFrame.

# In[3]:

orders = pd.read_csv('orders.csv')
order_products_train = pd.read_csv('order_products__train.csv')
order_products_prior = pd.read_csv('order_products__prior.csv')
products = pd.read_csv('products.csv')
aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')

# This step results in the following DataFrames:
# * <b>orders</b>: This table includes all orders, namely prior, train, and test. It has a single primary key (<b>order_id</b>).
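# In[ ]:

# A small, hedged illustration of the garbage-collection pattern described in
# section 1.1 (the merge below is only an example of a large intermediate result,
# not part of the original analysis; column names follow the standard Instacart
# schema).
prior_with_names = order_products_prior.merge(products, on='product_id', how='left')
print(prior_with_names.shape)

# Once the intermediate DataFrame is no longer needed, drop the reference and
# ask the collector to reclaim the memory right away.
del prior_with_names
gc.collect()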
def _fork(self, path, uid, gid, executable, args, environment, **kwargs):
    """
    Fork and then exec sub-process.

    @param path: the path where to run the new process.
    @type path: L{bytes} or L{unicode}
    @param uid: if defined, the uid used to run the new process.
    @type uid: L{int}
    @param gid: if defined, the gid used to run the new process.
    @type gid: L{int}
    @param executable: the executable to run in a new process.
    @type executable: L{str}
    @param args: arguments used to create the new process.
    @type args: L{list}.
    @param environment: environment used for the new process.
    @type environment: L{dict}.
    @param kwargs: keyword arguments to L{_setupChild} method.
    """
    collectorEnabled = gc.isenabled()
    gc.disable()
    try:
        self.pid = os.fork()
    except:
        # Still in the parent process
        if collectorEnabled:
            gc.enable()
        raise
    else:
        if self.pid == 0:
            # A return value of 0 from fork() indicates that we are now
            # executing in the child process.
            # Do not put *ANY* code outside the try block. The child
            # process must either exec or _exit. If it gets outside this
            # block (due to an exception that is not handled here, but
            # which might be handled higher up), there will be two copies
            # of the parent running in parallel, doing all kinds of damage.
            # After each change to this code, review it to make sure there
            # are no exit paths.
            try:
                # Stop debugging. If I am, I don't care anymore.
                sys.settrace(None)
                self._setupChild(**kwargs)
                self._execChild(path, uid, gid, executable, args, environment)
            except:
                # If there are errors, try to write something descriptive
                # to stderr before exiting.
                # The parent's stderr isn't *necessarily* fd 2 anymore, or
                # even still available; however, even libc assumes that
                # write(2, err) is a useful thing to attempt.
                try:
                    stderr = os.fdopen(2, 'wb')
                    msg = ("Upon execvpe {0} {1} in environment id {2}"
                           "\n:").format(executable, str(args), id(environment))
                    if _PY3:
                        # On Python 3, print_exc takes a text stream, but
                        # on Python 2 it still takes a byte stream. So on
                        # Python 3 we will wrap up the byte stream returned
                        # by os.fdopen using TextIOWrapper.
                        # We hard-code UTF-8 as the encoding here, rather
                        # than looking at something like
                        # getfilesystemencoding() or sys.stderr.encoding,
                        # because we want an encoding that will be able to
                        # encode the full range of code points. We are
                        # (most likely) talking to the parent process on
                        # the other end of this pipe and not the filesystem
                        # or the original sys.stderr, so there's no point
                        # in trying to match the encoding of one of those
                        # objects.
                        stderr = io.TextIOWrapper(stderr, encoding="utf-8")
                    stderr.write(msg)
                    traceback.print_exc(file=stderr)
                    stderr.flush()
                    for fd in xrange(3):
                        os.close(fd)
                except:
                    # Handle all errors during the error-reporting process
                    # silently to ensure that the child terminates.
                    pass
            # See comment above about making sure that we reach this line
            # of code.
            os._exit(1)

    # we are now in parent process
    if collectorEnabled:
        gc.enable()
    self.status = -1  # this records the exit status of the child
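# A minimal sketch of the GC-handling pattern used above (not the library's actual
# code): remember whether the collector was enabled, switch it off around fork(),
# and restore it in the parent, and only the parent, afterwards.
import gc
import os

was_enabled = gc.isenabled()
gc.disable()
try:
    pid = os.fork()
except OSError:
    # fork failed; we are still the (only) process, so restore the collector
    if was_enabled:
        gc.enable()
    raise
if pid == 0:
    # child: exec or exit immediately, never fall through into parent code
    os._exit(0)
if was_enabled:
    gc.enable()          # parent: restore the collector
os.waitpid(pid, 0)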