def rate_key_size(key_size, ciphertext):
    dist = 0
    for block_1, block_2 in zip(chunks(ciphertext, key_size), chunks(ciphertext, key_size)[1:]):
        dist += hamming_distance(block_1, block_2)
    dist /= len(ciphertext) / key_size
    normalized = dist / key_size
    return normalized
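# Every example in this collection assumes a `chunks` helper that splits a
# sequence into fixed-size pieces. The implementations differ across these
# codebases (some return a list, as the slicing above implies; others return a
# generator, as the `portions.next()` call in `_populate_plankton` below implies;
# one even takes its arguments in the opposite order). A minimal list-returning
# sketch -- an assumption for illustration, not any particular project's version:
def chunks(seq, n):
    """Split seq into pieces of length n; the last piece may be shorter."""
    return [seq[i:i + n] for i in range(0, len(seq), n)]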
def patient_report(request, patient_key):
    try:
        patient = Patient.objects.get(key=patient_key)
    except Patient.DoesNotExist:
        raise Http404

    chads2_relative = {
        'sad': round(patient.chads2_risk()['percentage']),
        'happy': 100 - round(patient.chads2_risk()['percentage'])
    }
    faces = ('S' * chads2_relative['sad']) + ('H' * chads2_relative['happy'])
    chads2_relative['rows'] = utils.chunks(faces, 10)

    hasbled_relative = {
        'sad': round(patient.hasbled_risk()['percentage']),
        'happy': 100 - round(patient.hasbled_risk()['percentage'])
    }
    faces = ('S' * hasbled_relative['sad']) + ('H' * hasbled_relative['happy'])
    hasbled_relative['rows'] = utils.chunks(faces, 10)

    context = {
        'patient': patient,
        'chads2_relative': chads2_relative,
        'hasbled_relative': hasbled_relative,
    }
    return render(request, 'data_entry/patients/reports/patient.html', context)
def test_forest_classifiers(self):
    """
    Confirm the basic accuracy of our classifiers.
    """
    # http://scikit-learn.org/stable/datasets/
    n_estimators = 100

    # The number of parts the dataset will be split into.
    parts = 10

    datasets = [
        ('Iris', load_iris()),
        ('Digits', load_digits()),
    ]

    classifiers = [
        (AdaBoostClassifier, partial(AdaBoostClassifier, n_estimators=n_estimators)),
        (ExtraTreesClassifier, partial(ExtraTreesClassifier, n_estimators=n_estimators)),
        # (DecisionTreeClassifier, DecisionTreeClassifier),
        (StreamingDecisionTreeClassifier, partial(StreamingDecisionTreeClassifier, n_estimators=n_estimators)),
        # (RandomForestClassifier, partial(RandomForestClassifier, n_estimators=n_estimators)),
        (StreamingRandomForestClassifier, partial(StreamingRandomForestClassifier, n_estimators=n_estimators)),
        # (ExtraTreesClassifier, partial(ExtraTreesClassifier, n_estimators=n_estimators)),
        (StreamingExtraTreesClassifier, partial(StreamingExtraTreesClassifier, n_estimators=n_estimators)),
    ]

    for name, dataset in datasets:
        print('\nDataset\t%s' % name, len(dataset.data))

        # Split our dataset into evenly-sized parts, simulating having
        # to train our classifiers out-of-core on massive datasets.
        # Note, the reference classifiers that don't support partial_fit()
        # will only be trained on each individual chunk.
        parts_n = len(dataset.data) / parts
        data_chunks = list(utils.chunks(dataset.data, parts_n))
        target_chunks = list(utils.chunks(dataset.target, parts_n))

        print('Score\tClassifier')
        for cls, cls_callable in classifiers:
            random.seed(0)
            clf = cls_callable()
            for data, target in zip(data_chunks, target_chunks):
                # print(data)
                # print(target)
                assert len(data) == len(target)
                if hasattr(clf, 'partial_fit'):
                    clf.partial_fit(data, target)
                else:
                    clf.fit(data, target)
            # scores = cross_val_score(clf, data, target)
            # score = scores.mean()
            score = clf.score(dataset.data, dataset.target)
            print('%.04f\t%s' % (score, cls.__name__))
def copy_csvfile_to_table(f, table_name, delimiter, output_stream, db_params):
    with DBConnection(db_params) as conn:
        cur = conn.cursor()

        # cur.copy_from() is awkward to use here (quotes, error messages, ...),
        # so we have to reinvent the wheel. Ugly, but it works.
        input_file = csv.DictReader(f, delimiter=delimiter)

        processed_rows_counter = 0
        fields_list = []
        all_values = []

        for row in input_file:
            values = ()
            for k, v in row.iteritems():
                if processed_rows_counter == 0:  # Only needed once!
                    fields_list.append(k)
                values = values + (v,)
            all_values.append(values)
            processed_rows_counter += 1

        s = _get_insert_string(table_name, fields_list)

        for vals in chunks(all_values, 500):
            cur.executemany(s, vals)
            # if cur.rowcount != 1:
            #     output_stream.write("ERROR: rowcount is {rowcount} for {query}\n".format(rowcount=cur.rowcount, query=s))

        conn.commit()

        output_stream.write("{i} processed rows ".format(i=processed_rows_counter))
def new_scrobble(self, **kwargs):
    """
    Keyword arguments are the exact same as Scrobble.__init__.
    It makes the new scrobble object, then tries to send all
    scrobbles that need sending.
    """
    kwargs['session'] = self
    kwargs['sent'] = False
    new_scrobble = Scrobble(**kwargs)

    scrobbles = self.get_failed_scrobbles()

    try:
        for ss in chunks(scrobbles, chunksize=50):
            # Send previously failed scrobbles to lastfm in chunks of 50.
            ss = ScrobbleSet(ss)
            ss.try_to_send()
    except LastFMError:
        # There were old scrobbles that needed to be sent first, and some of
        # them failed. Queue this one up, and don't bother sending any more
        # (lastfm is most likely down).
        new_scrobble.sent = False
        new_scrobble.save()
        return False
    else:
        # Either there were no old scrobbles to send first, or they all were
        # sent successfully! Send the new scrobble now.
        new_scrobble.timestamp = int(time.time())
        new_scrobble.send()
        return True
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        bibcodes = map(lambda a: a.strip(), args['bibcodes'])
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    # Split the list of bibcodes up in chunks, for parallel processing
    biblists = list(chunks(bibcodes, config.METRICS_CHUNK_SIZE))
    # Now gather all usage data numbers from the MongoDB 'adsdata' collection,
    # keyed on bibcode
    ads_data = get_mongo_data(bibcodes=bibcodes)
    missing_bibcodes = filter(lambda a: a not in ads_data.keys(), bibcodes)
    app.logger.error("Bibcodes found with missing metadata: %s" % ",".join(missing_bibcodes))
    bibcodes = filter(lambda a: a not in missing_bibcodes, bibcodes)
    # Get precomputed and citation data
    metrics_data = get_metrics_data(bibcodes=bibcodes)
    # Get the number of citing papers
    Nciting = len(list(set(itertools.chain(*map(lambda a: a['citations'], metrics_data.values())))))
    Nciting_ref = len(list(set(itertools.chain(*map(lambda a: a['refereed_citations'], metrics_data.values())))))
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes, ads_data, metrics_data)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)
    return attr_list, Nciting, Nciting_ref
def play_sound(fname):
    global cfg_dict
    cfg_dict = utils.read_config()

    audio = pyaudio.PyAudio()
    stream_audio = audio.open(format=pyaudio.paInt16,
                              channels=2,
                              rate=cfg_dict['rate'],
                              output=True,
                              frames_per_buffer=cfg_dict['rate'])

    f = open(fname)
    data = list(utils.chunks(f.read(), cfg_dict['rate']))
    f.close()

    print ' * play'
    start = time.time()
    for i in range(len(data)):
        stream_audio.write(data[i])
    print ' * play end |',

    stream_audio.stop_stream()
    stream_audio.close()
    audio.terminate()
    print 'elapsed time =', time.time() - start
def _decodePeersInfo(self, str):
    """ Decode all peers. """
    peers = utils.chunks(str, 7)
    peers = map(self._decodePeerInfo, peers)
    return list(peers)
def insert_many(table_name, fields_list, all_values, db_params):
    with DBConnection(db_params) as conn:
        cur = conn.cursor()
        s = _get_insert_string(table_name, fields_list)
        for vals in chunks(all_values, 500):
            cur.executemany(s, vals)
        conn.commit()
def stop(self):
    # Handle the saved bits
    for chunk in chunks(self.extra_bits, 8):
        if len(chunk) == 8:
            byte = sum([x << i for i, x in enumerate(chunk)])
            for x in self.chain:
                x.put(byte)

    # Continue
    return super(VonNeumannExtractor, self).stop()
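# Worked illustration of the bit-packing above (least-significant bit first):
# the chunk [1, 0, 1, 1, 0, 0, 0, 0] becomes 1<<0 + 1<<2 + 1<<3 = 13.
assert sum(x << i for i, x in enumerate([1, 0, 1, 1, 0, 0, 0, 0])) == 13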
def _populate_plankton(self):
    print 'Populating Planktons...'
    portions = utils.chunks(range(self.wally.shape[0]),
                            self.wally.shape[0] / len(self.qm.keys()))
    for p in self.qm.keys():
        r = portions.next()
        self.qm[p]['range'] = (r[0], r[-1])
        self.qm[p]['q_in'].put([self.capacity, self.mutation, r[0],
                                self.wally[r[0]:r[-1] + 1],
                                self.meteo[:, r[0]:r[-1] + 1],
                                self.nada])
        print '\tPLANKTON QM - %i \trange: %i - %i' % (p, r[0], r[-1])
def admin_clearAllBuildData(req):
    all = getAllFromFromQuery(datamodel.DB_FileBuild.all(keys_only=True))
    for x in chunks(all, 1000):
        db.delete(x)
    nextVersionNum = datamodel.DB_JV_AtomicCounter.GetNextCounter(_fileVerKey)

    from inmemconfig import InAppMemConfig
    InAppMemConfig.ForceVersionIncrement()

    return RetType.JSONSUCCESS
def correct(self, word):
    global WORD
    WORD = word
    p = Pool(self.n_jobs)
    chunk_size = int(len(self.forms) / self.n_jobs)
    # form, max_prob = get_most_probable_from_chunk(self.forms)
    arguments = chunks(self.forms, chunk_size)
    results = p.map(get_most_probable_from_chunk, arguments)
    p.close()
    p.join()
    form, max_prob = max(results, key=operator.itemgetter(1))
    return form
def put(self, filename):
    file_uuid = uuid4().hex
    file = File(name=str(urllib.unquote(filename)),
                content_type=self.request.headers.get('Content-Type', None),
                key_name=file_uuid)
    file.put()

    for chunk in chunks(self.request.body, config.max_fragment_size):
        Fragment(file=file, data=chunk).put()

    self.response.set_status(201)
    self.response.out.write(file_uuid)
def dump(self):
    result = ""
    result += "Start: " + self.exploit.pointer_format % self.start + " (" + self.exploit.closest_section_from_address(self.start) + ")\n"
    result += "Size: " + self.exploit.pointer_format % self.size + " (" + str(self.size) + ")\n"
    result += "End: " + self.exploit.pointer_format % self.end + "\n"
    result += "Base: " + self.exploit.pointer_format % self.align_to + "\n"
    result += "Alignment: " + str(self.alignment) + "\n"
    result += "Index: " + hex(self.index) + " (" + str(self.index) + ")\n"
    result += "Wasted: " + str(self.wasted) + "\n"
    result += "Content:\n"
    for chunk in chunks(self.content, self.exploit.pointer_size):
        result += " " * 4 + " ".join(["%.2x" % ord(c) for c in chunk]) + " " + \
            (self.exploit.pointer_format % self.exploit.str2ptr(chunk) if len(chunk) == self.exploit.pointer_size else "") + "\n"
    return result
def sites_update():
    master_list = "http://www.metoffice.gov.uk/public/data/PWSCache/Locations/MasterList?format=application/json"
    # master_list = "http://localhost/~gareth/MasterList.json"
    result = urlfetch.fetch(master_list)
    if result.status_code == 200:
        obs_sites = filter(lambda loc: loc["type"] == "Observing Site",
                           parse_locations(result.content))
        for chunk in chunks(obs_sites, 10):
            taskqueue.add(url="/admin/sites/store", params={"sites": json.dumps(chunk)})
        flash("Started load of %d sites" % len(obs_sites))
    else:
        flash("Error fetching MasterList: [%d] - %s" % (result.status_code, result.status_message))
    return redirect(url_for('index'))
def save_states(q, gpu, target, limit, mem_ratio, model_dir, seed=0, chunksize=1000):
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu
    print 'GPU {}'.format(gpu)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_ratio)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        load_graph(os.path.join(model_dir, 'classify_image_graph_def.pb'))
        next_last_layer = sess.graph.get_tensor_by_name('pool_3:0')

        while True:
            source = q.get()
            if source == KILL:
                break

            images = glob.glob('{}/*'.format(source))
            random.seed(seed)
            random.shuffle(images)
            if limit > 0:
                images = images[:limit]

            t0 = time.time()
            h5name = os.path.join(target, '{}.h5'.format(os.path.basename(os.path.normpath(source))))
            with pd.HDFStore(h5name, mode='w', complevel=9, complib='blosc') as store:
                for chunk in chunks(images, chunksize):
                    states = []
                    for jpg in list(chunk):  # Creates a copy over which it is safe to iterate
                        try:
                            raw_data = gfile.FastGFile(jpg).read()
                            hidden_layer = sess.run(next_last_layer, {'DecodeJpeg/contents:0': raw_data})
                            hidden_layer = np.squeeze(hidden_layer)
                            states.append(hidden_layer)
                        except Exception as e:
                            chunk.remove(jpg)
                            print 'Something went wrong when processing {}'.format(jpg)

                    X = np.vstack(states)
                    columns = ['f{}'.format(i) for i in range(X.shape[1])]
                    df = pd.DataFrame(data=X, index=chunk, columns=columns)
                    df.index.name = 'filename'
                    store.append('data', df)

            print('Time spent collecting {} states: {}'.format(len(images), time.time() - t0))
def imported(self, date_str, **kwargs):
    start_id = kwargs.get('start', 1)
    end_id = kwargs.get('end', -1)
    ids = self.database_service.load_ids(self.market, start_id, end_id)
    for batch_app_ids in chunks(ids, DEFAULT_BATCH_SIZE):
        print 'Started to import batch:', len(batch_app_ids)
        logger.info('Started to import batch: {}'.format(len(batch_app_ids)))
        for app_id in batch_app_ids:
            content = self._load(date_str, app_id)
            detail_dict = self._parser(content)
            self._save(app_id, detail_dict)
        garbage_number = gc.collect()
        print 'Garbage number:', garbage_number
    self.database_service.close()
def __init__(self):
    self.cells = []
    for _ in xrange(4):
        self.cells.append(Cell())

    self.foundations = []
    for _ in xrange(4):
        self.foundations.append(Foundation())

    self.cascades = []
    for _ in xrange(8):
        self.cascades.append(Cascade())

    # TODO fill cascades with cards
    deck = Deck()
    deck.shuffle()
    for chunk in chunks(deck, 8):
        for cascade, card in izip(self.cascades, chunk):
            cascade.append(card)
def write_flag_export(key_list):
    connection = connect_db(ORACONN)
    cursor = connection.cursor()
    date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        for k in list(chunks(key_list, 1000)):
            cursor.execute(
                "UPDATE EXP_LOG_KEYS_UNIFARM SET FLAG_EXPORT=0, UPDATE_DATE_TIME='%s' WHERE SYNC_KEY IN %s" % (
                    date, k))
            logger.info('___Update flag for synckeys {0} ok!!!___'.format(k))
        connection.commit()
        ret_val = True
    except (Exception, cx_Oracle.DatabaseError):
        logger.error("___Something was wrong during the write on the DB___")
        ret_val = False
    cursor.close()
    connection.close()
    return ret_val
def process_elm_simple(options):
    """Determines (and writes) the ELM dictionary"""
    c_arg = ''
    if options.process_elm_simple.get('picloud', False):
        c_arg = '-c'
    for genome in ('H_sapiens', 'Gallus_gallus'):
        ofile = os.path.join('working', 'Jul22', 'elmdict_' + genome + '.simple')
        ifile = os.path.join('working', 'Jul22', genome + '.fa')
        st_elm_file = os.path.join('working', 'Jul22', 'simple_patterns')

        elms = {}
        with open(st_elm_file) as f:
            for line in f:
                elm, pattern = line.strip().split('\t')
                elms[elm] = pattern

        elm_files = []
        size = 1000
        if len(elms) > size:
            counter = 0
            for chunk in utils.chunks(elms.keys(), size):
                new_elm_file = 'working/elm_tmp_file' + str(counter)
                elm_files.append(new_elm_file)
                with open(new_elm_file, 'w') as f:
                    for elm in chunk:
                        f.write(elm + '\t' + elms[elm] + '\n')
                counter += 1
        else:
            elm_files.append(st_elm_file)

        if not os.path.exists(ofile) or options.process_elm_simple.get('forcenew', False):
            counter = 0
            for elmfile in elm_files:
                # only do if missing or FORCING
                sh('python makeELMdict.py %(c)s -o %(out)s %(infile)s %(elm)s'
                   % {'out': ofile, 'c': c_arg, 'infile': ifile, 'elm': elmfile})
                sh('mv ' + ofile + ' ' + ofile + str(counter))
                counter += 1
def encode(self, s, block=None):
    """ Encode a message.

    Parameters
    ----------
    s : str
        A message to encode.
    block : int, optional
        Divide output into blocks of this size.  All non-transcodable
        symbols will be stripped.  Specify the value `0` to strip all
        non-transcodable symbols and not divide into blocks.
        Specify the value `None` to disable chunking.  Default `None`.

    Returns
    -------
    out : str
        The encoded message.

    Notes
    -----
    Although this can invoke either `self._encode` or `super().encode`,
    it essentially falls prey to the "call super" antipattern and
    should probably be refactored.  [TODO]

    """
    if block is not None:
        # filter message to characters in ciphertext alphabet
        s = intersect(s, self.alphabet)
        if block > 0:
            padding = upward_factor(block, len(s))
            s = s.ljust(padding, self.DEFAULT_NULLCHAR)
    out = super().encode(s)
    if block is not None and block > 0:
        out = ' '.join(chunks(out, block))
    return ''.join(out)
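# A standalone illustration of the block-grouping step in encode() above.
# Hedged: the ciphertext string is made up, 'X' stands in for DEFAULT_NULLCHAR,
# and the helper below is hypothetical -- it only mirrors the pad-then-chunk
# behaviour of the method, not the cipher itself.
def _group_into_blocks(out, block, nullchar='X'):
    # Pad up to the next multiple of `block`, as encode() does via
    # upward_factor()/ljust(), then join fixed-size groups with spaces.
    padded_len = -(-len(out) // block) * block   # ceiling division
    out = out.ljust(padded_len, nullchar)
    return ' '.join(out[i:i + block] for i in range(0, len(out), block))

# _group_into_blocks("WKHTXLFNEURZ", 5)  ->  'WKHTX LFNEU RZXXX'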
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("indir", help="path to directory of input files", type=str)
    parser.add_argument("outdir", help="path to directory of database files", type=str)
    parser.add_argument("-t", "--title", help="the base of the title for the blastdb", type=str, default="blastdb")
    parser.add_argument("-p", "--partitions", help="number of files to partition database into", type=int, default=1)
    parser.add_argument("--minpar", help="use the partition size", type=bool)
    args = parser.parse_args()

    blastpath = None
    if 'BLASTPATH' in os.environ:
        blastpath = os.environ['BLASTPATH']
    if not validdir(blastpath, "Invalid $BLASTPATH"):
        return 1
    if not validdir(args.indir, "Input directory does not exist"):
        return 1

    infiles = glob.glob(os.path.join(args.indir, INFORMAT))
    if len(infiles) == 0:
        sys.stderr.write("No valid input files")
        return 1

    num_partitions = args.partitions
    if args.minpar:
        num_partitions = len(infiles)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    processes = []
    partitions = list(chunks(infiles, len(infiles) / num_partitions))
    for i, partition in zip(range(len(partitions)), partitions):
        processes.append(createdb(blastpath, i, partition, args.outdir, args.title))

    for proc in processes:
        out, err = proc.communicate()
        print out

    return 0
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        bibcodes = map(lambda a: a.strip(), args['bibcodes'])
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    # Split the list of bibcodes up in chunks, for parallel processing
    biblists = list(chunks(bibcodes, config.METRICS_CHUNK_SIZE))
    # Get precomputed metrics data, keyed on bibcode
    metrics_data = get_metrics_data(bibcodes=bibcodes)
    missing_bibcodes = filter(lambda a: a not in metrics_data.keys(), bibcodes)
    if len(missing_bibcodes) > 0:
        app.logger.error("Bibcodes found with missing metrics data: %s" % ",".join(missing_bibcodes))
    bibcodes = filter(lambda a: a not in missing_bibcodes, bibcodes)
    bibcodes_without_authnums = map(lambda b: b['_id'],
                                    filter(lambda a: a['author_num'] == 0, metrics_data.values()))
    if len(bibcodes_without_authnums):
        app.logger.error("Bibcodes found with author number equal to zero: %s" % ",".join(bibcodes_without_authnums))
    bibcodes = filter(lambda a: a not in bibcodes_without_authnums, bibcodes)
    # Get the number of citing papers
    Nciting = len(list(set(itertools.chain(*map(lambda a: a['citations'], metrics_data.values())))))
    # Nciting_ref refers to citations to the refereed papers in the set
    Nciting_ref = len(list(set(itertools.chain(*map(lambda b: b['citations'],
                                                    filter(lambda a: a['refereed'] == True, metrics_data.values()))))))
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes, metrics_data)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)
    return attr_list, Nciting, Nciting_ref
def _cochlear_trim_sai_marginals(filename_and_indexes):
    try:
        filename, norm_segstart, norm_segend, audio_id, NAP_detail = filename_and_indexes
        sai_video_filename = '{}_sai_video_{}'.format(filename, NAP_detail)

        if os.path.isfile('{}.npy'.format(sai_video_filename)):
            return sai_video_filename

        if NAP_detail == 'high':
            try:
                NAP = utils.csv_to_array(filename + 'cochlear' + NAP_detail)
            except:
                NAP = brain.cochlear(filename, stride=1, rate=44100, apply_filter=0,
                                     suffix='cochlear' + NAP_detail)
        if NAP_detail == 'low':
            try:
                NAP = utils.csv_to_array(filename + 'cochlear' + NAP_detail)
            except:
                NAP = brain.cochlear(filename, stride=IO.NAP_STRIDE, rate=IO.NAP_RATE,
                                     apply_filter=0, suffix='cochlear' + NAP_detail)

        # Seems to work best, in particular when they are all the same.
        num_channels = NAP.shape[1]
        input_segment_width = 2048
        sai_params = CreateSAIParams(num_channels=num_channels,
                                     input_segment_width=input_segment_width,
                                     trigger_window_width=input_segment_width,
                                     sai_width=1024)
        sai = pysai.SAI(sai_params)

        NAP = utils.trim_right(NAP[np.int(np.rint(NAP.shape[0] * norm_segstart)):np.int(np.rint(NAP.shape[0] * norm_segend))], threshold=.05)
        sai_video = [np.copy(sai.RunSegment(input_segment.T))
                     for input_segment in utils.chunks(NAP, input_segment_width)]
        del NAP
        np.save(sai_video_filename, np.array([sai_rectangles(frame) for frame in sai_video]))

        return sai_video_filename
    except:
        print utils.print_exception('Calculation SAI video failed for file {}, NAP detail {}'.format(filename, NAP_detail))
        return False
def process(self, process_id, date_str, app_ids):
    """
    print self.error_proxy_dict
    :param date_str:
    :param app_ids:
    :return:
    """
    print 'Started process, need to scrape {}'.format(len(app_ids))
    logger.info('Started process, need to scrape {}'.format(len(app_ids)))
    for batch_app_ids in chunks(app_ids, DEFAULT_BATCH_SIZE):
        for app_id in batch_app_ids:
            app_detail_key = DETAIL_SOURCE_KEY.format(date=date_str, market=self.market, app_id=app_id)
            if self.redis_service.exists(app_detail_key):
                continue
            content = self._scrape(app_id)
            if content:
                self._save(app_detail_key, content)
        garbage_number = gc.collect()
        print 'Garbage number:', garbage_number
    print 'Succeed process {}'.format(process_id)
    logger.info('Succeed process {}'.format(process_id))
def move_to_destination(cls, response, destination):
    # destination is a pymongo MongoDB object
    for collection, items in response.items():
        print('writing collection:', collection, len(items))
        for chunk in utils.chunks(50, items):
            destination[collection].insert_many(chunk)
def bt_nodes_info_from_raw_data(data):
    return [bt_contact_node(x) for x in chunks(data, 26)]
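# The 26-byte stride above matches BEP 5 "compact node info": a 20-byte node ID
# followed by a 4-byte IPv4 address and a 2-byte big-endian port. A hedged
# sketch of the per-node decode (bt_contact_node itself is not shown here, so
# this helper is hypothetical):
import socket
import struct

def decode_compact_node(raw):
    node_id = raw[:20]
    ip = socket.inet_ntoa(raw[20:24])
    port = struct.unpack('!H', raw[24:26])[0]
    return node_id, ip, port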
def test_chunks(self):
    '''Test if the chunks method behaves properly'''
    list = ['a', 'b', 'c', 'd']
    expected = [['a'], ['b'], ['c'], ['d']]
    self.assertEqual([x for x in chunks(list, 1)], expected)
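# A second, hypothetical check that is not part of the original suite: the
# len(chunk) guards in the extractor and dump examples above suggest this
# chunks() keeps a trailing piece shorter than the requested size rather than
# dropping it.
def test_chunks_uneven(self):
    '''Hypothetical: a short trailing chunk is kept, not discarded.'''
    data = ['a', 'b', 'c', 'd', 'e']
    expected = [['a', 'b'], ['c', 'd'], ['e']]
    self.assertEqual([x for x in chunks(data, 2)], expected)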
def get_codons(seq):
    seq = seq.upper()
    codons = ut.chunks(seq, 3)
    return codons
def from_obspy(cls, stream, params=None):
    try:
        import obspy
    except ModuleNotFoundError:
        print("Install obspy with `conda install -c conda-forge obspy`.")

    data = np.stack(t.data for t in stream.traces)
    if params is None:
        params = {}

    dt = params.get('dt', stream.binary_file_header.sample_interval_in_microseconds)

    # ndim param can force 2d or 3d data
    ndim = params.get('ndim', 0)
    if ndim:
        params.pop('ndim')

    # Make certain it winds up in seconds. Most likely 0.0005 to 0.008.
    while dt > 0.02:
        dt *= 0.001
    params['dt'] = dt

    # Since we have the headers, we can try to guess the geometry.
    threed = False

    # Get the sawtooth header field. In a perfect world, this only works for 3D.
    xlines = utils.get_pattern_from_stream(stream, patterns.sawtooth)
    if np.any(xlines) and (ndim != 2):
        threed = True
        nxlines = np.amax(xlines) - np.amin(xlines) + 1
        params['nxlines'] = params.get('nxlines') or nxlines
        params['xlines'] = params.get('xlines') or xlines
        params['dimensions'] = ['i', 'x', 't']
    else:
        xlines = utils.get_pattern_from_stream(stream, patterns.monotonic)
        if np.any(xlines):
            nxlines = np.amax(xlines) - np.amin(xlines) + 1
            params['nxlines'] = params.get('nxlines') or nxlines
            params['xlines'] = params.get('xlines') or xlines
        params['dimensions'] = ['i', 't']
        params['ninlines'] = 1

    if threed:
        inlines = utils.get_pattern_from_stream(stream, patterns.stairstep)
        if np.any(inlines):
            ninlines = np.amax(inlines) - np.amin(inlines) + 1
            params['ninlines'] = params.get('ninlines') or ninlines
            params['inlines'] = params.get('inlines') or inlines

    header = np.array(list(stream.textual_file_header.decode()))
    params['header'] = '\n'.join(c for c in utils.chunks(header, 80))

    headers = {
        'elevation': 'receiver_group_elevation',
        'fold': 'number_of_horizontally_stacked_traces_yielding_this_trace',
        'water_depth': 'water_depth_at_group',
    }
    for k, v in headers.items():
        params[k] = [t.header.__dict__[v] for t in stream.traces]

    return cls(data, params=params)
path_output.mkdir()
for folder in local_folders:
    shutil.rmtree(folder, ignore_errors=True)

#################### start of single url download ####################
args.url_file = url  # download() depends on the global-level args variable
month = extract_month(args.url_file)

# in case we are resuming from a previous run
completed_uids, state_fp, prev_cid = get_state(month, args.output_dir)

# URLs we haven't scraped yet (if first run, all URLs in file)
url_entries = load_urls(args.url_file, completed_uids, args.max_urls)

pool = mpl.Pool(args.n_procs)

# process one "chunk" of args.chunk_size URLs at a time
for i, chunk in enumerate(chunks(url_entries, args.chunk_size)):
    cid = prev_cid + i + 1

    print("Downloading chunk {}".format(cid))
    t1 = time.time()

    if args.timeout > 0:
        # imap as iterator allows .next() w/ timeout.
        # ordered version doesn't seem to work correctly.
        # for some reason, you CANNOT track j or chunk[j] in the loop,
        # so don't add anything else to the loop below!
        # confusingly, chunksize below is unrelated to our chunk_size
        chunk_iter = pool.imap_unordered(download, chunk, chunksize=1)
        cdata = []
        for j in range(len(chunk)):
            try:
def __call__(self, base, shape, nnear=None, majority=True, pickle_name=None):
    """
    For each query point in the base array, find the K nearest neighbors
    and calculate either the majority value or the inverse-weighted value
    for those neighbors.

    Keyword arguments:
    base -- output array (x, y)
    nnear -- number of neighbors to check
    majority -- boolean: whether to use the majority algorithm
    pickle -- boolean: save variables for pickling
    """
    # Set nearest neighbors to default value of 11 if not set.
    if nnear is None:
        nnear = 11
    if self.canCL and self.wantCL:
        # These values do not change from run to run.
        values_buf = cla.to_device(self.queue, self.values)
        tree_buf = cla.to_device(self.queue, self.tree)
        coords_buf = cla.to_device(self.queue, self.coords)
        lentree_arg = np.uint32(len(self.tree))
        nnear_arg = np.uint32(nnear)
        usemajority_arg = np.uint32(1 if majority else 0)
        # Calculate how many base elements can be evaluated per run.
        static_data = self.values.nbytes + self.tree.nbytes + self.coords.nbytes + \
            lentree_arg.nbytes + nnear_arg.nbytes + usemajority_arg.nbytes
        # Each base element is two float32s (8 bytes).
        bpe_single = 2 * 4
        # Each retval is one int32 (4 bytes).
        bpe_total = bpe_single + 4
        # Check both single and total limits for elems-per-slice.
        eps_single = [int(0.95 * device.max_mem_alloc_size / bpe_single) for device in self.devices]
        eps_total = [int((0.95 * device.global_mem_size - static_data) / bpe_total) for device in self.devices]
        elem_limits = [min(eps_single[x], eps_total[x]) for x in xrange(len(self.devices))]
        # For now, at least, do not create retval or chunk buffer here.
        results = []
        # NB: Only supporting one device for now.
        best_device = np.argmax(elem_limits)
        global_size = self.global_size[self.devices[best_device]]
        local_size = self.local_size[self.devices[best_device]]
        for chunk in chunks(base, elem_limits[best_device]):
            # Create retvals and chunk buffer here instead of above.
            lenchunk = len(chunk)
            retvals_arr = np.empty(lenchunk, dtype=np.int32)
            retvals_buf = cla.to_device(self.queue, retvals_arr)
            chunk_buf = cla.to_device(self.queue, chunk)
            lenchunk_arg = np.uint32(lenchunk)
            event = self.program.idt(self.queue, global_size, local_size,
                                     retvals_buf.data, values_buf.data,
                                     tree_buf.data, coords_buf.data,
                                     lentree_arg, chunk_buf.data,
                                     lenchunk_arg, nnear_arg, usemajority_arg)
            event.wait()
            # Copy retvals_buf to results.
            retvals_arr = retvals_buf.get()
            if results == []:
                results = retvals_arr.tolist()
            else:
                results += retvals_arr.tolist()
    else:
        # from invdisttree.py
        distances, indexes = self.tree.query(base, k=nnear)
        results = np.zeros((len(distances),) + np.shape(self.values[0]))
        jinterpol = 0
        for distance, index in zip(distances, indexes):
            if nnear == 1:
                wz = self.values[index]
            elif distance[0] < 1e-10:
                wz = self.values[index[0]]
            else:
                w = 1 / distance
                w /= np.sum(w)
                if majority:
                    majordict = dict([(x, 0) for x in self.values[index]])
                    for zval, wval in zip(self.values[index], w):
                        majordict[zval] += wval
                    wz = max(majordict, key=majordict.get)
                else:
                    wz = np.dot(w, self.values[index])
            results[jinterpol] = wz
            jinterpol += 1
    if pickle_name is not None:
        # Pickle variables for testing purposes.
        picklefilename = 'idt-%s-%d.pkl.gz' % (pickle_name, (1 if majority else 0))
        print 'Pickling to %s...' % picklefilename
        f = gzip.open(picklefilename, 'wb')
        pickle.dump(self.coords, f, -1)
        pickle.dump(self.values, f, -1)
        pickle.dump(base, f, -1)
        pickle.dump(shape, f, -1)
        pickle.dump(nnear, f, -1)
        pickle.dump(majority, f, -1)
        # pickle.dump(results, f, -1)
    return np.asarray(results, dtype=np.uint32).reshape(shape)
def train_network(features, labels, alpha, dump_location):
    features_1 = features
    labels_1 = labels
    features_2 = features.copy()
    labels_2 = labels.copy()

    model = confidence_network()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(10):
        index_1 = np.arange(len(features_1))
        random.shuffle(index_1)
        index_2 = np.arange(len(features_2))
        random.shuffle(index_2)

        features_1 = features_1[index_1]
        labels_1 = labels_1[index_1]
        features_2 = features_2[index_2]
        labels_2 = labels_2[index_2]

        total = 0
        correct_pred = 0

        for step, (f1, l1, f2, l2) in enumerate(zip(chunks(features_1), chunks(labels_1),
                                                    chunks(features_2), chunks(labels_2))):
            try:
                f1 = make_tensor(f1)
                f2 = make_tensor(f2)
                # norm_features = alpha * f1 + (1 - alpha) * f2
                # norm_features = make_tensor(norm_features)
                labels = make_tensor(np.vstack([l1, l2]), dtype=torch.long)

                optimizer.zero_grad()

                preds_1 = model(f1, f2, alpha)
                loss_1 = loss_norm(preds_1, labels, alpha, step)

                f_s_1, f_s_2, l_s = same_class_sample()
                f_s_1, f_s_2 = make_tensor(f_s_1), make_tensor(f_s_2)
                preds_2 = model(f_s_1, f_s_2, alpha)
                l_s = make_tensor(l_s, dtype=torch.long)
                loss_2 = criterion(preds_2, l_s)

                if epoch < 2:
                    loss = loss_1
                else:
                    loss = loss_1 + loss_2 / 10

                loss.backward()
                optimizer.step()

                _, predicted = torch.max(preds_2.data, 1)
                total += l_s.size(0)
                correct_pred += (predicted == l_s).sum().item()
            except ValueError:
                pass

            if step % 200 == 0:
                print("epoch {}, step {}, loss_1 {:.4f}, loss_2 {:.4f}".format(epoch, step, loss.data.item(), 0))

        print("++++++++++++++++")
        print("accuracy after {} epochs is {}".format(epoch, correct_pred / total))

    torch.save(model, dump_location)
    return model
if __name__ == "__main__":
    random.seed(config.SEED)

    if not os.path.isdir(config.DATA_AR_FOLDER):
        print('ERR: {} does not exist'.format(config.DATA_AR_FOLDER))

    cik_folders = [os.path.join(config.DATA_AR_FOLDER, d)
                   for d in os.listdir(config.DATA_AR_FOLDER)
                   if os.path.isdir(os.path.join(config.DATA_AR_FOLDER, d))]
    random.shuffle(cik_folders)  # Better separate work load

    if config.MULTITHREADING:
        folders = utils.chunks(cik_folders, 1 + int(len(cik_folders) / config.NUM_CORES))
        procs = []
        for i in range(config.NUM_CORES):
            procs.append(Process(target=process_folder_multithread, args=(folders[i],)))
            procs[-1].start()
        for p in procs:
            p.join()
    else:
        connection = utils.create_mysql_connection()
        for folder in tqdm.tqdm(cik_folders, desc="Extract data from annual reports"):
            process_folder(folder, connection)
    return training_data, len(words)


with open("data.txt") as f:
    content = f.read()

window = 6
time_steps = window - 1
num_hidden = 512
num_input = 1
batch_size = 100
iteration = 250

training_data, num_classes = data_sampling(content, window=window)

# Build the batches:
batches = chunks(training_data, batch_size)

# RNN output node weights and biases
weights = {'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))}
biases = {'out': tf.Variable(tf.random_normal([num_classes]))}

# tf graph input
X = tf.placeholder("float", [None, time_steps, num_input], name='X')
Y = tf.placeholder("float", [None, num_classes])


def RNN(x, weights, biases):
    # Unstack to get a list of 'time_steps' tensors, each tensor has shape (batch_size, num_input)
    x = tf.unstack(x, time_steps, 1)
])
args = parser.parse_args()
curr_batch_order = args.batch_idx

d = Dataset(dataset=args.dataset, dataset_type=args.dataset_type, path=args.path)
pairs = d.load_dataset()

# Create the PCA file if it does not exist
filename = os.path.join(args.path, 'pca',
                        'pca_%s_%s_%s.pkl' % (args.dataset, args.dataset_type, args.emb))
if not os.path.isfile(filename):
    print('PCA file make!')
    pca_save(pairs, args.emb, args.bert_encode_type, args.encoder_path,
             filename, args.pca_components)

if args.batch:
    batches = chunks(pairs, args.batch_size)
    for i in range(curr_batch_order + 1):
        batch = next(batches)
else:
    batch = pairs

print("batch from %d to %d" % (curr_batch_order * args.batch_size,
                               curr_batch_order * args.batch_size + args.batch_size))

# Load (or save) the encoder file
encoder_file = os.path.join(args.path, 'bert',
                            '{}.{}.{}.{}.pkl'.format(args.dataset, args.dataset_type, args.emb, args.batch_idx))
if not os.path.isfile(encoder_file):
    e = Encoder(path=args.encoder_path, emb=args.emb, bert_encode_type=args.bert_encode_type)
    data = e.encode(pairs=batch)
def approximate_mono_image(img, num_coeffs=None, scale_factor=1):
    """
    Approximates a single channel image by using only the first coefficients of the DCT.

    First, the image is chopped into 8x8 pixels patches and the DCT is applied to each patch.
    Then, if num_coeffs is provided, only the first K DCT coefficients are kept.
    If not, all the elements are quantized using the JPEG quantization matrix and the scale_factor.
    Finally, the resulting coefficients are used to approximate the original patches with the IDCT,
    and the image is reconstructed back again from these patches.

    :param img: Image to be approximated.
    :param num_coeffs: Number of DCT coefficients to use.
    :param scale_factor: Scale factor to use in the quantization step.
    :return: The approximated image.
    """
    # prevent against multiple-channel images
    if len(img.shape) != 2:
        raise ValueError('Input image must be a single channel 2D array')

    # shape of image
    height = img.shape[0]
    width = img.shape[1]
    if (height % 8 != 0) or (width % 8 != 0):
        raise ValueError("Image dimensions (%s, %s) must be multiple of 8" % (height, width))

    # split into 8 x 8 pixels blocks
    img_blocks = [img[j:j + 8, i:i + 8]
                  for (j, i) in itertools.product(xrange(0, height, 8),
                                                  xrange(0, width, 8))]

    # DCT transform every 8x8 block
    dct_blocks = [cv.dct(img_block) for img_block in img_blocks]

    if num_coeffs is not None:
        # keep only the first K DCT coefficients of every block
        reduced_dct_coeffs = [utils.zig_zag(dct_block, num_coeffs) for dct_block in dct_blocks]
    else:
        # quantize all the DCT coefficients using the quantization matrix and the scaling factor
        reduced_dct_coeffs = [np.round(dct_block / (utils.jpeg_quantiz_matrix * scale_factor))
                              for dct_block in dct_blocks]

        # and get the original coefficients back
        reduced_dct_coeffs = [reduced_dct_coeff * (utils.jpeg_quantiz_matrix * scale_factor)
                              for reduced_dct_coeff in reduced_dct_coeffs]

    # IDCT of every block
    rec_img_blocks = [cv.idct(coeff_block) for coeff_block in reduced_dct_coeffs]

    # reshape the reconstructed image blocks
    rec_img = []
    for chunk_row_blocks in utils.chunks(rec_img_blocks, width / 8):
        for row_block_num in xrange(8):
            for block in chunk_row_blocks:
                rec_img.extend(block[row_block_num])
    rec_img = np.array(rec_img).reshape(height, width)

    return rec_img
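# Hedged usage sketch for approximate_mono_image() above. The file name is
# hypothetical and cv2 stands in for the `cv` module the function imports;
# the crop simply satisfies the multiple-of-8 check the function enforces.
import cv2
import numpy as np

img = cv2.imread('photo.png', cv2.IMREAD_GRAYSCALE)
img = img[:img.shape[0] // 8 * 8, :img.shape[1] // 8 * 8].astype(np.float32)
approx = approximate_mono_image(img, num_coeffs=10)
cv2.imwrite('photo_dct10.png', np.clip(approx, 0, 255).astype(np.uint8))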