def loadfile(dfile, linect, dformat):
    '''Load linect lines from labeled data in dfile according to dformat.

    Return a list of DataPoint instances.

    dfile   -- file name
               -- one data point per line
               -- values are separated by commas
               -- final value is the label
    dformat -- data format list
               -- tuples with a function to coerce the value and a function
                  which returns false if the value is invalid
               eg:
               >>> fmt = 48 * [(float, lambda x: 0 <= x <= 100)] + \
                          6 * [(int, lambda x: 0 < x)] + \
                          1 * [(int, lambda x: x == 1 or x == 0)]
    '''
    print 'Loading "{}"'.format(dfile)
    data = []
    with open(dfile, mode='rb') as fd:
        with pg.Progress(linect, 2, pg.bar('Lines', 32)) as pr:
            for line in fd:
                # clean the data & create the datapoint
                cleandata = [loadfeature(raw.strip(), fmt) \
                             for raw, fmt in zip(line.split(','), dformat)]
                data.append(DataPoint(cleandata[:-1], cleandata[-1]))
                # indicate progress
                try:
                    pr.next()
                except TypeError:
                    break
    return data
def bulk_insert(db, label, data, into):
    label = f"Creating {len(data)} {label}"
    pbar = bar(label, len(data))
    while data:
        chunk = data[:1000]
        data = data[1000:]
        db.bulk_insert_mappings(into, chunk)
        db.commit()
        pbar.next(len(chunk))
    pbar.finish()
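# Hedged usage sketch for bulk_insert() above, not part of the original module:
# it assumes `db` is a SQLAlchemy Session, `into` is a mapped class, and `bar`
# is whatever progress-bar factory the surrounding project provides.
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import declarative_base, Session

Base = declarative_base()

class Item(Base):
    # hypothetical mapped class used only for this illustration
    __tablename__ = 'items'
    id = Column(Integer, primary_key=True)
    name = Column(String)

engine = create_engine('sqlite:///:memory:')
Base.metadata.create_all(engine)

with Session(engine) as session:
    rows = [{'name': 'item-%d' % i} for i in range(2500)]
    # inserts three chunks (1000, 1000, 500) and advances the bar once per chunk
    bulk_insert(session, 'items', rows, Item)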
def run(X, y, C, kernel):
    n = len(X)
    y_pred = np.zeros(n, int)
    for i in range(n):
        all_except_i = list(range(i)) + list(range(i+1, n))
        X_train = X[all_except_i]
        y_train = y[all_except_i]
        alg = SVC(C=C, kernel=kernel)
        alg.fit(X_train, y_train)
        loner = X[i]
        loner = loner.reshape(1, -1)
        y_pred[i] = alg.predict(loner)
        #print (y_pred[i])
        #progress bar
        progress.bar(i+1, n, "Performing Leave One Out Cross Validation")
    err = np.mean(y != y_pred)
    print("LEAVE ONE OUT: err=", err)
    return y_pred
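# Hedged usage sketch for run() above, not part of the original script: it
# assumes numpy, scikit-learn, and the same `progress` helper are importable.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
# leave-one-out fits one SVC per sample, so keep the dataset small
y_pred = run(X, y, C=1.0, kernel='rbf')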
def from_dataset(cls, dataset):
    stumps = []
    print 'Making distinct stumps and caching mistakes.'
    maxstumps = len(dataset[0].features) * (len(dataset) - 1 + 2)
    with pg.Progress(maxstumps, 2, pg.bar('Stumps', 32)) as p:
        # generate pairs of (index, feature vector)
        for i, fv in enumerate(it.izip(*(dp.features for dp in dataset))):
            for t in cls.thresholds(fv):
                s = i, t, Stump.mistakes((i, t, []), dataset)
                stumps.append(s)
                p.next()
    return stumps
def main():
    """Main loop"""
    start_time = time.time()
    options, arg = interface()
    motd()
    print "Started: ", time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(start_time))
    conf = ConfigParser.ConfigParser()
    conf.read(options.conf)
    # build our configuration
    params = Parameters(conf)
    conn = MySQLdb.connect(user=params.user, passwd=params.pwd, db=params.db)
    cur = conn.cursor()
    # crank out a new table for the data
    createSeqTable(cur)
    conn.commit()
    seqcount = sequenceCount(conf.get("Input", "sequence"))
    sequence = QualityIO.PairedFastaQualIterator(
        open(conf.get("Input", "sequence"), "rU"),
        open(conf.get("Input", "qual"), "rU")
    )
    # pdb.set_trace()
    if conf.getboolean("Multiprocessing", "MULTIPROCESSING"):
        # get num processors
        n_procs = conf.get("Multiprocessing", "processors")
        if n_procs == "Auto":
            # we'll use x-1 cores (where x = avail. cores)
            n_procs = multiprocessing.cpu_count() - 1
        else:
            n_procs = int(n_procs)
        print "Multiprocessing. Number of processors = ", n_procs
        # to test with fewer sequences
        # count = 0
        try:
            threads = []
            pb = progress.bar(0, seqcount, 60)
            pb_inc = 0
            while sequence:
                if len(threads) < n_procs:
                    p = multiprocessing.Process(target=linkerWorker,
                                                args=(sequence.next(), params))
                    p.start()
                    threads.append(p)
                    if (pb_inc + 1) % 1000 == 0:
                        pb.__call__(pb_inc + 1)
                    elif pb_inc + 1 == seqcount:
                        pb.__call__(pb_inc + 1)
                    pb_inc += 1
                else:
                    for t in threads:
                        if not t.is_alive():
                            threads.remove(t)
        except StopIteration:
            pass
    else:
        print "Not using multiprocessing"
        count = 0
        try:
            pb = progress.bar(0, seqcount, 60)
            pb_inc = 0
            # while count < 1000:
            while sequence:
                # count += 1
                linkerWorker(sequence.next(), params)
                if (pb_inc + 1) % 1000 == 0:
                    pb.__call__(pb_inc + 1)
                elif pb_inc + 1 == seqcount:
                    pb.__call__(pb_inc + 1)
                pb_inc += 1
        except StopIteration:
            pass
    print "\n"
    cur.close()
    conn.close()
    end_time = time.time()
    print "Ended: ", time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(end_time))
    print "\nTime for execution: ", (end_time - start_time) / 60, "minutes"
test_x = tests[index][0]
test_y = tests[index][-1][CFGS["DATA"][y_tag]]

# dimensionality reduction
if dec_flag:
    decomodel, train_x_ = dimensionality_reduction(train_x, n_dim=6,
                                                   method="TruncatedSVD")
    test_x_ = decomodel.transform(test_x)
else:
    train_x_, test_x_ = train_x, test_x

# sklearn model
best_score = 0.0
best_model = None
fit_time = 0.0
pred_time = 0.0
for regressor in bar(regressors):
    with Timer() as t:
        model = regressor()
        model.fit(train_x_, train_y)
        fit_time = t.toc()
    y_true = test_y.to_numpy().astype(float)
    with Timer() as t:
        y_pred = model.predict(test_x_)
        pred_time = t.toc()
    accuracy = calc_accuracy(y_true, y_pred, display=True)
    result[y_tag]["info"][regressor] = {
        "r2_score": accuracy["score"]["r2_score"],
        "fit_time": fit_time,
        "pred_time": pred_time,
    }
value = ''
for v in np.reshape(H_ba, (9)):
    value += str(v) + ','
f_ba.write(idxn + value + '\n')

value = ''
for v in np.reshape(delta_AB, (8)):
    value += str(v) + ','
fd_ab.write(idxn + value + '\n')

value = ''
for v in np.reshape(delta_BA, (8)):
    value += str(v) + ','
fd_ba.write(idxn + value + '\n')

save_file_name = str(idx).zfill(8) + '.png'
A_save_path = os.path.join(Image_A_dir, save_file_name)
B_save_path = os.path.join(Image_B_dir, save_file_name)
cv2.imwrite(A_save_path, img_A)
cv2.imwrite(B_save_path, img_B)

bar(0, idx, num)

f_ab.close()
f_ba.close()
fd_ab.close()
fd_ba.close()
fidxn.close()
#%%
def bar(rn, fill='.'):
    import time
    loading = '\b' * rn          # for strings, * is the repeat operator
    rest = fill * int(100 - rn)
    # this loop replaces each dot with a hash!
    print('[\r%0s%1s] loading at %2d percent!' % (loading, rest, rn), end='\n')


if __name__ == '__main__':
    for rn in range(1, 101):
        bar(rn)


def get_commands(args):
    if len(args) < 3:
        return
    ret = {}
    begin = 2
    while begin < len(args):
        # strip a leading '--' or '-' from the flag and pair it with its value
        key = str(args[begin]).replace('--', '', 1).replace('-', '', 1)
        ret[key] = args[begin + 1]
        begin += 2
    return ret


from progress.bar import *
from progress.spinner import *
from progress.counter import *


class progress_types:
    __all__ = ['bar', 'charging_bar', 'filling_squares_bar',
               'filling_circles_bar', 'incremental_bar', 'pixel_bar',
               'shady_bar', 'spinner', 'pie_spinner', 'moon_spinner',
               'line_spinner', 'pixel_spinner', 'counter', 'countdown',
               'stack', 'pie']
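# Hedged usage sketch for get_commands() above (hypothetical argv): flags are
# read as key/value pairs starting at argv[2].
argv = ['prog.py', 'subcommand', '--output', 'out.txt', '-n', '3']
print(get_commands(argv))   # -> {'output': 'out.txt', 'n': '3'}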
def main():
    """Main loop"""
    start_time = time.time()
    options, arg = interface()
    motd()
    print "Started: ", time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(start_time))
    conf = ConfigParser.ConfigParser()
    conf.read(options.conf)
    conn = MySQLdb.connect(
        user=conf.get("Database", "USER"),
        passwd=conf.get("Database", "PASSWORD"),
        db=conf.get("Database", "DATABASE")
    )
    cur = conn.cursor()
    qualTrim = conf.getboolean("Steps", "TRIM")
    qual = conf.getint("Qual", "MIN_SCORE")
    linkerTrim = conf.getboolean("Steps", "LINKERTRIM")
    if qualTrim and not linkerTrim:
        createQualSeqTable(cur)
        conn.commit()
    elif qualTrim and linkerTrim:
        mid, reverse_mid = dict(conf.items("MID")), reverse(conf.items("MID"))
        linkers, reverse_linkers = dict(conf.items("Linker")), reverse(conf.items("Linker"))
        # TODO: Add levenshtein distance script to automagically determine
        # distance
        reverse_mid[None] = None
        reverse_linkers[None] = None
        clust = conf.items("Clusters")
        # build tag library 1X
        tags = tagLibrary(mid, linkers, clust)
        all_tags, all_tags_regex = allPossibleTags(mid, linkers, clust)
        # crank out a new table for the data
        createSeqTable(cur)
        conn.commit()
    seqcount = sequenceCount(conf.get("Input", "sequence"))
    record = QualityIO.PairedFastaQualIterator(
        open(conf.get("Input", "sequence"), "rU"),
        open(conf.get("Input", "qual"), "rU")
    )
    # pdb.set_trace()
    if conf.getboolean("Multiprocessing", "MULTIPROCESSING"):
        # get num processors
        n_procs = conf.get("Multiprocessing", "processors")
        if n_procs == "Auto":
            # TODO: change this?
            # we'll start 2X-1 threads (X = processors).
            n_procs = multiprocessing.cpu_count() - 1
        else:
            n_procs = int(n_procs)
        print "Multiprocessing. Number of processors = ", n_procs
        # to test with fewer sequences
        # count = 0
        try:
            threads = []
            pb = progress.bar(0, seqcount, 60)
            pb_inc = 0
            while record:
                if len(threads) < n_procs:
                    if qualTrim and not linkerTrim:
                        p = multiprocessing.Process(target=qualOnlyWorker,
                                                    args=(record.next(), qual, conf))
                    elif qualTrim and linkerTrim:
                        p = multiprocessing.Process(
                            target=linkerWorker,
                            args=(
                                record.next(),
                                qual,
                                tags,
                                all_tags,
                                all_tags_regex,
                                reverse_mid,
                                reverse_linkers,
                                conf,
                            ),
                        )
                    p.start()
                    threads.append(p)
                    if (pb_inc + 1) % 1000 == 0:
                        pb.__call__(pb_inc + 1)
                    elif pb_inc + 1 == seqcount:
                        pb.__call__(pb_inc + 1)
                    pb_inc += 1
                else:
                    for t in threads:
                        if not t.is_alive():
                            threads.remove(t)
        except StopIteration:
            pass
    else:
        print "Not using multiprocessing"
        count = 0
        try:
            pb = progress.bar(0, seqcount, 60)
            pb_inc = 0
            # while count < 1000:
            while record:
                # count += 1
                if qualTrim and not linkerTrim:
                    qualOnlyWorker(record.next(), qual, conf)
                elif qualTrim and linkerTrim:
                    linkerWorker(
                        record.next(), qual, tags, all_tags, all_tags_regex,
                        reverse_mid, reverse_linkers, conf
                    )
                if (pb_inc + 1) % 1000 == 0:
                    pb.__call__(pb_inc + 1)
                elif pb_inc + 1 == seqcount:
                    pb.__call__(pb_inc + 1)
                pb_inc += 1
        except StopIteration:
            pass
    print "\n"
    cur.close()
    conn.close()
    end_time = time.time()
    print "Ended: ", time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(end_time))
    print "\nTime for execution: ", (end_time - start_time) / 60, "minutes"
def main():
    start_time = time.time()
    print 'Started: ', time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(start_time))
    options, arg = interface()
    conf = ConfigParser.ConfigParser()
    conf.read(options.conf)
    have_sequence_table = conf.getboolean('MicrosatelliteParameters', 'HaveSequenceTable')
    db = (conf.get('Database', 'USER'),
          conf.get('Database', 'PASSWORD'),
          conf.get('Database', 'DATABASE')
          )
    m_processing = conf.getboolean('Multiprocessing', 'MULTIPROCESSING')
    num_procs = conf.get('Multiprocessing', 'processors')
    fasta_engine = conf.get('MicrosatelliteParameters', 'FastaEngine').lower()
    try:
        data_type = conf.get('Input', 'Type')
    except:
        data_type = 'fasta'
    # build our configuration
    conn = MySQLdb.connect(user=db[0], passwd=db[1], db=db[2])
    cur = conn.cursor()
    # get groups of 1000 sequences, package them, send them to multiprocessing
    # for primer design
    db_chunk = []
    # get count of total combined msats in the table
    cur.execute('''SELECT sequence.name, combined.sequence_id, combined.id,
        combined.start, combined.end FROM sequence, combined
        where combined.sequence_id = sequence.id''')
    sequence_ids = cur.fetchall()
    # split our msats into chunks or blocks of 1000 reads, one of which we
    # will sequentially pass to each processing core across the set of reads
    if len(sequence_ids) > 1000:
        for span in range(0, len(sequence_ids), 1000):
            db_chunk.append(sequence_ids[span:span + 1000])
    db_chunk_len = float(len(db_chunk))
    db_chunk = iter(db_chunk)
    data = Sequence(input=conf.get('Input', 'sequence'),
                    engine=fasta_engine,
                    data_type=data_type)
    # we only need a single instance of our primer design settings
    settings = p3wrapr.primer.Settings()
    settings.basic()
    # ensure that we get pretty different primers for each set of X designed
    # => offset by 10 bp. this makes primer3 somewhat slower.
    settings.params['PRIMER_MIN_THREE_PRIME_DISTANCE'] = 10
    # override the default location for the mispriming library
    settings.params['PRIMER_MISPRIMING_LIBRARY'] = '/Users/bcf/Bin/misprime_lib_weight'
    # we only need a single instance of our tag settings
    tag_settings = p3wrapr.primer.Settings()
    tag_settings.reduced(PRIMER_PICK_ANYWAY=1)
    # create primers and tagged primers table
    create_primers_table(cur)
    create_tagged_primers_table(cur)
    if m_processing:
        if num_procs == 'Auto':
            num_procs = multiprocessing.cpu_count() - 2
        else:
            num_procs = int(num_procs)
        print 'Multiprocessing. Number of processors = %s\n' % num_procs
        threads = []
        # access the data on sequence by sequence basis to avoid reading the
        # entire table contents into memory
        pb = progress.bar(0, db_chunk_len, 60)
        pb_inc = 0
        try:
            while db_chunk:
                if len(threads) < num_procs:
                    container = get_sequence_for_chunk(data, db_chunk.next())
                    p = multiprocessing.Process(target=worker,
                                                args=(db, container, settings, tag_settings))
                    p.start()
                    threads.append(p)
                    if (pb_inc + 1) % 1 == 0:
                        pb.__call__(pb_inc + 1)
                    elif pb_inc + 1 == db_chunk_len:
                        pb.__call__(pb_inc + 1)
                    pb_inc += 1
                else:
                    for t in threads:
                        if not t.is_alive():
                            threads.remove(t)
        except StopIteration:
            pass
    # for single processing
    else:
        try:
            while db_chunk:
                if have_sequence_table:
                    pass
                else:
                    container = get_sequence_for_chunk(data, db_chunk.next())
                worker(db, container, settings, tag_settings)
        except StopIteration:
            pass
    print '\n'
    cur.close()
    conn.close()
    end_time = time.time()
    print 'Ended: ', time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(end_time))
    print '\nTime for execution: ', (end_time - start_time) / 60, 'minutes'
             1 * [(int, lambda x: 0 <= x)] + \
             1 * [(int, lambda x: x == 1 or x == 0)]

#
TABLE_ = '+------------+------------+------------+------------+------------+------------+------------+'
TABLEH = '| Test Fold  | Test Ct    | Train Ct   | Operat Pnt | FP Rate    | FN Rate    | Error Rate |'
TABLE  = '| {: ^10} | {: >10} | {: >10} | {: <10.4f} | {: <10.8f} | {: <10.8f} | {: <10.8f} |'

#
FOLDCOUNT = 10
ROCFOLD = 0
folds = [[] for i in xrange(FOLDCOUNT)]
k = 0   # kurrent fold

#
print
print 'Loading "{}"'.format(DATAFILE)
with open(DATAFILE, mode='rb') as fd:
    with pg.Progress(4601, timeout=2, callback=pg.bar('Loading', 32)) as pr:
        for line in fd:
            # clean the data & create the datapoint
            cleandata = [load(raw.strip(), fmt)
                         for raw, fmt in zip(line.split(','), DATAFORMAT)]
            datapoint = {'features': cleandata[:-1], 'label': cleandata[-1]}
            # add to the current fold & switch to the next fold
            folds[k].append(datapoint)
            k = (k + 1) % FOLDCOUNT
            # indicate progress
            pr.next()

#
results = collections.defaultdict(list)
for name, model in [(m.__name__, m.model) for m in [bernoulli, gaussian, histogram]]:
    print
    print '10-fold cross-validation for {} Naive Bayes'.format(name.capitalize())
    with pg.Progress(FOLDCOUNT, timeout=4, callback=pg.bar(name.capitalize(), 32)) as pr:
def main():
    start_time = time.time()
    options, arg = interface()
    motd()
    print 'Started: ', time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(start_time))
    conf = ConfigParser.ConfigParser()
    conf.read(options.conf)
    # =============================================
    # = Setup additional configuration parameters =
    # =============================================
    db = (conf.get('Database', 'USER'),
          conf.get('Database', 'PASSWORD'),
          conf.get('Database', 'DATABASE')
          )
    have_sequence_table = conf.getboolean('MicrosatelliteParameters', 'HaveSequenceTable')
    fasta_engine = conf.get('MicrosatelliteParameters', 'FastaEngine').lower()
    try:
        data_type = conf.get('Input', 'Type')
    except:
        data_type = 'fasta'
    combine_loci = conf.getboolean('MicrosatelliteParameters', 'CombineLoci')
    combine_loci_dist = conf.getint('MicrosatelliteParameters', 'CombineLociDist')
    m_processing = conf.getboolean('Multiprocessing', 'MULTIPROCESSING')
    get_num_procs = conf.get('Multiprocessing', 'processors')
    conn = MySQLdb.connect(user=db[0], passwd=db[1], db=db[2])
    cur = conn.cursor()
    # Drop old tables
    drop_old_tables(cur, have_sequence_table)
    if have_sequence_table:
        data = Sequence(engine='mysql', cursor=cur)
    else:
        # create a quasi-sequence table
        createSequenceTable(cur)
        # get our data
        data = Sequence(engine=fasta_engine,
                        input=conf.get('Input', 'sequence'),
                        data_type=data_type
                        )
        cur.executemany('''INSERT INTO sequence (id, name) VALUES (%s, %s)''',
                        data.db_values)
    # create our msat table
    createMaskTableWithForeign(cur)
    # create the combined msat table
    if combine_loci:
        createCombinedLociWithForeign(cur)
    conn.commit()
    scan_type = conf.get('MicrosatelliteParameters', 'ScanType')
    motifs = motifCollection(min_length=[10, 6, 4, 4, 4, 4], scan_type=scan_type, \
                             perfect=True)
    if m_processing:
        # get num processors
        n_procs = get_num_procs
        if n_procs == 'Auto':
            n_procs = multiprocessing.cpu_count() - 2
        else:
            n_procs = int(n_procs)
        print 'Multiprocessing. Number of processors = %s\n' % n_procs
        # to test with fewer sequences
        #count = 0
        threads = []
        # access the data on sequence by sequence basis to avoid reading the
        # entire table contents into memory
        pb = progress.bar(0, data.readcount, 60)
        pb_inc = 0
        try:
            while data:
                if len(threads) < n_procs:
                    if data.engine == 'mysql':
                        # convert BLOB back to sequence record
                        record = data.read.next()
                        iden = record[0]
                        record = cPickle.loads(record[1])
                    elif data.engine == 'pyfasta' or data.engine == 'biopython':
                        iden, chromo = data.read.next()
                        record = SequenceWrapper(iden, data.fasta[chromo])
                    elif data.engine == 'twobit':
                        iden, chromo = data.read.next()
                        record = SequenceWrapper(iden, data.fasta[chromo][:])
                    p = multiprocessing.Process(target=worker, args=(
                        iden, record, motifs, db, have_sequence_table,
                        combine_loci, combine_loci_dist)
                        )
                    p.start()
                    threads.append(p)
                    if (pb_inc + 1) % 1000 == 0:
                        pb.__call__(pb_inc + 1)
                    elif pb_inc + 1 == data.readcount:
                        pb.__call__(pb_inc + 1)
                    pb_inc += 1
                else:
                    for t in threads:
                        if not t.is_alive():
                            threads.remove(t)
        except StopIteration:
            pass
    else:
        print 'Not using multiprocessing\n'
        # access the data on sequence by sequence basis to avoid
        # reading the entire table contents into memory
        pb = progress.bar(0, data.readcount, 60)
        pb_inc = 0
        #pdb.set_trace()
        try:
            #pdb.set_trace()
            while data:
                if data.engine == 'mysql':
                    # convert BLOB back to sequence record
                    record = data.read.next()
                    iden = record[0]
                    record = cPickle.loads(record[1])
                elif data.engine == 'pyfasta':
                    iden, chromo = data.read.next()
                    record = SequenceWrapper(iden, data.fasta[chromo])
                elif data.engine == 'biopython':
                    iden, chromo = data.read.next()
                    record = data.fasta[chromo]
                elif data.engine == 'twobit':
                    iden, chromo = data.read.next()
                    record = SequenceWrapper(iden, data.fasta[chromo][:])
                #pdb.set_trace()
                worker(iden, record, motifs, db, have_sequence_table,
                       combine_loci, combine_loci_dist)
                row = cur.fetchone()
                if (pb_inc + 1) % 1000 == 0:
                    pb.__call__(pb_inc + 1)
                elif pb_inc + 1 == data.readcount:
                    pb.__call__(pb_inc + 1)
                pb_inc += 1
        except StopIteration:
            pass
    print '\n'
    cur.close()
    conn.close()
    end_time = time.time()
    print 'Ended: ', time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(end_time))
    print '\nTime for execution: ', (end_time - start_time) / 60, 'minutes'