def parse_genebank_file(self, filepath, filename):
    logger.debug("Parse file {filename}".format(filename=filename))
    genebankid = filename.split("_", 2)
    genebankid = genebankid[0] + "_" + genebankid[1]
    f = zopen(filepath, "r")
    refseqid = f.readline().split(b" ")[0].lstrip(b">")
    f.close()
    self.refseqid_to_GCF[refseqid] = genebankid
    return
def parse_genomeid2taxid(self, genomes_path, annotation_file):
    '''To allow NCBI databases to be built from scratch, the sequence names need to be
    stored in the database. This function parses the accession2taxid file from NCBI;
    to speed things up and reduce the amount of stored data, only sequences present
    in the input genomes_path are fetched.
    '''
    logger.info("Parsing ncbi accession2taxid, genome_path: {dir}".format(dir=genomes_path))
    self.refseqid_to_GCF = {}
    for root, dirs, files in os.walk(genomes_path, followlinks=True):
        for filename in files:
            if filename.strip(".gz").endswith(".fna"):
                filepath = os.path.join(root, filename)
                self.parse_genebank_file(filepath, filename)
    logger.info("genomes folder read, {n} sequence files found".format(n=len(self.refseqid_to_GCF)))
    if not annotation_file.endswith("accession2taxid.gz"):
        raise TypeError("The supplied annotation file does not seem to be the ncbi nucl_gb.accession2taxid.gz")
    annotated_genome = set()
    try:
        with zopen(annotation_file, "r") as f:
            headers = f.readline().split(b"\t")
            for row in f:
                if row.strip() != b"":  ## If there are trailing empty lines in the file
                    if len(row.split(b"\t")) > 2:
                        try:
                            refseqid, taxid = row.split(b"\t")[1:3]
                        except:
                            logger.info(row)
                            logger.info(row.split(b"\t"))
                            if len(annotated_genome) > 0:
                                logger.info("Potential error in last row?")
                            else:
                                logger.info("Error on first line in annotation file, check format!")
                        try:
                            genebankid = self.refseqid_to_GCF[refseqid]
                            self.database.add_genome(genome=genebankid, _id=taxid.decode("utf-8"))
                            annotated_genome.add(refseqid)
                        except KeyError:
                            pass
            self.database.commit()
    except zlib.error as e:
        logger.info("Error in annotation file {e}".format(e=e))
    missing = set(self.refseqid_to_GCF.keys()) - annotated_genome
    missing = [self.refseqid_to_GCF[m] for m in missing]  ## Translate to GCF ids
    if logging.root.level <= 20:  ## Equal to --verbose
        logger.info("Printing non-added genome ids (GCF) to ./FlexTaxD.not_added")
        self.write_missing(missing)
        logger.debug(missing)  ## If debug, also print genomes to terminal
    logger.info("Genomes not matching any annotation {len}".format(len=len(missing)))
    return missing
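# A minimal standalone sketch of the row handling above, assuming the standard
# four-column layout of NCBI's nucl_gb.accession2taxid.gz (accession,
# accession.version, taxid, gi); the sample row below is made up.
sample_row = b"NZ_CP000000\tNZ_CP000000.1\t1280\t123456789\n"
columns = sample_row.split(b"\t")
refseqid, taxid = columns[1:3]              # accession.version and taxid, as in the parser
print(refseqid, taxid.decode("utf-8"))      # b'NZ_CP000000.1' 1280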
db['burn'] = autoburn(chisq)

# re-flag data that we had interpolated over
yStar[bad] = nan
errStar[bad] = nan

db['errStar'] = errStar
db['xStar'] = xStar
db['wave'] = x
db['yStar'] = yStar
db['model'] = mdl
db['transmission'] = trans
db['BDmodel'] = bdspec
db['sysUnc'] = sysUnc
db['Z'] = Z
db['ZP'] = ZP

fp = zopen(outdir + '/' + sourceName + '-' + order + '-' + suffix +
           '-Results.pkl.gz', 'w')
dump(db, fp)
fp.close()

# plot the model
p = Z[-1]
print 'p = ', p
x = xgridFromAnchors(len(yStar), p)
m = model(p)
clf()
left, width = 0.1, 0.8
bottom0, height0 = 0.1, 0.3
bottom1, height1 = bottom0 + height0, 0.9 - bottom0 - height0
spect = axes([left, bottom1, width, height1])
resid = axes([left, bottom0, width, height0])
def main():
    from time import time
    from sys import argv
    import csv

    config_file = argv[1]
    configs = []
    with open(config_file, "r") as config_file_:
        for line in config_file_:
            configs.append(line)

    available_models = {
        "FormTwo": FormTwo.FormTwo,
        "DualLongFormTwo": FormTwo.DualLongFormTwo,
        "DualShortFormTwo": FormTwo.DualShortFormTwo,
        "MixedFormTwo": FormTwo.MixedFormTwo
    }

    lex_file_name, pos_file_name, tar_file_name, max_length, batch_size, epochs, \
        hp_lex, chkpt_dir, architecture, delimitter, encoding = make_global_vars(configs)

    try:
        architecture = available_models[architecture]
    except KeyError as kE:
        print(kE)
        print()
        print("##############################################################\n")
        print("invalid architecture name. Please choose one of the following:")
        print(list(available_models.keys()))
        print()
        print("##############################################################\n")
        exit(1)

    lex_input = zopen(lex_file_name, 'rb')
    pos_input = zopen(pos_file_name, 'rb')
    tar_input = zopen(tar_file_name, 'rb')

    # input_files = [join(input_dir, f_) for f_ in listdir(input_dir) if isfile(join(input_dir, f_))]
    #
    # with open(pos_tok_file, "rb") as dict_file:
    #     pos_tokeniser = pickle.load(dict_file)
    # with open(lex_tok_file, "rb") as dict_file:
    #     lex_tokeniser = pickle.load(dict_file)

    learning_rate = CustomSchedule(hp_lex["dim"])
    opt_adam = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    pred_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction='none')

    pred_loss_mean = tf.keras.metrics.Mean(name='prediction_train_loss')

    lex_train_loss_mean = tf.keras.metrics.Mean(name='lexical_train_loss')
    lex_perplexity = tf.keras.metrics.Mean(name='lexical_perplexity')
    lex_train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='lexical_train_accuracy')

    pos_train_loss_mean = tf.keras.metrics.Mean(name='pos_train_loss')
    pos_perplexity = tf.keras.metrics.Mean(name='pos_perplexity')
    pos_train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='pos_train_accuracy')

    pred_train_accuracy = tf.keras.metrics.Accuracy(name='pred_train_accuracy')

    metrics_history = {
        'lex loss': [],
        'lex perp': [],
        'lex acc': [],
        'pos loss': [],
        'pos perp': [],
        'pos acc': [],
        'pred loss': [],
        'pred acc': []
    }

    def loss_function(real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)
        perp_ = tf.exp(loss_)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask
        perp_ *= mask

        return tf.reduce_mean(loss_), tf.reduce_mean(perp_)

    form_two = architecture(
        hp_lex["layers"],
        hp_lex["dim"],
        hp_lex["pos_dim"],
        hp_lex["dff"],
        hp_lex["heads"],
        hp_lex["vocab"] + 4,  # start, end, unknown, special tokens
        hp_lex["pos_vocab"] + 4,
        rate=hp_lex["rate"],
        max_len=max_length)

    count_p = tf.keras.backend.count_params

    checkpoint_path = "./checkpoints/form2/" + chkpt_dir
    ckpt = tf.train.Checkpoint(transformer=form_two, optimizer=opt_adam)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

    # if a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest Form2 checkpoint restored!!')

    @tf.function
    def train_step(inputs_, targets_, trn_lbl, training=True):
        lex_targets_, pos_targets_, pred_targets_ = targets_
        pred_targets_ = tf.expand_dims(pred_targets_, axis=0)
        with tf.GradientTape(persistent=True) as tape:
            words_out, pos_out, pred_out = form_two(inputs_, turn_labels=trn_lbl,
                                                    training=training)
            """
            inputs: tuple( words, pos )
            words = list [target, history...]
            pos = list [target, history...]
            history -> most ancient ... most recent
            """
            lex_loss, lex_perp = loss_function(lex_targets_, words_out)
            pos_loss, pos_perp = loss_function(pos_targets_, pos_out)
            pred_loss_val = pred_loss(pred_targets_, pred_out)

        trn_vars = form_two.trainable_variables
        pos_grads = tape.gradient(pos_loss, trn_vars)
        lex_grads = tape.gradient(lex_loss, trn_vars)
        pred_grads = tape.gradient(pred_loss_val, trn_vars)

        opt_adam.apply_gradients(zip(pos_grads, trn_vars))
        opt_adam.apply_gradients(zip(lex_grads, trn_vars))
        opt_adam.apply_gradients(zip(pred_grads, trn_vars))
        del tape

        lex_train_loss_mean(lex_loss)
        lex_perplexity(lex_perp)
        lex_train_accuracy(lex_targets_, words_out)

        pos_train_loss_mean(pos_loss)
        pos_perplexity(pos_perp)
        pos_train_accuracy(pos_targets_, pos_out)

        pred_loss_mean(pred_loss_val)
        pred_train_accuracy(pred_targets_, pred_out)
        return

    for epoch in range(epochs):
        print()
        lex_input.seek(0)
        pos_input.seek(0)
        tar_input.seek(0)

        lex_train_loss_mean.reset_states()
        lex_train_accuracy.reset_states()
        pos_train_loss_mean.reset_states()
        pos_train_accuracy.reset_states()

        start_ = time()
        step_count = 1
        for lex, pos, tar in zip(lex_input, pos_input, tar_input):
            """
            inputs: tuple( words, pos )
            words = list [target, history...]
            pos = list [target, history...]
            history -> most ancient ... most recent

            targets = lex_targets, pos_targets, pred_targets
            """
            lex = [
                str2tensor(context)
                for context in lex.decode(encoding).split(delimitter)
            ]
            pos = [
                str2tensor(context)
                for context in pos.decode(encoding).split(delimitter)
            ]
            tar = tar.decode(encoding).split(delimitter)
            turns = str2int(tar[0])
            targets = [tf.convert_to_tensor(str2int(item)) for item in tar[1:]]
            inputs = (lex, pos)

            train_step(inputs, targets, turns)

            if step_count % (batch_size * 50) == 0:
                print('Epoch {} Batch {}'.format(epoch + 1, int(step_count / batch_size)))
                print('Lexical: Loss {:.4f} Perplexity {:.4f} Accuracy {:.4f}'.format(
                    lex_train_loss_mean.result(), lex_perplexity.result(),
                    lex_train_accuracy.result()))
                print('POS: Loss {:.4f} Perplexity {:.4f} Accuracy {:.4f}'.format(
                    pos_train_loss_mean.result(), pos_perplexity.result(),
                    pos_train_accuracy.result()))
                print('Label: Loss {:.4f} Accuracy {:.4f}'.format(
                    pred_loss_mean.result(), pred_train_accuracy.result()))
            step_count += 1

        trainable = None
        if not trainable:
            trainable = np.sum(
                [count_p(p) for p in set(form_two.trainable_weights)])

        print()
        print('Epoch {}'.format(epoch + 1))
        print('time taken for epoch: {} secs'.format(time() - start_))
        ckpt_save_path = ckpt_manager.save()
        print('saving checkpoint for epoch {} at {}'.format(
            epoch + 1, ckpt_save_path))
        print('total trainable variables: {}'.format(trainable))
        # print('sample results')
        # print('lexical: ' + str(tf.argmax(res_lex, axis=-1).numpy()))
        # print('pos: ' + str(tf.argmax(res_pos, axis=-1).numpy()))
        # print('label: ' + str(res_pred.numpy()))

        print('Lexical: Loss {:.4f} Perplexity {:.4f} Accuracy {:.4f}'.format(
            lex_train_loss_mean.result(), lex_perplexity.result(),
            lex_train_accuracy.result()))
        metrics_history['lex loss'].append(float(lex_train_loss_mean.result()))
        metrics_history['lex perp'].append(float(lex_perplexity.result()))
        metrics_history['lex acc'].append(float(lex_train_accuracy.result()))

        print('POS: Loss {:.4f} Perplexity {:.4f} Accuracy {:.4f}'.format(
            pos_train_loss_mean.result(), pos_perplexity.result(),
            pos_train_accuracy.result()))
        metrics_history['pos loss'].append(float(pos_train_loss_mean.result()))
        metrics_history['pos perp'].append(float(pos_perplexity.result()))
        metrics_history['pos acc'].append(float(pos_train_accuracy.result()))

        print('Label: Loss {:.4f} Accuracy {:.4f}'.format(
            pred_loss_mean.result(), pred_train_accuracy.result()))
        metrics_history['pred loss'].append(float(pred_loss_mean.result()))
        metrics_history['pred acc'].append(float(pred_train_accuracy.result()))
        print()

    for key in metrics_history:
        max_ = max(metrics_history[key])
        if max_ == 0:
            max_ = 1
        metrics_history[key] = [float(i) / max_ for i in metrics_history[key]]

    with open(chkpt_dir + ' metrics history.csv', 'w') as csv_file:
        wr = csv.writer(csv_file)
        keys = list(metrics_history.keys())
        wr.writerows([keys])
        wr.writerows(zip(*[metrics_history[key] for key in keys]))
    pass
#else:
#    print 'Cannot find ', infile
#    suffix = suffix.replace('dreamZPT', 'dreamZS')
#    infile = indir + '/' + sourceName + '-' + order + suffix + 'Results.pkl.gz'
#    print 'Trying ', infile, ' instead'

if not os.path.exists(infile):
    print 'Error: cannot find ', infile
    sys.exit(1)

print infile
outfile = infile.replace('.pkl', 'vsini.txt')
print outfile

fp = zopen(infile, 'r')
db = load(fp)
fp.close()

lsf = db['FWHM'] * 1000.
alpha = db['alpha']
vsini = db['vsini']
cz = db['vr']
T = db['T']
logg = db['logg']
logp = db['ZP']
chisq = db['chisq']

burn = autoburn(chisq, 100)
if burn < 1000:
    burn = 1000
#print burn
def man_page(writer, src, dst):
    with open(src, encoding="utf-8") as source:
        rst = source.read().format(version=__version__)
    with zopen(dst, 'wb') as destination:
        destination.write(publish_string(source=rst, writer=writer))
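# A minimal usage sketch for man_page(), assuming docutils' manpage writer and
# hypothetical source/destination paths; the real call site is not shown in this excerpt.
from docutils.writers import manpage

man_page(manpage.Writer(), "docs/mytool.rst", "build/mytool.1.gz")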
from gzip import open as zopen
from kerneltree import IntervalTree

it = IntervalTree()

it.add(1, 5, 7)
it.add(2, 3, 4)
it.add(0, 1, 5)
it.add(3, 4, 42)
it.add(1, 2, 349)

it.search(0, 2)
it.search(3, 3)
it.search(10, 200)

for i, line in enumerate(zopen("tests/test_file.txt.gz")):
    start, end, _ = line.split()
    start, end = int(start), int(end)
    it.add(start, end, i)
    print(i)
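# Follow-up sketch: zopen is gzip.open in binary mode, so line.split() yields bytes,
# which int() accepts directly in Python 3. The helper below mirrors the loop above;
# the function name and the assumed "start end value" row layout are illustrative only.
def load_intervals(path, tree):
    with zopen(path) as handle:
        for i, line in enumerate(handle):
            start, end, _ = line.split()   # bytes fields from the gzipped text file
            tree.add(int(start), int(end), i)
    return tree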
def make_cola():
    """
    write formats:

    lex input: target -> series of contexts
    pos input: target -> series of contexts
    targets file: turns -> lex target -> pos target -> label targets
    :return:
    """
    tagger_name = "pickle_jar/brown_full_tagger.pickle"
    with open(tagger_name, "rb") as dict_file:
        tagger = pickle.load(dict_file)
    with open(pos_tok_name, "rb") as dict_file:
        pos_tokeniser = pickle.load(dict_file)
    print("loaded POS tokeniser")
    with open(lex_tok_name, "rb") as dict_file:
        lex_tokeniser = pickle.load(dict_file)

    pos_start = [pos_tokeniser.num_words + 1]
    pos_end = [pos_tokeniser.num_words + 2]
    lex_start = [lex_tokeniser.num_words + 1]
    lex_end = [lex_tokeniser.num_words + 2]

    training = True
    if training:
        raw_file = "C:/Users/admin/Downloads/GLUE data/CoLA/train.tsv"
        write_file_lex = "cola_train_lex.gz"
        write_file_pos = "cola_train_pos.gz"
        write_file_tar = "cola_train_tar.gz"
    else:
        raw_file = "C:/Users/admin/Downloads/GLUE data/CoLA/dev.tsv"
        write_file_lex = "cola_dev_lex.gz"
        write_file_pos = "cola_dev_pos.gz"
        write_file_tar = "cola_dev_tar.gz"

    lines = []
    with open(raw_file, "r", encoding='utf-8') as read_file:
        for raw_line in read_file:
            line = raw_line.strip().split('\t')
            lines.append((line[1], line[3]))

    shuffle(lines)

    with zopen(write_file_lex, "wb") as w_f_l, zopen(write_file_pos, "wb") as w_f_p, zopen(
            write_file_tar, "wb") as w_f_t:
        for line in lines:
            label, string = line
            label = ' '.join([str(tkn) for tkn in make_one_hot(int(label), 3)])

            string = prepare_raw_string("determine acceptability: " + string)
            tags = [t for w, t in tagger.tag(string)]
            # print(string)
            # print(tags)

            string = lex_tokeniser.texts_to_sequences([string])[0]
            string = pad_list(lex_start + string + lex_end, seq_pad_len)
            string = ' '.join([str(tkn) for tkn in string])
            # print(string)
            w_f_l.write((string + '\n').encode(encoding=encoding))

            tags = pos_tokeniser.texts_to_sequences([tags])[0]
            tags = pad_list(pos_start + tags + pos_end, seq_pad_len)
            tags = ' '.join([str(tkn) for tkn in tags])
            # print(tags)
            w_f_p.write((tags + '\n').encode(encoding=encoding))

            write_ = delimitter.join(["0", string, tags, label])
            # print(write_)
            w_f_t.write((write_ + '\n').encode(encoding=encoding))
    return
def make_input_files(paragraphs_raw):
    """
    [
      paragraph 1 -> [ sent1=[(w1, t1), ...], sent2=[...], ... ],
      paragraph 2 -> ...
    ]
    :param paragraphs_raw: list of paragraphs
    :return:
    """
    paragraphs = []
    for p in paragraphs_raw:
        paragraph_ = []
        for turn, sent in enumerate(p):
            paragraph_.extend([
                (turn % 2, list_sent)
                for list_sent in truncate_split(sent, max_sent_len, overlap)
            ])
        paragraphs.append(paragraph_)
    del paragraphs_raw

    lengths = [len(p) for p in paragraphs]
    print("made paragraphs")

    paras_num = len(paragraphs)
    indices_in_nested_list = [(p, s) for p in range(paras_num)
                              for s in range(1, lengths[p])]
    # first index, p = index of list in list. second index, s = index of item in nested list
    # second loop starts at 1 to skip the first (0) index in each nested list
    jumbled = [(p - paras_num, s) for p, s in indices_in_nested_list]
    indices_in_nested_list.extend(jumbled)
    shuffle(indices_in_nested_list)

    total = sum(lengths)
    print("total sentences: {}".format(total))
    print("made index list")

    with open(pos_tok_name, "rb") as dict_file:
        pos_tokeniser = pickle.load(dict_file)
    print("loaded POS tokeniser")
    with open(lex_tok_name, "rb") as dict_file:
        lex_tokeniser = pickle.load(dict_file)
    print("loaded lexical tokeniser")

    pos_start = [pos_tokeniser.num_words + 1]
    pos_end = [pos_tokeniser.num_words + 2]
    lex_start = [lex_tokeniser.num_words + 1]
    lex_end = [lex_tokeniser.num_words + 2]

    with zopen(data_name + '_lex_inputs.gz', 'wb') as in_lex_f, zopen(
            data_name + '_targets.gz', 'wb') as tar_f, zopen(
            data_name + '_pos_inputs.gz', 'wb') as in_pos_f:
        for index_1, index_2 in indices_in_nested_list:
            jumble = index_1 < 0
            turn, target = paragraphs[index_1][index_2]
            turns = [turn]
            if jumble:
                contexts = [sample(target, len(target))]
            else:
                contexts = [target]
            for index_3 in range(index_2):
                turn, context = paragraphs[index_1][index_3]
                turns.append(turn)
                contexts.append(context)

            tar_lex, tar_pos = zip(*target)
            tar_lex = lex_tokeniser.texts_to_sequences([list(tar_lex)])[0] + lex_end
            tar_lex = pad_list(tar_lex, seq_pad_len)
            tar_pos = pos_tokeniser.texts_to_sequences([list(tar_pos)])[0] + pos_end
            tar_pos = pad_list(tar_pos, seq_pad_len)

            write_ = delimitter.join([
                ' '.join(str(turn) for turn in turns),
                ' '.join([str(lex_) for lex_ in tar_lex]),
                ' '.join([str(pos_) for pos_ in tar_pos]),
                str(-1 if jumble else 1)
            ])
            tar_f.write((write_ + '\n').encode(encoding=encoding))

            in_lex, in_pos = zip(*[tuple(zip(*sent)) for sent in contexts])
            in_lex = list(in_lex)
            in_pos = list(in_pos)
            for count in range(len(in_lex)):
                in_lex[count] = lex_tokeniser.texts_to_sequences(
                    [list(in_lex[count])])[0]
                in_pos[count] = pos_tokeniser.texts_to_sequences(
                    [list(in_pos[count])])[0]
                if count != 0:
                    in_lex[count] += lex_end
                    in_pos[count] += pos_end
                in_lex[count] = lex_start + in_lex[count]
                in_lex[count] = pad_list(in_lex[count], seq_pad_len)
                in_lex[count] = ' '.join(str(lex_) for lex_ in in_lex[count])
                in_pos[count] = pos_start + in_pos[count]
                in_pos[count] = pad_list(in_pos[count], seq_pad_len)
                in_pos[count] = ' '.join(str(pos_) for pos_ in in_pos[count])

            write_ = delimitter.join(in_lex)
            in_lex_f.write((write_ + '\n').encode(encoding=encoding))
            write_ = delimitter.join(in_pos)
            in_pos_f.write((write_ + '\n').encode(encoding=encoding))
    return
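# A hypothetical two-paragraph input illustrating the structure described in the
# docstring above (word/tag pairs per sentence); the words and tags are made up.
paragraphs_raw_example = [
    [  # paragraph 1
        [("the", "DET"), ("cat", "NOUN"), ("sat", "VERB")],  # sent1
        [("it", "PRON"), ("purred", "VERB")],                # sent2
    ],
    [  # paragraph 2
        [("rain", "NOUN"), ("fell", "VERB")],
    ],
]
# make_input_files(paragraphs_raw_example)  # also requires the tokeniser pickles on disk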
def make_mlm_inputs(sentences_raw):
    indices = list(range(len(sentences_raw)))
    shuffle(indices)

    with open(pos_tok_name, "rb") as dict_file:
        pos_tokeniser = pickle.load(dict_file)
    print("loaded POS tokeniser")
    with open(lex_tok_name, "rb") as dict_file:
        lex_tokeniser = pickle.load(dict_file)
    print("loaded lexical tokeniser")

    pos_start = pos_tokeniser.num_words + 1
    pos_end = pos_tokeniser.num_words + 2
    pos_mask = 1
    lex_start = lex_tokeniser.num_words + 1
    lex_end = lex_tokeniser.num_words + 2
    lex_mask = 1

    valid_inputs_count = 0
    with zopen(data_name + "_mlm_lex_inputs.gz", "wb") as lex_f, zopen(
            data_name + "_mlm_pos_inputs.gz", "wb") as pos_f, zopen(
            data_name + "_mlm_targets.gz", "wb") as tar_f:
        for index in indices:
            sentence = sentences_raw[index]
            if len(sentence) > max_sent_len:
                continue
            valid_inputs_count += 1

            _lex, _pos = zip(*sentence)
            _lex = list(_lex)
            _pos = list(_pos)
            _lex = lex_tokeniser.texts_to_sequences([_lex])[0]
            _pos = pos_tokeniser.texts_to_sequences([_pos])[0]

            mask = [random() for _ in _lex]
            inp_lx = []
            inp_ps = []

            chance = random() < 0.4
            label = ' '.join(str(tkn) for tkn in make_one_hot(int(chance), 3))
            if chance:
                inp_lx, inp_ps = zip(*sample(list(zip(_lex, _pos)), len(_lex)))
                inp_lx = list(inp_lx)
                inp_ps = list(inp_ps)
            else:
                for i, pr in enumerate(mask):
                    if pr > 0.15:
                        inp_lx.append(_lex[i])
                        inp_ps.append(_pos[i])
                    else:
                        inp_lx.append(lex_mask)
                        inp_ps.append(pos_mask)

            inp_lx = pad_list([lex_start] + inp_lx + [lex_end], seq_pad_len)  # list of tokens
            inp_lx = (pad_list([lex_start] + _lex, seq_pad_len), inp_lx)
            inp_ps = pad_list([pos_start] + inp_ps + [pos_end], seq_pad_len)
            inp_ps = (pad_list([pos_start] + _pos, seq_pad_len), inp_ps)

            tar_lx = pad_list(_lex + [lex_end], seq_pad_len)
            tar_ps = pad_list(_pos + [pos_end], seq_pad_len)

            write_ = delimitter.join(
                [' '.join([str(tkn) for tkn in turn]) for turn in inp_lx])
            # print(write_)
            lex_f.write((write_ + '\n').encode(encoding=encoding))

            write_ = delimitter.join(
                [' '.join([str(tkn) for tkn in turn]) for turn in inp_ps])
            # print(write_)
            pos_f.write((write_ + '\n').encode(encoding=encoding))

            write_ = delimitter.join([
                "0 1",
                ' '.join([str(tkn) for tkn in tar_lx]),
                ' '.join([str(tkn) for tkn in tar_ps]),
                label
            ])
            # print(write_)
            tar_f.write((write_ + '\n').encode(encoding=encoding))
    print(valid_inputs_count)
    return
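# A condensed sketch of the corruption step above, with hypothetical names: a 40%
# chance of shuffling the whole sentence (order-prediction negative) versus masking
# roughly 15% of positions with the mask id.
from random import random, sample

def corrupt(lex_tokens, pos_tokens, lex_mask=1, pos_mask=1):
    """Return (lex, pos, shuffled_flag) following the scheme in make_mlm_inputs."""
    if random() < 0.4:                          # shuffled negative example
        pairs = sample(list(zip(lex_tokens, pos_tokens)), len(lex_tokens))
        lex, pos = map(list, zip(*pairs))
        return lex, pos, True
    lex, pos = [], []
    for lx, ps in zip(lex_tokens, pos_tokens):  # MLM-style masking
        if random() > 0.15:
            lex.append(lx)
            pos.append(ps)
        else:
            lex.append(lex_mask)
            pos.append(pos_mask)
    return lex, pos, False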
if fsys:
    wildCard += 'fsys-'
wildCard += thismod[0] + '-Results.pkl.gz'

fl = sorted(glob(wildCard))
print fl

vrList = []
vsiniList = []
chiList = []
fwhmList = []
TList = []
gList = []
alphaList = []

for f in fl:
    fp = zopen(f, 'r')
    db = load(fp)
    fp.close()
    b = db['burn']
    if b < 1000:
        b = 1000
    vrList += [db['vr'][b:]]
    vsiniList += [db['vsini'][b:]]
    chiList += [db['chisq'][b:] / db['dof']]
    fwhmList += [db['FWHM'][b:] * 1000.]
    TList += [db['T'][b:]]
    gList += [db['logg'][b:]]
    alphaList += [db['alpha'][b:]]

clf()
boxplot(vrList, whis='range')
xticks(arange(7) + 1, ('32', '33', '34', '35', '36', '37', '38'))