def parse_genebank_file(self, filepath, filename):
    logger.debug("Parse file {filename}".format(filename=filename))
    genebankid = filename.split("_", 2)
    genebankid = genebankid[0] + "_" + genebankid[1]
    f = zopen(filepath, "r")
    refseqid = f.readline().split(b" ")[0].lstrip(b">")
    f.close()
    self.refseqid_to_GCF[refseqid] = genebankid
    return
def parse_genomeid2taxid(self, genomes_path, annotation_file):
    '''To allow NCBI databases to be built from scratch, the sequence names need to be
    stored in the database. This function parses the accession2taxid file from NCBI;
    to speed things up and reduce the amount of stored data, only sequences present
    in the input genomes_path are fetched.
    '''
    logger.info("Parsing ncbi accession2taxid, genome_path: {dir}".format(dir=genomes_path))
    self.refseqid_to_GCF = {}
    for root, dirs, files in os.walk(genomes_path, followlinks=True):
        for filename in files:
            if filename.strip(".gz").endswith(".fna"):
                filepath = os.path.join(root, filename)
                self.parse_genebank_file(filepath, filename)
    logger.info("genomes folder read, {n} sequence files found".format(n=len(self.refseqid_to_GCF)))
    if not annotation_file.endswith("accession2taxid.gz"):
        raise TypeError("The supplied annotation file does not seem to be the ncbi nucl_gb.accession2taxid.gz")
    annotated_genome = set()
    try:
        with zopen(annotation_file, "r") as f:
            headers = f.readline().split(b"\t")
            for row in f:
                if row.strip() != b"":  ## If there are trailing empty lines in the file
                    if len(row.split(b"\t")) > 2:
                        try:
                            refseqid, taxid = row.split(b"\t")[1:3]
                        except:
                            logger.info(row)
                            logger.info(row.split(b"\t"))
                            if len(annotated_genome) > 0:
                                logger.info("Potential error in last row?")
                            else:
                                logger.info("Error on first line in annotation file, check format!")
                        try:
                            genebankid = self.refseqid_to_GCF[refseqid]
                            self.database.add_genome(genome=genebankid, _id=taxid.decode("utf-8"))
                            annotated_genome.add(refseqid)
                        except KeyError:
                            pass
            self.database.commit()
    except zlib.error as e:
        logger.info("Error in annotation file {e}".format(e=e))
    missing = set(self.refseqid_to_GCF.keys()) - annotated_genome
    missing = [self.refseqid_to_GCF[m] for m in missing]  ## Translate to GCF ids
    if logging.root.level <= 20:  ## Equal to --verbose
        logger.info("Printing non-added genome ids (GCF) to ./FlexTaxD.not_added")
        self.write_missing(missing)
        logger.debug(missing)  ## If debug, also print genomes to terminal
    logger.info("Genomes not matching any annotation {len}".format(len=len(missing)))
    return missing
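# A minimal standalone sketch of the row handling above, assuming the standard
# four-column layout of NCBI's nucl_gb.accession2taxid.gz (accession,
# accession.version, taxid, gi); the sample row below is made up.
sample_row = b"NZ_CP000000\tNZ_CP000000.1\t1280\t123456789\n"
columns = sample_row.split(b"\t")
refseqid, taxid = columns[1:3]              # accession.version and taxid, as in the parser
print(refseqid, taxid.decode("utf-8"))      # b'NZ_CP000000.1' 1280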
db['burn'] = autoburn(chisq)

# re-flag data that we had interpolated over
yStar[bad] = nan
errStar[bad] = nan

db['errStar'] = errStar
db['xStar'] = xStar
db['wave'] = x
db['yStar'] = yStar
db['model'] = mdl
db['transmission'] = trans
db['BDmodel'] = bdspec
db['sysUnc'] = sysUnc
db['Z'] = Z
db['ZP'] = ZP

fp = zopen(outdir + '/' + sourceName + '-' + order + '-' + suffix +
           '-Results.pkl.gz', 'w')
dump(db, fp)
fp.close()

# plot the model
p = Z[-1]
print 'p = ', p
x = xgridFromAnchors(len(yStar), p)
m = model(p)
clf()
left, width = 0.1, 0.8
bottom0, height0 = 0.1, 0.3
bottom1, height1 = bottom0 + height0, 0.9 - bottom0 - height0
spect = axes([left, bottom1, width, height1])
resid = axes([left, bottom0, width, height0])
def main():
    from time import time
    from sys import argv
    import csv

    config_file = argv[1]
    configs = []
    with open(config_file, "r") as config_file_:
        for line in config_file_:
            configs.append(line)

    available_models = {
        "FormTwo": FormTwo.FormTwo,
        "DualLongFormTwo": FormTwo.DualLongFormTwo,
        "DualShortFormTwo": FormTwo.DualShortFormTwo,
        "MixedFormTwo": FormTwo.MixedFormTwo
    }

    lex_file_name, pos_file_name, tar_file_name, max_length, batch_size, epochs, \
        hp_lex, chkpt_dir, architecture, delimitter, encoding = make_global_vars(configs)

    try:
        architecture = available_models[architecture]
    except KeyError as kE:
        print(kE)
        print()
        print("##############################################################\n")
        print("invalid architecture name. Please choose one of the following:")
        print(list(available_models.keys()))
        print()
        print("##############################################################\n")
        exit(1)

    lex_input = zopen(lex_file_name, 'rb')
    pos_input = zopen(pos_file_name, 'rb')
    tar_input = zopen(tar_file_name, 'rb')

    # input_files = [join(input_dir, f_) for f_ in listdir(input_dir) if isfile(join(input_dir, f_))]
    #
    # with open(pos_tok_file, "rb") as dict_file:
    #     pos_tokeniser = pickle.load(dict_file)
    # with open(lex_tok_file, "rb") as dict_file:
    #     lex_tokeniser = pickle.load(dict_file)

    learning_rate = CustomSchedule(hp_lex["dim"])
    opt_adam = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    pred_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction='none')

    pred_loss_mean = tf.keras.metrics.Mean(name='prediction_train_loss')

    lex_train_loss_mean = tf.keras.metrics.Mean(name='lexical_train_loss')
    lex_perplexity = tf.keras.metrics.Mean(name='lexical_perplexity')
    lex_train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='lexical_train_accuracy')

    pos_train_loss_mean = tf.keras.metrics.Mean(name='pos_train_loss')
    pos_perplexity = tf.keras.metrics.Mean(name='pos_perplexity')
    pos_train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='pos_train_accuracy')

    pred_train_accuracy = tf.keras.metrics.Accuracy(name='pred_train_accuracy')

    metrics_history = {
        'lex loss': [],
        'lex perp': [],
        'lex acc': [],
        'pos loss': [],
        'pos perp': [],
        'pos acc': [],
        'pred loss': [],
        'pred acc': []
    }

    def loss_function(real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)
        perp_ = tf.exp(loss_)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask
        perp_ *= mask

        return tf.reduce_mean(loss_), tf.reduce_mean(perp_)

    form_two = architecture(
        hp_lex["layers"],
        hp_lex["dim"],
        hp_lex["pos_dim"],
        hp_lex["dff"],
        hp_lex["heads"],
        hp_lex["vocab"] + 4,  # start, end, unknown, special tokens
        hp_lex["pos_vocab"] + 4,
        rate=hp_lex["rate"],
        max_len=max_length)

    count_p = tf.keras.backend.count_params

    checkpoint_path = "./checkpoints/form2/" + chkpt_dir
    ckpt = tf.train.Checkpoint(transformer=form_two, optimizer=opt_adam)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

    # if a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest Form2 checkpoint restored!!')

    @tf.function
    def train_step(inputs_, targets_, trn_lbl, training=True):
        lex_targets_, pos_targets_, pred_targets_ = targets_
        pred_targets_ = tf.expand_dims(pred_targets_, axis=0)
        with tf.GradientTape(persistent=True) as tape:
            words_out, pos_out, pred_out = form_two(inputs_, turn_labels=trn_lbl,
                                                    training=training)
            """
            inputs: tuple( words, pos )
            words = list [target, history...]
            pos = list [target, history...]
            history -> most ancient ... most recent
            """
            lex_loss, lex_perp = loss_function(lex_targets_, words_out)
            pos_loss, pos_perp = loss_function(pos_targets_, pos_out)
            pred_loss_val = pred_loss(pred_targets_, pred_out)

        trn_vars = form_two.trainable_variables
        pos_grads = tape.gradient(pos_loss, trn_vars)
        lex_grads = tape.gradient(lex_loss, trn_vars)
        pred_grads = tape.gradient(pred_loss_val, trn_vars)

        opt_adam.apply_gradients(zip(pos_grads, trn_vars))
        opt_adam.apply_gradients(zip(lex_grads, trn_vars))
        opt_adam.apply_gradients(zip(pred_grads, trn_vars))
        del tape

        lex_train_loss_mean(lex_loss)
        lex_perplexity(lex_perp)
        lex_train_accuracy(lex_targets_, words_out)

        pos_train_loss_mean(pos_loss)
        pos_perplexity(pos_perp)
        pos_train_accuracy(pos_targets_, pos_out)

        pred_loss_mean(pred_loss_val)
        pred_train_accuracy(pred_targets_, pred_out)
        return

    for epoch in range(epochs):
        print()
        lex_input.seek(0)
        pos_input.seek(0)
        tar_input.seek(0)

        lex_train_loss_mean.reset_states()
        lex_train_accuracy.reset_states()
        pos_train_loss_mean.reset_states()
        pos_train_accuracy.reset_states()

        start_ = time()
        step_count = 1
        for lex, pos, tar in zip(lex_input, pos_input, tar_input):
            """
            inputs: tuple( words, pos )
            words = list [target, history...]
            pos = list [target, history...]
            history -> most ancient ... most recent

            targets = lex_targets, pos_targets, pred_targets
            """
            lex = [
                str2tensor(context)
                for context in lex.decode(encoding).split(delimitter)
            ]
            pos = [
                str2tensor(context)
                for context in pos.decode(encoding).split(delimitter)
            ]
            tar = tar.decode(encoding).split(delimitter)
            turns = str2int(tar[0])
            targets = [tf.convert_to_tensor(str2int(item)) for item in tar[1:]]
            inputs = (lex, pos)

            train_step(inputs, targets, turns)

            if step_count % (batch_size * 50) == 0:
                print('Epoch {} Batch {}'.format(epoch + 1, int(step_count / batch_size)))
                print('Lexical: Loss {:.4f} Perplexity {:.4f} Accuracy {:.4f}'.format(
                    lex_train_loss_mean.result(), lex_perplexity.result(),
                    lex_train_accuracy.result()))
                print('POS: Loss {:.4f} Perplexity {:.4f} Accuracy {:.4f}'.format(
                    pos_train_loss_mean.result(), pos_perplexity.result(),
                    pos_train_accuracy.result()))
                print('Label: Loss {:.4f} Accuracy {:.4f}'.format(
                    pred_loss_mean.result(), pred_train_accuracy.result()))
            step_count += 1

        trainable = None
        if not trainable:
            trainable = np.sum(
                [count_p(p) for p in set(form_two.trainable_weights)])

        print()
        print('Epoch {}'.format(epoch + 1))
        print('time taken for epoch: {} secs'.format(time() - start_))
        ckpt_save_path = ckpt_manager.save()
        print('saving checkpoint for epoch {} at {}'.format(
            epoch + 1, ckpt_save_path))
        print('total trainable variables: {}'.format(trainable))
        # print('sample results')
        # print('lexical: ' + str(tf.argmax(res_lex, axis=-1).numpy()))
        # print('pos: ' + str(tf.argmax(res_pos, axis=-1).numpy()))
        # print('label: ' + str(res_pred.numpy()))

        print('Lexical: Loss {:.4f} Perplexity {:.4f} Accuracy {:.4f}'.format(
            lex_train_loss_mean.result(), lex_perplexity.result(),
            lex_train_accuracy.result()))
        metrics_history['lex loss'].append(float(lex_train_loss_mean.result()))
        metrics_history['lex perp'].append(float(lex_perplexity.result()))
        metrics_history['lex acc'].append(float(lex_train_accuracy.result()))

        print('POS: Loss {:.4f} Perplexity {:.4f} Accuracy {:.4f}'.format(
            pos_train_loss_mean.result(), pos_perplexity.result(),
            pos_train_accuracy.result()))
        metrics_history['pos loss'].append(float(pos_train_loss_mean.result()))
        metrics_history['pos perp'].append(float(pos_perplexity.result()))
        metrics_history['pos acc'].append(float(pos_train_accuracy.result()))

        print('Label: Loss {:.4f} Accuracy {:.4f}'.format(
            pred_loss_mean.result(), pred_train_accuracy.result()))
        metrics_history['pred loss'].append(float(pred_loss_mean.result()))
        metrics_history['pred acc'].append(float(pred_train_accuracy.result()))
        print()

    for key in metrics_history:
        max_ = max(metrics_history[key])
        if max_ == 0:
            max_ = 1
        metrics_history[key] = [float(i) / max_ for i in metrics_history[key]]

    with open(chkpt_dir + ' metrics history.csv', 'w') as csv_file:
        wr = csv.writer(csv_file)
        keys = list(metrics_history.keys())
        wr.writerows([keys])
        wr.writerows(zip(*[metrics_history[key] for key in keys]))
    pass
#else:
#    print 'Cannot find ', infile
#    suffix = suffix.replace('dreamZPT', 'dreamZS')
#    infile = indir + '/' + sourceName + '-' + order + suffix + 'Results.pkl.gz'
#    print 'Trying ', infile, ' instead'

if not os.path.exists(infile):
    print 'Error: cannot find ', infile
    sys.exit(1)

print infile
outfile = infile.replace('.pkl', 'vsini.txt')
print outfile

fp = zopen(infile, 'r')
db = load(fp)
fp.close()

lsf = db['FWHM'] * 1000.
alpha = db['alpha']
vsini = db['vsini']
cz = db['vr']
T = db['T']
logg = db['logg']
logp = db['ZP']
chisq = db['chisq']

burn = autoburn(chisq, 100)
if burn < 1000:
    burn = 1000
#print burn
def man_page(writer, src, dst):
    with open(src, encoding="utf-8") as source:
        rst = source.read().format(version=__version__)
    with zopen(dst, 'wb') as destination:
        destination.write(publish_string(source=rst, writer=writer))
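# A minimal usage sketch for man_page(), assuming docutils' manpage writer and
# hypothetical source/destination paths; the real call site is not shown in this excerpt.
from docutils.writers import manpage

man_page(manpage.Writer(), "docs/mytool.rst", "build/mytool.1.gz")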
from gzip import open as zopen
from kerneltree import IntervalTree

it = IntervalTree()

it.add(1, 5, 7)
it.add(2, 3, 4)
it.add(0, 1, 5)
it.add(3, 4, 42)
it.add(1, 2, 349)

it.search(0, 2)
it.search(3, 3)
it.search(10, 200)

for i, line in enumerate(zopen("tests/test_file.txt.gz")):
    start, end, _ = line.split()
    start, end = int(start), int(end)
    it.add(start, end, i)
    print(i)
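# Follow-up sketch: zopen is gzip.open in binary mode, so line.split() yields bytes,
# which int() accepts directly in Python 3. The helper below mirrors the loop above;
# the function name and the assumed "start end value" row layout are illustrative only.
def load_intervals(path, tree):
    with zopen(path) as handle:
        for i, line in enumerate(handle):
            start, end, _ = line.split()   # bytes fields from the gzipped text file
            tree.add(int(start), int(end), i)
    return tree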
def make_cola():
    """
    write formats:

    lex input: target -> series of contexts
    pos input: target -> series of contexts
    targets file: turns -> lex target -> pos target -> label targets
    :return:
    """
    tagger_name = "pickle_jar/brown_full_tagger.pickle"
    with open(tagger_name, "rb") as dict_file:
        tagger = pickle.load(dict_file)
    with open(pos_tok_name, "rb") as dict_file:
        pos_tokeniser = pickle.load(dict_file)
    print("loaded POS tokeniser")
    with open(lex_tok_name, "rb") as dict_file:
        lex_tokeniser = pickle.load(dict_file)

    pos_start = [pos_tokeniser.num_words + 1]
    pos_end = [pos_tokeniser.num_words + 2]
    lex_start = [lex_tokeniser.num_words + 1]
    lex_end = [lex_tokeniser.num_words + 2]

    training = True
    if training:
        raw_file = "C:/Users/admin/Downloads/GLUE data/CoLA/train.tsv"
        write_file_lex = "cola_train_lex.gz"
        write_file_pos = "cola_train_pos.gz"
        write_file_tar = "cola_train_tar.gz"
    else:
        raw_file = "C:/Users/admin/Downloads/GLUE data/CoLA/dev.tsv"
        write_file_lex = "cola_dev_lex.gz"
        write_file_pos = "cola_dev_pos.gz"
        write_file_tar = "cola_dev_tar.gz"

    lines = []
    with open(raw_file, "r", encoding='utf-8') as read_file:
        for raw_line in read_file:
            line = raw_line.strip().split('\t')
            lines.append((line[1], line[3]))

    shuffle(lines)

    with zopen(write_file_lex, "wb") as w_f_l, zopen(write_file_pos, "wb") as w_f_p, zopen(
            write_file_tar, "wb") as w_f_t:
        for line in lines:
            label, string = line
            label = ' '.join([str(tkn) for tkn in make_one_hot(int(label), 3)])

            string = prepare_raw_string("determine acceptability: " + string)
            tags = [t for w, t in tagger.tag(string)]
            # print(string)
            # print(tags)

            string = lex_tokeniser.texts_to_sequences([string])[0]
            string = pad_list(lex_start + string + lex_end, seq_pad_len)
            string = ' '.join([str(tkn) for tkn in string])
            # print(string)
            w_f_l.write((string + '\n').encode(encoding=encoding))

            tags = pos_tokeniser.texts_to_sequences([tags])[0]
            tags = pad_list(pos_start + tags + pos_end, seq_pad_len)
            tags = ' '.join([str(tkn) for tkn in tags])
            # print(tags)
            w_f_p.write((tags + '\n').encode(encoding=encoding))

            write_ = delimitter.join(["0", string, tags, label])
            # print(write_)
            w_f_t.write((write_ + '\n').encode(encoding=encoding))
    return
def make_input_files(paragraphs_raw):
    """
    [
      paragraph 1 -> [ sent1=[(w1, t1), ...], sent2=[...], ... ],
      paragraph 2 -> ...
    ]
    :param paragraphs_raw: list of paragraphs
    :return:
    """
    paragraphs = []
    for p in paragraphs_raw:
        paragraph_ = []
        for turn, sent in enumerate(p):
            paragraph_.extend([
                (turn % 2, list_sent)
                for list_sent in truncate_split(sent, max_sent_len, overlap)
            ])
        paragraphs.append(paragraph_)
    del paragraphs_raw

    lengths = [len(p) for p in paragraphs]
    print("made paragraphs")

    paras_num = len(paragraphs)
    indices_in_nested_list = [(p, s) for p in range(paras_num)
                              for s in range(1, lengths[p])]
    # first index, p = index of list in list. second index, s = index of item in nested list
    # second loop starts at 1 to skip the first (0) index in each nested list
    jumbled = [(p - paras_num, s) for p, s in indices_in_nested_list]
    indices_in_nested_list.extend(jumbled)
    shuffle(indices_in_nested_list)

    total = sum(lengths)
    print("total sentences: {}".format(total))
    print("made index list")

    with open(pos_tok_name, "rb") as dict_file:
        pos_tokeniser = pickle.load(dict_file)
    print("loaded POS tokeniser")
    with open(lex_tok_name, "rb") as dict_file:
        lex_tokeniser = pickle.load(dict_file)
    print("loaded lexical tokeniser")

    pos_start = [pos_tokeniser.num_words + 1]
    pos_end = [pos_tokeniser.num_words + 2]
    lex_start = [lex_tokeniser.num_words + 1]
    lex_end = [lex_tokeniser.num_words + 2]

    with zopen(data_name + '_lex_inputs.gz', 'wb') as in_lex_f, zopen(
            data_name + '_targets.gz', 'wb') as tar_f, zopen(
            data_name + '_pos_inputs.gz', 'wb') as in_pos_f:
        for index_1, index_2 in indices_in_nested_list:
            jumble = index_1 < 0
            turn, target = paragraphs[index_1][index_2]
            turns = [turn]
            if jumble:
                contexts = [sample(target, len(target))]
            else:
                contexts = [target]
            for index_3 in range(index_2):
                turn, context = paragraphs[index_1][index_3]
                turns.append(turn)
                contexts.append(context)

            tar_lex, tar_pos = zip(*target)
            tar_lex = lex_tokeniser.texts_to_sequences([list(tar_lex)])[0] + lex_end
            tar_lex = pad_list(tar_lex, seq_pad_len)
            tar_pos = pos_tokeniser.texts_to_sequences([list(tar_pos)])[0] + pos_end
            tar_pos = pad_list(tar_pos, seq_pad_len)

            write_ = delimitter.join([
                ' '.join(str(turn) for turn in turns),
                ' '.join([str(lex_) for lex_ in tar_lex]),
                ' '.join([str(pos_) for pos_ in tar_pos]),
                str(-1 if jumble else 1)
            ])
            tar_f.write((write_ + '\n').encode(encoding=encoding))

            in_lex, in_pos = zip(*[tuple(zip(*sent)) for sent in contexts])
            in_lex = list(in_lex)
            in_pos = list(in_pos)
            for count in range(len(in_lex)):
                in_lex[count] = lex_tokeniser.texts_to_sequences(
                    [list(in_lex[count])])[0]
                in_pos[count] = pos_tokeniser.texts_to_sequences(
                    [list(in_pos[count])])[0]
                if count != 0:
                    in_lex[count] += lex_end
                    in_pos[count] += pos_end
                in_lex[count] = lex_start + in_lex[count]
                in_lex[count] = pad_list(in_lex[count], seq_pad_len)
                in_lex[count] = ' '.join(str(lex_) for lex_ in in_lex[count])
                in_pos[count] = pos_start + in_pos[count]
                in_pos[count] = pad_list(in_pos[count], seq_pad_len)
                in_pos[count] = ' '.join(str(pos_) for pos_ in in_pos[count])

            write_ = delimitter.join(in_lex)
            in_lex_f.write((write_ + '\n').encode(encoding=encoding))
            write_ = delimitter.join(in_pos)
            in_pos_f.write((write_ + '\n').encode(encoding=encoding))
    return
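# A hypothetical two-paragraph input illustrating the structure described in the
# docstring above (word/tag pairs per sentence); the words and tags are made up.
paragraphs_raw_example = [
    [  # paragraph 1
        [("the", "DET"), ("cat", "NOUN"), ("sat", "VERB")],  # sent1
        [("it", "PRON"), ("purred", "VERB")],                # sent2
    ],
    [  # paragraph 2
        [("rain", "NOUN"), ("fell", "VERB")],
    ],
]
# make_input_files(paragraphs_raw_example)  # also requires the tokeniser pickles on disk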
def make_mlm_inputs(sentences_raw):
    indices = list(range(len(sentences_raw)))
    shuffle(indices)

    with open(pos_tok_name, "rb") as dict_file:
        pos_tokeniser = pickle.load(dict_file)
    print("loaded POS tokeniser")
    with open(lex_tok_name, "rb") as dict_file:
        lex_tokeniser = pickle.load(dict_file)
    print("loaded lexical tokeniser")

    pos_start = pos_tokeniser.num_words + 1
    pos_end = pos_tokeniser.num_words + 2
    pos_mask = 1
    lex_start = lex_tokeniser.num_words + 1
    lex_end = lex_tokeniser.num_words + 2
    lex_mask = 1

    valid_inputs_count = 0
    with zopen(data_name + "_mlm_lex_inputs.gz", "wb") as lex_f, zopen(
            data_name + "_mlm_pos_inputs.gz", "wb") as pos_f, zopen(
            data_name + "_mlm_targets.gz", "wb") as tar_f:
        for index in indices:
            sentence = sentences_raw[index]
            if len(sentence) > max_sent_len:
                continue
            valid_inputs_count += 1

            _lex, _pos = zip(*sentence)
            _lex = list(_lex)
            _pos = list(_pos)
            _lex = lex_tokeniser.texts_to_sequences([_lex])[0]
            _pos = pos_tokeniser.texts_to_sequences([_pos])[0]

            mask = [random() for _ in _lex]
            inp_lx = []
            inp_ps = []

            chance = random() < 0.4
            label = ' '.join(str(tkn) for tkn in make_one_hot(int(chance), 3))
            if chance:
                inp_lx, inp_ps = zip(*sample(list(zip(_lex, _pos)), len(_lex)))
                inp_lx = list(inp_lx)
                inp_ps = list(inp_ps)
            else:
                for i, pr in enumerate(mask):
                    if pr > 0.15:
                        inp_lx.append(_lex[i])
                        inp_ps.append(_pos[i])
                    else:
                        inp_lx.append(lex_mask)
                        inp_ps.append(pos_mask)

            inp_lx = pad_list([lex_start] + inp_lx + [lex_end], seq_pad_len)  # list of tokens
            inp_lx = (pad_list([lex_start] + _lex, seq_pad_len), inp_lx)
            inp_ps = pad_list([pos_start] + inp_ps + [pos_end], seq_pad_len)
            inp_ps = (pad_list([pos_start] + _pos, seq_pad_len), inp_ps)

            tar_lx = pad_list(_lex + [lex_end], seq_pad_len)
            tar_ps = pad_list(_pos + [pos_end], seq_pad_len)

            write_ = delimitter.join(
                [' '.join([str(tkn) for tkn in turn]) for turn in inp_lx])
            # print(write_)
            lex_f.write((write_ + '\n').encode(encoding=encoding))

            write_ = delimitter.join(
                [' '.join([str(tkn) for tkn in turn]) for turn in inp_ps])
            # print(write_)
            pos_f.write((write_ + '\n').encode(encoding=encoding))

            write_ = delimitter.join([
                "0 1",
                ' '.join([str(tkn) for tkn in tar_lx]),
                ' '.join([str(tkn) for tkn in tar_ps]),
                label
            ])
            # print(write_)
            tar_f.write((write_ + '\n').encode(encoding=encoding))
    print(valid_inputs_count)
    return
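# A condensed sketch of the corruption step above, with hypothetical names: a 40%
# chance of shuffling the whole sentence (order-prediction negative) versus masking
# roughly 15% of positions with the mask id.
from random import random, sample

def corrupt(lex_tokens, pos_tokens, lex_mask=1, pos_mask=1):
    """Return (lex, pos, shuffled_flag) following the scheme in make_mlm_inputs."""
    if random() < 0.4:                          # shuffled negative example
        pairs = sample(list(zip(lex_tokens, pos_tokens)), len(lex_tokens))
        lex, pos = map(list, zip(*pairs))
        return lex, pos, True
    lex, pos = [], []
    for lx, ps in zip(lex_tokens, pos_tokens):  # MLM-style masking
        if random() > 0.15:
            lex.append(lx)
            pos.append(ps)
        else:
            lex.append(lex_mask)
            pos.append(pos_mask)
    return lex, pos, False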
if fsys:
    wildCard += 'fsys-'
wildCard += thismod[0] + '-Results.pkl.gz'

fl = sorted(glob(wildCard))
print fl

vrList = []
vsiniList = []
chiList = []
fwhmList = []
TList = []
gList = []
alphaList = []

for f in fl:
    fp = zopen(f, 'r')
    db = load(fp)
    fp.close()
    b = db['burn']
    if b < 1000:
        b = 1000
    vrList += [db['vr'][b:]]
    vsiniList += [db['vsini'][b:]]
    chiList += [db['chisq'][b:] / db['dof']]
    fwhmList += [db['FWHM'][b:] * 1000.]
    TList += [db['T'][b:]]
    gList += [db['logg'][b:]]
    alphaList += [db['alpha'][b:]]

clf()
boxplot(vrList, whis='range')
xticks(arange(7) + 1, ('32', '33', '34', '35', '36', '37', '38'))