# Requires: from multiprocessing import Process; utils is a project-local
# module (its split_seq helper is sketched after Run_parallel below).
def crop_scale_save(self, files, o_size, o_path):
    utils.assert_dir(o_path)  # create o_path if it does not already exist
    # Split the file list into four roughly equal chunks, one per process.
    splitter = utils.split_seq(files, 4)
    split_indices = splitter.get_indices()
    print(split_indices)
    # Each worker gets its slice plus the global offset of that slice.
    p1 = Process(target=self.css_parallel, args=(files[0:split_indices[1]], o_size, o_path, 0))
    p2 = Process(target=self.css_parallel, args=(files[split_indices[1]:split_indices[2]], o_size, o_path, split_indices[1]))
    p3 = Process(target=self.css_parallel, args=(files[split_indices[2]:split_indices[3]], o_size, o_path, split_indices[2]))
    p4 = Process(target=self.css_parallel, args=(files[split_indices[3]:], o_size, o_path, split_indices[3]))
    for p in (p1, p2, p3, p4):
        p.start()
    for p in (p1, p2, p3, p4):
        p.join()  # wait for all four workers to finish

# Requires: from multiprocessing import Process.
def Run_parallel(self, num_proc=1):
    # Split self.in_path into num_proc chunks and hand each to a worker process.
    splitter = utils.split_seq(self.in_path, num_proc)
    index = splitter.get_indices()
    procs = []
    for i in range(len(index)):
        if i + 1 == len(index):
            paths = self.in_path[index[i]:len(self.in_path)]  # last chunk runs to the end
        else:
            paths = self.in_path[index[i]:index[i + 1]]
        ctr = index[i]  # global offset of this chunk
        p = Process(target=self.pre_process, args=(paths, ctr))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()  # wait for all workers before returning

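# Both parallel helpers above depend on a project-local utils.split_seq(seq, n)
# object exposing get_indices(). Its source is not shown; this is a minimal
# sketch of the assumed behavior (return the start offsets of n roughly equal
# chunks) -- the name and semantics are inferred, not confirmed:
class split_seq:
    def __init__(self, seq, n):
        self.seq = seq
        self.n = n

    def get_indices(self):
        # Start offset of each of the n chunks, e.g. 20 items / 4 -> [0, 5, 10, 15];
        # any remainder lands in the final chunk.
        size = max(1, len(self.seq) // self.n)
        return [i * size for i in range(self.n)]
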
import os

def run(D):
    print('in script: ', os.path.basename(__file__))
    print('dict:')
    for k in D:
        if k == 'seq':
            print('  ', k, '=', D[k][:25])  # only preview long sequences
        else:
            print('  ', k, '=', D[k])
    # For now, assume 'seq' holds a raw sequence rather than a filename.
    seq = D['seq']
    # Strip a FASTA header if present ('%3E' is a URL-encoded '>').
    if seq.startswith('>') or seq.startswith('%3E'):
        title, seq = ut.split_seq(seq)
    result = find_sites(seq)
    return result

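# run() above (and test() below) calls ut.split_seq, a FASTA-flavored helper
# that separates the header line from the sequence body. Its source is not
# shown; a minimal sketch of the assumed behavior:
def split_seq(fasta_text):
    lines = fasta_text.strip().splitlines()
    header = lines[0]
    if header.startswith('%3E'):  # URL-encoded '>'
        header = header[3:]
    title = header.lstrip('>')    # header text without the leading '>'
    seq = ''.join(lines[1:])      # concatenated sequence lines
    return title, seq
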
import re
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import GC
from Bio.Alphabet import generic_dna  # Bio.Alphabet was removed in Biopython >= 1.78

def generate_sequences(seqs, winlen, step, nfold):
    """Emit nfold shuffled background sequences per input record as FASTA on
    stdout; return the GC contents and lengths of the generated sequences."""
    bg_gc_list = []
    bg_lengths = []
    for record in seqs:
        seq = str(record.seq)
        for n in range(0, nfold):
            new_sequence = ""
            for sequence in split_seq(seq):
                if re.match("N", sequence):
                    # Keep runs of N untouched; only real sequence gets shuffled.
                    new_sequence += sequence
                elif sequence:
                    new_sequence += shuffle_window(sequence, winlen, step)
            new_seq = SeqRecord(Seq(new_sequence, generic_dna),
                                id="background_seq_for_{}".format(record.name),
                                description="")
            print(new_seq.format("fasta"), end="")
            bg_gc_list.append(GC(new_sequence))
            bg_lengths.append(len(new_sequence))
    return bg_gc_list, bg_lengths

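# generate_sequences() assumes a split_seq(seq) that partitions a sequence into
# alternating runs of 'N' and non-'N' segments, so that N-blocks pass through
# verbatim while real sequence is window-shuffled. A sketch of that assumed
# behavior, using a capturing regex split so the N-runs are kept in the output:
import re

def split_seq(seq):
    # e.g. "ACGTNNNACG" -> ["ACGT", "NNN", "ACG"]; drop empty edge segments.
    return [s for s in re.split(r'(N+)', seq) if s]
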
import collections
import json
import tensorflow as tf

def prepare_tf_record_data(tokenizer, max_seq_len, label2id, path, out_path):
    """
    Generate training data (tf.record) for a single-label sequence labeling
    model, shuffling the data randomly.
    """
    def create_int_feature(values):
        f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
        return f

    writer = tf.io.TFRecordWriter(out_path)
    example_count = 0
    for line in open(path):
        if not line.strip():
            continue
        _ = json.loads(line.strip())
        len_ = len(_["text"])
        # Build character-level BMES tags from the span annotations.
        labels = ["O"] * len_
        for k, v in _["label"].items():
            for kk, vv in v.items():
                for vvv in vv:
                    span = vvv
                    s = span[0]
                    e = span[1] + 1
                    if e - s == 1:
                        labels[s] = "S_" + k  # single-character entity
                    else:
                        labels[s] = "B_" + k
                        for i in range(s + 1, e - 1):
                            labels[i] = "M_" + k
                        labels[e - 1] = "E_" + k
        # Window over-long examples so each chunk fits in max_seq_len - 2.
        xs, ys = split_seq(list(_["text"]), max_seq_len - 2, labels)
        for xx, yy in zip(xs, ys):
            feature = process_one_example(tokenizer, label2id, xx, yy,
                                          max_seq_len=max_seq_len)
            features = collections.OrderedDict()
            # Sequence labeling task: ids, mask, segments, per-token labels.
            features["input_ids"] = create_int_feature(feature[0])
            features["input_mask"] = create_int_feature(feature[1])
            features["segment_ids"] = create_int_feature(feature[2])
            features["label_ids"] = create_int_feature(feature[3])
            if example_count < 5:
                print("*** Example ***")
                print(_["text"])
                print(_["label"])
                print("input_ids: %s" % " ".join([str(x) for x in feature[0]]))
                print("input_mask: %s" % " ".join([str(x) for x in feature[1]]))
                print("segment_ids: %s" % " ".join([str(x) for x in feature[2]]))
                print("label: %s " % " ".join([str(x) for x in feature[3]]))
            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
            example_count += 1
            if example_count % 3000 == 0:
                print(example_count)
    print("total example:", example_count)
    writer.close()

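# prepare_tf_record_data() relies on a split_seq(chars, max_len, labels) that
# windows an over-long example into aligned (text, label) chunks, presumably so
# each chunk fits in max_seq_len - 2 (leaving room for [CLS] and [SEP]). A
# minimal sketch of that assumed chunker; the real helper would likely avoid
# cutting inside an entity span:
def split_seq(chars, max_len, labels):
    xs, ys = [], []
    for i in range(0, len(chars), max_len):
        xs.append(chars[i:i + max_len])
        ys.append(labels[i:i + max_len])
    return xs, ys
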
def test():
    data = ut.load_data('SThemA.txt')
    title, seq = ut.split_seq(data)  # separate the FASTA header from the sequence
    print(pretty_fmt(seq))

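# pretty_fmt() is not shown. Assuming it wraps the sequence at a fixed width
# (60 columns is the conventional FASTA line length), a minimal sketch:
def pretty_fmt(seq, width=60):
    return '\n'.join(seq[i:i + width] for i in range(0, len(seq), width))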