Example 1
    def crop_scale_save(self, files, o_size, o_path):
        # requires: import utils; from multiprocessing import Process
        utils.assert_dir(o_path)  # create o_path if it does not exist yet

        # split the file list into four roughly equal chunks and crop/scale/save them in parallel
        splitter = utils.split_seq(files, 4)
        split_indices = splitter.get_indices()
        print(split_indices)
        p1 = Process(target=self.css_parallel, args=(files[0:split_indices[1]], o_size, o_path, 0))
        p2 = Process(target=self.css_parallel, args=(files[split_indices[1]:split_indices[2]], o_size, o_path, split_indices[1]))
        p3 = Process(target=self.css_parallel, args=(files[split_indices[2]:split_indices[3]], o_size, o_path, split_indices[2]))
        p4 = Process(target=self.css_parallel, args=(files[split_indices[3]:], o_size, o_path, split_indices[3]))

        p1.start()
        p2.start()
        p3.start()
        p4.start()

        p1.join()
        p2.join()
        p3.join()
        p4.join()
Example 2
    def Run_parallel(self, num_proc=1):
        # split the list of input paths into num_proc chunks and pre-process each chunk in its own process
        splitter = utils.split_seq(self.in_path, num_proc)
        index = splitter.get_indices()

        for i in range(len(index)):
            if i + 1 == len(index):
                paths = self.in_path[index[i]:]
            else:
                # stop at the start of the next chunk so no path is handled twice
                paths = self.in_path[index[i]:index[i + 1]]

            ctr = index[i]
            Process(target=self.pre_process, args=(paths, ctr)).start()
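
Examples 1 and 2 both rely on a utils.split_seq(items, n) helper whose get_indices() method returns the start offset of each of the n chunks. That module is not shown here, so the class below is only a minimal sketch of the assumed interface, not the actual utils implementation:

class split_seq:
    # hypothetical stand-in for utils.split_seq: n roughly equal chunks of a sequence
    def __init__(self, seq, n):
        self.seq = seq
        self.n = n

    def get_indices(self):
        # start offset of each chunk; element 0 is always 0
        size, rem = divmod(len(self.seq), self.n)
        indices, start = [], 0
        for i in range(self.n):
            indices.append(start)
            start += size + (1 if i < rem else 0)
        return indices

For a list of 10 files and n=4 this returns [0, 3, 6, 8], which matches the slicing pattern files[indices[0]:indices[1]], files[indices[1]:indices[2]], and so on used in both examples.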
Example 3
def run(D):
    print('in script:', os.path.basename(__file__))
    print('dict:')
    for k in D:
        if k == 'seq':
            print('  ', k, '=', D[k][:25])
        else:
            print('  ', k, '=', D[k])
    # for now, assume this is a sequence rather than a filename
    seq = D['seq']
    # '%3E' is the URL-encoded form of '>'
    if seq.startswith('>') or seq.startswith('%3E'):
        title, seq = ut.split_seq(seq)
    result = find_sites(seq)
    return result
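
In Example 3 (and again in Example 6 further down), ut.split_seq separates a FASTA-style header line from the sequence body and returns a (title, seq) pair. A minimal sketch of that behaviour, assuming single-record FASTA input (this is not the actual ut module):

def split_seq(data):
    # hypothetical stand-in: split single-record FASTA text into (title, sequence)
    lines = data.strip().splitlines()
    title = lines[0].lstrip('>').strip()
    seq = ''.join(line.strip() for line in lines[1:])
    return title, seq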
Example 4
import re

from Bio.Alphabet import generic_dna   # Biopython < 1.78; removed in later releases
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import GC


def generate_sequences(seqs, winlen, step, nfold):
    # build nfold shuffled background sequences per input record and collect
    # their GC content and lengths; split_seq and shuffle_window are local
    # helpers from the same module
    bg_gc_list = []
    bg_lengths = []
    for record in seqs:
        seq = str(record.seq)
        for n in range(nfold):
            new_sequence = ""
            for sequence in split_seq(seq):
                # chunks starting with N are kept unchanged, the rest are shuffled window by window
                if re.match("N", sequence):
                    new_sequence += sequence
                elif sequence:
                    new_sequence += shuffle_window(sequence, winlen, step)
            new_seq = SeqRecord(Seq(new_sequence, generic_dna),
                                id="background_seq_for_{}".format(record.name),
                                description="")
            print(new_seq.format("fasta"), end="")
            bg_gc_list.append(GC(new_sequence))
            bg_lengths.append(len(new_sequence))
    return bg_gc_list, bg_lengths
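
The re.match("N", sequence) test in Example 4 suggests that this split_seq breaks a DNA string into alternating runs of masked (N) and unmasked bases, so masked stretches pass through the shuffling untouched. A possible sketch under that assumption (hypothetical, not the project's own splitter):

import re

def split_seq(seq):
    # hypothetical splitter: alternating runs of N and non-N characters,
    # e.g. "ACGTNNNNACG" -> ["ACGT", "NNNN", "ACG"]
    return [chunk for chunk in re.split(r"(N+)", seq) if chunk]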
Example 5
def prepare_tf_record_data(tokenizer, max_seq_len, label2id, path, out_path):
    """
        生成训练数据, tf.record, 单标签分类模型, 随机打乱数据
    """

    def create_int_feature(values):
        f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
        return f

    writer = tf.io.TFRecordWriter(out_path)
    example_count = 0

    for line in open(path):
        if not line.strip():
            continue
        _ = json.loads(line.strip())
        len_ = len(_["text"])
        labels = ["O"] * len_
        for k, v in _["label"].items():
            for kk, vv in v.items():
                for vvv in vv:
                    span = vvv
                    s = span[0]
                    e = span[1] + 1
                    # print(s, e)
                    if e - s == 1:
                        labels[s] = "S_" + k
                    else:
                        labels[s] = "B_" + k
                        for i in range(s + 1, e - 1):
                            labels[i] = "M_" + k
                        labels[e - 1] = "E_" + k
            # print()
        # feature = process_one_example(tokenizer, label2id, row[column_name_x1], row[column_name_y],
        #                               max_seq_len=max_seq_len)
        xs, ys = split_seq(list(_["text"]), max_seq_len - 2, labels)

        for xx, yy in zip(xs, ys):
            feature = process_one_example(tokenizer, label2id, xx, yy, max_seq_len=max_seq_len)

            features = collections.OrderedDict()
            # sequence labeling task
            features["input_ids"] = create_int_feature(feature[0])
            features["input_mask"] = create_int_feature(feature[1])
            features["segment_ids"] = create_int_feature(feature[2])
            features["label_ids"] = create_int_feature(feature[3])
            if example_count < 5:
                print("*** Example ***")
                print(_["text"])
                print(_["label"])
                print("input_ids: %s" % " ".join([str(x) for x in feature[0]]))
                print("input_mask: %s" % " ".join([str(x) for x in feature[1]]))
                print("segment_ids: %s" % " ".join([str(x) for x in feature[2]]))
                print("label: %s " % " ".join([str(x) for x in feature[3]]))

            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
            example_count += 1

            # if example_count == 20:
            #     break
            if example_count % 3000 == 0:
                print(example_count)
    print("total example:", example_count)
    writer.close()
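
Example 5 calls split_seq(list(_["text"]), max_seq_len - 2, labels) and then zips the two returned lists, so this variant presumably cuts the character list and its label list into aligned windows of at most max_seq_len - 2 items (the - 2 most likely leaves room for the [CLS] and [SEP] tokens added during feature construction). A minimal sketch under that assumption, ignoring any sentence-boundary logic the real helper may use:

def split_seq(tokens, max_len, labels):
    # hypothetical chunker: cut tokens and labels into aligned pieces of at most max_len items
    xs, ys = [], []
    for start in range(0, len(tokens), max_len):
        xs.append(tokens[start:start + max_len])
        ys.append(labels[start:start + max_len])
    return xs, ys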
Example 6
def test():
    data = ut.load_data('SThemA.txt')
    title, seq = ut.split_seq(data)
    print(pretty_fmt(seq))