Example #1
# Requires: Biopython, scikit-learn, and NumPy; dump is assumed to come
# from cPickle. c_ORFscores, add_to_background, and get_data_parallel
# are ANGEL-internal helpers.
import sys
import numpy as np
from Bio import SeqIO
from sklearn.ensemble import AdaBoostClassifier
from cPickle import dump

def ANGLE_training(cds_filename, utr_filename, output_pickle, num_workers=3):
    coding = list(SeqIO.parse(cds_filename, 'fasta'))
    utr = list(SeqIO.parse(utr_filename, 'fasta'))

    o_all = c_ORFscores.CDSWindowFeat()
    add_to_background(o_all, coding)
    add_to_background(o_all, utr)

    data_pos = get_data_parallel(o_all, coding, [0], num_workers)
    data_neg = get_data_parallel(o_all, utr, [0, 1, 2], num_workers)

    data = data_neg + data_pos
    target = [0] * len(data_neg) + [1] * len(data_pos)
    data = np.array(data)

    print >> sys.stderr, "data prep done, running classifier...."
    bdt = AdaBoostClassifier(n_estimators=50)
    bdt.fit(data, target)

    print >> sys.stderr, "classifier trained. putting pickle to", output_pickle

    with open(output_pickle, 'wb') as f:  # pickles must be written in binary mode
        dump(bdt, f)

    return data, target, bdt
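
A minimal invocation sketch for the trainer above; the FASTA and pickle file names are hypothetical, and the reload simply checks that the classifier round-trips through the pickle.

# Hypothetical file names: train.cds.fa / train.utr.fa are FASTA inputs
# of coding and UTR sequences; classifier.pickle is the output path.
from cPickle import load

data, target, bdt = ANGLE_training('train.cds.fa', 'train.utr.fa',
                                   'classifier.pickle', num_workers=4)

with open('classifier.pickle', 'rb') as f:
    bdt2 = load(f)
print >> sys.stderr, "training-set accuracy:", bdt2.score(data, target)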
Example #2
# Same third-party imports as Example #1 (sys, numpy, Bio.SeqIO,
# sklearn's AdaBoostClassifier, cPickle's dump).
def ANGEL_training(cds_filename, utr_filename, output_pickle, num_workers=3):
    coding = list(SeqIO.parse(cds_filename, 'fasta'))
    utr = list(SeqIO.parse(utr_filename, 'fasta'))

    o_all = c_ORFscores.CDSWindowFeat()
    add_to_background(o_all, coding)
    add_to_background(o_all, utr)

    # Queue is very inefficient for passing large data between processes;
    # instead, break the records into chunks and combine the results.
    print >> sys.stderr, "running get_data_parallel for coding, chunk 0"
    data_pos = get_data_parallel(o_all, coding, [0], num_workers)
    data_neg = get_data_parallel(o_all, utr, [0, 1, 2], num_workers)
#    num_coding = len(coding)
#    data_pos = get_data_parallel(o_all, coding[:MAX_RECORD_CHUNK], [0], num_workers)
#    for i in xrange(1, num_coding/MAX_RECORD_CHUNK + (num_coding%MAX_RECORD_CHUNK>0)):
#        print >> sys.stderr, "running get_data_parallel for coding, chunk", i
#        data_pos += get_data_parallel(o_all, coding[i*MAX_RECORD_CHUNK:(i+1)*MAX_RECORD_CHUNK], [0], num_workers)
#
#    print >> sys.stderr, "running get_data_parallel for UTR, chunk 0"
#    num_utr = len(utr)
#    data_neg = get_data_parallel(o_all, utr[:MAX_RECORD_CHUNK], [0, 1, 2], num_workers)
#    for i in xrange(1, num_utr/MAX_RECORD_CHUNK + (num_utr%MAX_RECORD_CHUNK>0)):
#        print >> sys.stderr, "running get_data_parallel for UTR, chunk", i
#        data_neg += get_data_parallel(o_all, utr[i*MAX_RECORD_CHUNK:(i+1)*MAX_RECORD_CHUNK], [0, 1, 2], num_workers)

    print >> sys.stderr, "size of neg training data: {0}, pos training data: {1}".format(\
        len(data_neg), len(data_pos))

    print >> sys.stderr, "using first 10,000 training pos/neg only"
    data_neg = data_neg[:10000]
    data_pos = data_pos[:10000]
    data = data_neg + data_pos

    target = [0]*len(data_neg) + [1]*len(data_pos)
    data = np.array(data)

    print >> sys.stderr, "data prep done, running classifier...."
    bdt = AdaBoostClassifier(n_estimators=50)
    bdt.fit(data, target)

    print >> sys.stderr, "classifier trained. putting pickle to", output_pickle

    with open(output_pickle, 'wb') as f:
        dump({'bdt':bdt, 'o_all':o_all}, f)

    return data, target, bdt
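
Unlike Example #1, this version pickles a dict holding both the classifier and the background-frequency object, so downstream code has to unpack both. A small sketch, with a hypothetical file name:

from cPickle import load

with open('classifier.pickle', 'rb') as f:
    d = load(f)
bdt, o_all = d['bdt'], d['o_all']  # classifier plus background frequencies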
Example #3
# sp2 is assumed to be an alias for ANGEL's compiled c_ORFscores module;
# make_amino_scores, make_diamino_scores, and make_codon_scores are
# ANGEL-internal scoring helpers. step_size is accepted but unused: the
# windows always advance 3 nt / 1 aa per step.
def make_data_smart(seq, pseudo, window_size=96, step_size=3, frame_shift=0):
    seq = seq.upper()
    ss = str(seq[frame_shift:])
    a, b = len(ss) / 3, len(ss) % 3
    aa_seq_end = a * 3 + frame_shift if frame_shift <= b else frame_shift - 3
    aa = str(seq[frame_shift:aa_seq_end].translate())
    aa_window_size = window_size / 3
    n = len(ss)

    o = sp2.CDSWindowFeat()
    cur_a = aa[:aa_window_size]
    cur_s = ss[:window_size]
    o.calc_amino_count(cur_a)
    o.calc_diamino_count(cur_a, len(cur_a))
    o.calc_codon_count(cur_s, len(cur_s))
    aa_freq = o.get_amino_freq(pseudo, .0001)
    di_freq = o.get_diamino_freq(pseudo, .0001)
    codon_freq = o.get_codon_freq(pseudo, .0001)
    arr = make_amino_scores(aa_freq) + make_diamino_scores(
        di_freq, o.diamino_range) + make_codon_scores(aa_freq, codon_freq)
    data = [arr]
    for i in xrange(1, len(aa) - aa_window_size):
        # s advances by 3, now at ss[i*3:i*3+window_size]
        # a advances by 1, now at aa[i:i+aa_window_size]
        o.calc_amino_count(cur_a[0], -1)
        o.calc_amino_count(aa[i + aa_window_size - 1], 1)
        o.calc_codon_count(cur_s[:3], 3, -1)
        o.calc_codon_count(ss[(i - 1) * 3 + window_size:(i) * 3 + window_size],
                           3, 1)
        o.diamino_changed = {}  # must be cleared before the diamino counts are updated
        o.deduct_diamino_count(cur_a, len(cur_a), 1)
        cur_a = aa[i:i + aa_window_size]
        o.add_diamino_count(cur_a, len(cur_a), 1)
        cur_s = ss[i * 3:i * 3 + window_size]
        aa_freq = o.get_amino_freq(pseudo, .0001)
        di_freq = o.get_diamino_freq(pseudo, .0001, False)
        codon_freq = o.get_codon_freq(pseudo, .0001)
        arr = make_amino_scores(aa_freq) + make_diamino_scores(
            di_freq, o.diamino_range) + make_codon_scores(aa_freq, codon_freq)
        data.append(arr)
    return data
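
The incremental update above depends on the two windows staying in register: the nucleotide window advances 3 bases per step while the amino-acid window advances 1 residue. A standalone sketch of just that coordinate bookkeeping (a hypothetical helper, not part of ANGEL):

# Yields parallel (nt, aa) window coordinates: each step drops one codon
# on the left and adds one on the right, mirroring make_data_smart.
def window_coords(seq_len, window_size=96, frame_shift=0):
    aa_window = window_size / 3              # Py2 integer division
    n_aa = (seq_len - frame_shift) / 3       # residues available in frame
    for i in xrange(n_aa - aa_window + 1):
        yield (frame_shift + i * 3,                   # nt window start
               frame_shift + i * 3 + window_size,     # nt window end
               i, i + aa_window)                      # aa window bounds

# e.g. the first few windows for a 300 nt sequence, frame 0:
# (0, 96, 0, 32), (3, 99, 1, 33), (6, 102, 2, 34), ...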
Example #4
# Requires: Biopython and the multiprocessing module; load is assumed to
# come from cPickle. c_ORFscores, add_to_background, and
# ANGLE_predict_worker are ANGEL-internal helpers.
import os
import subprocess
import sys
import time
from multiprocessing import Process
from Bio import SeqIO
from cPickle import load

def distribute_ANGLE_predict(fasta_filename,
                             output_prefix,
                             bdt_pickle_filename,
                             num_workers=5,
                             min_ANGLE_aa_length=50,
                             min_dumb_aa_length=100,
                             use_rev_strand=True):
    tmpdir = "ANGLE.tmp." + str(int(time.time()))
    os.makedirs(tmpdir)

    print >> sys.stderr, "Reading classifer pickle:", bdt_pickle_filename
    with open(bdt_pickle_filename) as f:
        bdt = load(f)

    print >> sys.stderr, "Generating background frequencies...."
    o_all = c_ORFscores.CDSWindowFeat()
    add_to_background(o_all, SeqIO.parse(fasta_filename, 'fasta'))

    print >> sys.stderr, "Splitting input into chunks for parallelization...."
    handles = [
        open(os.path.join(tmpdir, output_prefix + '.split_' + str(i) + '.fa'),
             'w') for i in xrange(num_workers)
    ]
    total_seqs = 0
    for r in SeqIO.parse(fasta_filename, 'fasta'):
        total_seqs += 1
    num_seqs_per_worker = total_seqs / num_workers + 1  # Py2 integer division
    i = 0
    for r in SeqIO.parse(fasta_filename, 'fasta'):
        f = handles[i / num_seqs_per_worker]
        f.write(">{0}\n{1}\n".format(r.id, r.seq))
        i += 1
    for f in handles:
        f.close()

    list_of_fasta = [f.name for f in handles]

    # After the loop above, i == total_seqs; n spaces each worker's ORF
    # numbering, rounding the per-worker count down to a multiple of 10.
    n = ((i / num_workers + 1) / 10) * 10
    workers = []
    for i, input_fasta in enumerate(list_of_fasta):
        print >> sys.stderr, "Pool worker for", input_fasta
        starting_index = i * n + 1
        p = Process(target=ANGLE_predict_worker,
                    args=(input_fasta, input_fasta + '.ANGLE', bdt, o_all,
                          min_ANGLE_aa_length, min_dumb_aa_length,
                          use_rev_strand, starting_index))
        p.start()
        workers.append(p)

    for p in workers:
        print >> sys.stderr, "waiting for worker", p.name
        p.join()

    cmd = "cat {0} > {1}.ANGLE.cds".format(
        " ".join(x + '.ANGLE.cds' for x in list_of_fasta), output_prefix)
    if subprocess.check_call(cmd, shell=True) != 0:
        print >> sys.stderr, "Trouble running command", cmd
        sys.exit(-1)
    cmd = "cat {0} > {1}.ANGLE.pep".format(
        " ".join(x + '.ANGLE.pep' for x in list_of_fasta), output_prefix)
    if subprocess.check_call(cmd, shell=True) != 0:
        print >> sys.stderr, "Trouble running command", cmd
        sys.exit(-1)
    cmd = "cat {0} > {1}.ANGLE.utr".format(
        " ".join(x + '.ANGLE.utr' for x in list_of_fasta), output_prefix)
    if subprocess.check_call(cmd, shell=True) != 0:
        print >> sys.stderr, "Trouble running command", cmd
        sys.exit(-1)

    print >> sys.stderr, "Output written to {0}.ANGLE.cds, {0}.ANGLE.pep, {0}.ANGLE.utr".format(
        output_prefix)

    for x in list_of_fasta:
        os.remove(x)
        os.remove(x + '.ANGLE.cds')
        os.remove(x + '.ANGLE.pep')
        os.remove(x + '.ANGLE.utr')

    os.removedirs(tmpdir)
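
A driver sketch for the distributed predictor; the transcript FASTA and pickle names are hypothetical, and the pickle is assumed to hold a bare classifier as written by Example #1 (the function loads it itself).

# Hypothetical inputs: transcripts.fa is the FASTA to classify, and
# classifier.pickle holds a bare AdaBoostClassifier (Example #1 format).
distribute_ANGLE_predict('transcripts.fa', 'my_sample',
                         'classifier.pickle', num_workers=8,
                         min_ANGLE_aa_length=50, min_dumb_aa_length=100,
                         use_rev_strand=True)
# Results: my_sample.ANGLE.cds, my_sample.ANGLE.pep, my_sample.ANGLE.utr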