Esempio n. 1
0
def main():
    """Write a base FASTA and a neighborhood FASTA for every odd-indexed read.

    Loads the reads, the kt-mer headers dict and the creads dict from the
    hard-coded paths under ``prior``. For each odd read index ``i`` it writes
    ``<i>_base.fasta`` (the trimmed header plus the read) and, when
    ``get_special_1_deg_nhood`` returns a non-empty result,
    ``<i>_hood.fasta`` (the result joined with newlines) into ``out_fold``.

    Files are written straight into ``out_fold`` rather than into the working
    directory followed by a shell ``mv``; this also avoids trying to move a
    hood file that was never created.
    """
    prior = '/home/yu/max/research/'
    reads_file = prior + 'data/reads.20k.rc.fasta'
    creads_file = prior + 'data/temp_creads.outrx_27_6_rc_v2.out'
    ktmer_headers_file = prior + 'data/temp_ktmer_headersrx_27_6_rc_v2.out'

    hr, rr = rf.read_fasta(reads_file)
    headers = itec4.build_headers_dict(ktmer_headers_file)
    # The creads dict is built from the untrimmed headers; only afterwards
    # is each header cut down to its first whitespace-delimited token.
    creads = itec4.build_creads_dict(creads_file, hr, rr)
    for i, full_header in enumerate(hr):
        hr[i] = full_header.split()[0]

    out_fold = '/home/yu/max/research/6.8.15_nhoods/'
    if not os.path.exists(out_fold):
        os.makedirs(out_fold)

    # Odd indices only: 1, 3, 5, ...
    for i in range(1, len(hr), 2):
        print(i)
        header = hr[i]
        nhood = get_special_1_deg_nhood(header, creads, headers, hr, rr)

        base_path = os.path.join(out_fold, str(i) + '_base.fasta')
        with open(base_path, 'w') as f:
            f.write(header + '\n' + rr[i])

        # No hood file is produced for an empty neighborhood.
        if len(nhood) != 0:
            hood_path = os.path.join(out_fold, str(i) + '_hood.fasta')
            with open(hood_path, 'w') as f:
                f.write('\n'.join(nhood))
    return
Esempio n. 2
0
def main():
  """Write a base FASTA and a neighborhood FASTA for every odd-indexed read.

  Loads the reads, the kt-mer headers dict and the creads dict from the
  hard-coded paths under ``prior``. For each odd read index ``i`` it writes
  ``<i>_base.fasta`` (the trimmed header plus the read) and, when
  ``get_special_1_deg_nhood`` returns a non-empty result,
  ``<i>_hood.fasta`` (the result joined with newlines) into ``out_fold``.

  Files are written straight into ``out_fold`` rather than into the working
  directory followed by a shell ``mv``; this also avoids trying to move a
  hood file that was never created.
  """
  prior = '/home/yu/max/research/'
  reads_file = prior + 'data/reads.20k.rc.fasta'
  creads_file = prior + 'data/temp_creads.outrx_27_6_rc_v2.out'
  ktmer_headers_file = prior + 'data/temp_ktmer_headersrx_27_6_rc_v2.out'

  hr, rr = rf.read_fasta(reads_file)
  headers = itec4.build_headers_dict(ktmer_headers_file)
  # The creads dict is built from the untrimmed headers; only afterwards
  # is each header cut down to its first whitespace-delimited token.
  creads = itec4.build_creads_dict(creads_file, hr, rr)
  for i, full_header in enumerate(hr):
    hr[i] = full_header.split()[0]

  out_fold = '/home/yu/max/research/6.8.15_nhoods/'
  if not os.path.exists(out_fold):
    os.makedirs(out_fold)

  # Odd indices only: 1, 3, 5, ...
  for i in range(1, len(hr), 2):
    print(i)
    header = hr[i]
    nhood = get_special_1_deg_nhood(header, creads, headers, hr, rr)

    base_path = os.path.join(out_fold, str(i) + '_base.fasta')
    with open(base_path, 'w') as f:
      f.write(header + '\n' + rr[i])

    # No hood file is produced for an empty neighborhood.
    if len(nhood) != 0:
      hood_path = os.path.join(out_fold, str(i) + '_hood.fasta')
      with open(hood_path, 'w') as f:
        f.write('\n'.join(nhood))
  return
def convert_creads_to_nhoods(reads_file, creads_file, ktmer_headers_file):
  """Write every read's 1-degree neighborhood as a line of read indices.

  The output file is ``nhoods_<basename of creads_file>`` in the same
  directory as ``creads_file``. Line ``i`` holds the read index ``i``, a
  space, and the space-separated positions (within the reads file) of the
  headers returned by ``itec4.get_1_deg_nhood`` for read ``i``.

  Args:
    reads_file: path of the reads FASTA.
    creads_file: path of the creads file; also determines the output path.
    ktmer_headers_file: path of the kt-mer headers file.

  Raises:
    ValueError: if a neighbor header is not one of the read headers.
  """
  import os  # local import so the function does not depend on file-level imports

  # Derive the sibling path with os.path; the earlier hand-built version
  # always prepended '/', which sent relative paths to the filesystem root.
  out_file = os.path.join(os.path.dirname(creads_file),
                          'nhoods_' + os.path.basename(creads_file))
  creads = itec4.build_creads_dict(creads_file, reads_file)
  headers = itec4.build_headers_dict(ktmer_headers_file)
  hr, _ = rf.read_fasta(reads_file)
  for i, full_header in enumerate(hr):
    hr[i] = full_header.split()[0]

  # Header -> first position, built once so each neighbor lookup is a dict
  # access instead of a scan of the whole header list. setdefault keeps the
  # first occurrence, matching list.index on duplicated headers.
  index_of = {}
  for i, header in enumerate(hr):
    index_of.setdefault(header, i)

  lines = []
  for i, header in enumerate(hr):
    if i % 500 == 0:
      # Progress report every 500 reads.
      print('%s %s' % (i, datetime.datetime.now()))
    nh = itec4.get_1_deg_nhood(header, creads, headers)
    neighbor_indices = []
    for neighbor_header in nh:
      if neighbor_header not in index_of:
        raise ValueError('%r is not in list' % (neighbor_header,))
      neighbor_indices.append(str(index_of[neighbor_header]))
    # The space after the index is kept even when there are no neighbors.
    lines.append(str(i) + ' ' + ' '.join(neighbor_indices) + '\n')

  with open(out_file, 'w') as f:
    f.write(''.join(lines))

  return
def convert_creads_to_nhoods(reads_file, creads_file, ktmer_headers_file):
    """Write every read's 1-degree neighborhood as a line of read indices.

    The output file is ``nhoods_<basename of creads_file>`` in the same
    directory as ``creads_file``. Line ``i`` holds the read index ``i``, a
    space, and the space-separated positions (within the reads file) of the
    headers returned by ``itec4.get_1_deg_nhood`` for read ``i``.

    Args:
        reads_file: path of the reads FASTA.
        creads_file: path of the creads file; also determines the output path.
        ktmer_headers_file: path of the kt-mer headers file.

    Raises:
        ValueError: if a neighbor header is not one of the read headers.
    """
    import os  # local import so the function does not depend on file-level imports

    # Derive the sibling path with os.path; the earlier hand-built version
    # always prepended '/', which sent relative paths to the filesystem root.
    out_file = os.path.join(os.path.dirname(creads_file),
                            'nhoods_' + os.path.basename(creads_file))
    creads = itec4.build_creads_dict(creads_file, reads_file)
    headers = itec4.build_headers_dict(ktmer_headers_file)
    hr, _ = rf.read_fasta(reads_file)
    for i, full_header in enumerate(hr):
        hr[i] = full_header.split()[0]

    # Header -> first position, built once so each neighbor lookup is a dict
    # access instead of a scan of the whole header list. setdefault keeps the
    # first occurrence, matching list.index on duplicated headers.
    index_of = {}
    for i, header in enumerate(hr):
        index_of.setdefault(header, i)

    lines = []
    for i, header in enumerate(hr):
        if i % 500 == 0:
            # Progress report every 500 reads.
            print('%s %s' % (i, datetime.datetime.now()))
        nh = itec4.get_1_deg_nhood(header, creads, headers)
        neighbor_indices = []
        for neighbor_header in nh:
            if neighbor_header not in index_of:
                raise ValueError('%r is not in list' % (neighbor_header,))
            neighbor_indices.append(str(index_of[neighbor_header]))
        # The space after the index is kept even when there are no neighbors.
        lines.append(str(i) + ' ' + ' '.join(neighbor_indices) + '\n')

    with open(out_file, 'w') as f:
        f.write(''.join(lines))

    return
def main():
  """Error-correct the read whose name is given as the first CLI argument.

  The read header is '>' + sys.argv[1]. The function builds the creads and
  kt-mer headers dicts, loads the reads, and calls itec4.error_correct. When
  the returned consensus has length 0 it prints a failure message and exits
  with status 0.

  NOTE(review): the bare ``return`` right after that check makes everything
  below it unreachable, so the blasr alignment of the consensus and of the
  collected reads never runs as written. Confirm whether that tail is meant
  to stay disabled.
  """
  header = '>' + sys.argv[1]
  e_coli_genome = '/home/mshen/research/data/e_coli_genome.fasta'
  # ec_tool = '/home/mshen/research/bin/error_correction_1218.sh'
  # reads_file = '/home/mshen/research/data/PacBioCLR/PacBio_10kb_CLR_mapped_removed_homopolymers.fasta'
  # creads_file = '/home/mshen/research/data/22.4_creads.out'
  # ktmer_headers_file = '/home/mshen/research/data/22.4_ktmer_headers.out'
  blasr_exe = '/home/jeyuan/blasr/alignment/bin/blasr'
  blasr_options = '-bestn 1 -m 1'   # Concise output
  # Time-of-day part of the current timestamp; used as a suffix in the temp
  # file names below and passed to error_correct as temp_sig_out.
  temp_sig = str(datetime.datetime.now()).split()[1]

  # New dataset
  ec_tool = '/home/lin/program/error_correction_5X_0204.sh'
  # reads_file = '/home/mchaisso/datasets/pacbio_ecoli/reads.20k.fasta'
  reads_file = '/home/mshen/research/data/reads.20k.rc.fasta'
  # creads_file = '/home/mshen/research/data/22.8_creads_20k.out'
  # ktmer_headers_file = '/home/mshen/research/data/22.8_ktmer_headers_20k.out'
  creads_file = '/home/mshen/research/data/temp_creads.out_28_6_rc.out'
  ktmer_headers_file = '/home/mshen/research/data/temp_ktmer_headers_28_6_rc.out'

  creads = itec4.build_creads_dict(creads_file, reads_file)
  headers = itec4.build_headers_dict(ktmer_headers_file)
  hr, rr = rf.read_fasta(reads_file)
  # Compensate for new dataset: keep only the first whitespace-delimited
  # token of every read header.
  for i in range(len(hr)):
    hr[i] = hr[i].split()[0]

  con = itec4.error_correct(ec_tool, header, headers, creads, hr, rr, temp_sig_out = temp_sig)
  if len(con) == 0:
    print 'FAILURE IN ERROR CORRECTION'
    # NOTE(review): exits with status 0 even though this is the failure path.
    sys.exit(0)

  # Unconditional early exit: nothing below this line is ever executed.
  return

  # --- unreachable from here on ---
  # Write the consensus to a temp FASTA and run blasr on it against the genome.
  temp_file = 'temp_cfh_' + temp_sig + '.fasta'
  temp2_file = 'temp_cfh2_' + temp_sig + '.fasta'
  with open(temp_file, 'w') as f:
    f.write(header + '\n' + con)

  status = commands.getstatusoutput(blasr_exe + ' ' + temp_file + ' ' + e_coli_genome + ' ' + blasr_options)[1]
  if len(status) != 0:
    print status

  collected_h = set()
  ktmers = []
  # NOTE(review): this guard does nothing (its body is ``pass``); the loop
  # below indexes creads[header] regardless of the outcome.
  if header not in creads or len(creads[header]) == 1:
    pass
  # Entries at odd positions of creads[header] are collected as kt-mers.
  for i in range(len(creads[header])):
    if i % 2 == 1:
      ktmers.append(creads[header][i])
  # Gather every header listed under any of those kt-mers.
  for kt in ktmers:
    for h in headers[kt]:
      collected_h.add(h)

  to_con = []
  to_gen = []
  # For each collected read: run blasr against the consensus file and against
  # the genome, keeping the command output of each run. temp2_file is
  # overwritten on every iteration.
  for ch in collected_h:
    with open(temp2_file, 'w') as f:
      f.write(ch + '\n' + rr[hr.index(ch)])
    status = commands.getstatusoutput(blasr_exe + ' ' + temp2_file + ' ' + temp_file + ' ' + blasr_options)[1]
    to_con.append(status)
    status = commands.getstatusoutput(blasr_exe + ' ' + temp2_file + ' ' + e_coli_genome + ' ' + blasr_options)[1]
    to_gen.append(status)

  # Count the reads whose blasr run against the consensus produced any output.
  print sum([1 for s in to_con if len(s) > 0]), 'used in consensus out of', len(to_con)
  for tg in to_gen:
    print tg
# Finds data on the number of kt-mers per read

import sys
import itec4
import read_fasta as rf

reads_file = '/home/mshen/research/data/reads.20k.rc.fasta'
creads_file = '/home/mshen/research/data/20k_v2/temp_creads.outrx_27_6_rc_v2.out'
hr, rr = rf.read_fasta(reads_file)
# The creads dict is built from the untrimmed headers; afterwards every
# header is reduced to its first whitespace-delimited token.
creads_dict = itec4.build_creads_dict(creads_file, hr, rr)
for i, _full_header in enumerate(hr):
    hr[i] = _full_header.split()[0]

# Preview of the first (up to) 20 reads: header, kt-mer count, read length.
# The count is (entries - 1) // 2, the same formula used for `data` below;
# without the parentheses the subtraction bound to `1 / 2` instead.
for i in range(min(20, len(hr))):
    print('%s %s %s' % (hr[i], (len(creads_dict[hr[i]]) - 1) // 2, len(rr[i])))

# The script stops here; the full statistics pass below is currently disabled.
sys.exit(0)

# --- unreachable while the sys.exit(0) above is in place ---
# kt-mer count for every read in the creads dict, then the mean.
data = []
for i in range(len(creads_dict.keys())):
    k = creads_dict.keys()[i]
    data.append((len(creads_dict[k]) - 1) // 2)
    if i % 5000 == 0:
        print(i)

print('\n'.join([str(s) for s in data]))
print(float(sum(data)) / float(len(data)))
Esempio n. 7
0
import itec4

# Input locations, all under the research root.
prior = '/home/mshen/research/'
reads_fn = prior + 'data/reads.20k.rc.fasta'
creads_fn = prior + 'data/20k_v2/temp_creads.outrx_27_6_rc_v2.out'
ktmer_headers_fn = prior + 'data/20k_v2/temp_ktmer_headersrx_27_6_rc_v2.out'
genome_fn = prior + 'data/ecoli_consensus_mark.fasta'

# If the current best score is less than bt_fraction * best last score, backtrack.
bt_fraction = 0.01
forget_cutoff = 50000
traversed = []
# score_history[0] is the score history, score_history[1] the num_candidates history.
score_history = ([0], [0])

# Data shared by the functions below. The creads dict is built while the
# headers are still untrimmed; each header is then cut to its first token.
hr, rr = ml.read_fasta(reads_fn)
headers = itec4.build_headers_dict(ktmer_headers_fn)
creads = itec4.build_creads_dict(creads_fn, hr, rr)
for i, _full_header in enumerate(hr):
  hr[i] = _full_header.split()[0]

def main():
  """Print the three input file paths, then run afa on them."""
  banner = ['Reads File:', reads_fn,
            '\ncreads File:', creads_fn,
            '\nktmer Headers File:', ktmer_headers_fn]
  # Joining with single spaces yields the same text a comma-separated print
  # emits for these string items.
  print(' '.join(banner))

  afa(reads_fn, ktmer_headers_fn, creads_fn)

def afa(reads_fn, ktmer_headers_fn, creads_fn):
  # NOTE(review): only the opening statements of this function are visible
  # here; none of the three parameters is used in this part.
  # Read the genome file named by the module-level genome_fn and keep the
  # first element of the second returned value (presumably the first
  # sequence -- confirm against ml.read_fasta).
  gh, gr = ml.read_fasta(genome_fn)
  gr = gr[0]

  # Keys of the module-level kt-mer headers dict.
  ktmers = headers.keys()
Esempio n. 8
0
# Input locations, all under the research root.
prior = '/home/mshen/research/'
reads_fn = prior + 'data/reads.20k.rc.fasta'
creads_fn = prior + 'data/20k_v2/temp_creads.outrx_27_6_rc_v2.out'
ktmer_headers_fn = prior + 'data/20k_v2/temp_ktmer_headersrx_27_6_rc_v2.out'
genome_fn = prior + 'data/ecoli_consensus_mark.fasta'

# If the current best score is less than bt_fraction * best last score, backtrack.
bt_fraction = 0.01
forget_cutoff = 50000
traversed = []
# score_history[0] is the score history, score_history[1] the num_candidates history.
score_history = ([0], [0])

# Data shared by the functions below. The creads dict is built while the
# headers are still untrimmed; each header is then cut to its first token.
hr, rr = ml.read_fasta(reads_fn)
headers = itec4.build_headers_dict(ktmer_headers_fn)
creads = itec4.build_creads_dict(creads_fn, hr, rr)
for i, _full_header in enumerate(hr):
    hr[i] = _full_header.split()[0]


def main():
    """Print the three input file paths, then run afa on them."""
    banner = ['Reads File:', reads_fn,
              '\ncreads File:', creads_fn,
              '\nktmer Headers File:', ktmer_headers_fn]
    # Joining with single spaces yields the same text a comma-separated print
    # emits for these string items.
    print(' '.join(banner))

    afa(reads_fn, ktmer_headers_fn, creads_fn)


def afa(reads_fn, ktmer_headers_fn, creads_fn):
    # NOTE(review): only the opening statements of this function are visible
    # here; none of the three parameters is used in this part.
    # Read the genome file named by the module-level genome_fn and keep the
    # first element of the second returned value (presumably the first
    # sequence -- confirm against ml.read_fasta).
    gh, gr = ml.read_fasta(genome_fn)
    gr = gr[0]
def main():
    """Error-correct the read whose name is given as the first CLI argument.

    The read header is '>' + sys.argv[1]. The function builds the creads and
    kt-mer headers dicts, loads the reads, and calls itec4.error_correct.
    When the returned consensus has length 0 it prints a failure message and
    exits with status 0.

    NOTE(review): the bare ``return`` right after that check makes everything
    below it unreachable, so the blasr alignment of the consensus and of the
    collected reads never runs as written. Confirm whether that tail is meant
    to stay disabled.
    """
    header = '>' + sys.argv[1]
    e_coli_genome = '/home/mshen/research/data/e_coli_genome.fasta'
    # ec_tool = '/home/mshen/research/bin/error_correction_1218.sh'
    # reads_file = '/home/mshen/research/data/PacBioCLR/PacBio_10kb_CLR_mapped_removed_homopolymers.fasta'
    # creads_file = '/home/mshen/research/data/22.4_creads.out'
    # ktmer_headers_file = '/home/mshen/research/data/22.4_ktmer_headers.out'
    blasr_exe = '/home/jeyuan/blasr/alignment/bin/blasr'
    blasr_options = '-bestn 1 -m 1'  # Concise output
    # Time-of-day part of the current timestamp; used as a suffix in the temp
    # file names below and passed to error_correct as temp_sig_out.
    temp_sig = str(datetime.datetime.now()).split()[1]

    # New dataset
    ec_tool = '/home/lin/program/error_correction_5X_0204.sh'
    # reads_file = '/home/mchaisso/datasets/pacbio_ecoli/reads.20k.fasta'
    reads_file = '/home/mshen/research/data/reads.20k.rc.fasta'
    # creads_file = '/home/mshen/research/data/22.8_creads_20k.out'
    # ktmer_headers_file = '/home/mshen/research/data/22.8_ktmer_headers_20k.out'
    creads_file = '/home/mshen/research/data/temp_creads.out_28_6_rc.out'
    ktmer_headers_file = '/home/mshen/research/data/temp_ktmer_headers_28_6_rc.out'

    creads = itec4.build_creads_dict(creads_file, reads_file)
    headers = itec4.build_headers_dict(ktmer_headers_file)
    hr, rr = rf.read_fasta(reads_file)
    # Compensate for new dataset: keep only the first whitespace-delimited
    # token of every read header.
    for i in range(len(hr)):
        hr[i] = hr[i].split()[0]

    con = itec4.error_correct(ec_tool,
                              header,
                              headers,
                              creads,
                              hr,
                              rr,
                              temp_sig_out=temp_sig)
    if len(con) == 0:
        print 'FAILURE IN ERROR CORRECTION'
        # NOTE(review): exits with status 0 even though this is the failure path.
        sys.exit(0)

    # Unconditional early exit: nothing below this line is ever executed.
    return

    # --- unreachable from here on ---
    # Write the consensus to a temp FASTA and run blasr on it against the genome.
    temp_file = 'temp_cfh_' + temp_sig + '.fasta'
    temp2_file = 'temp_cfh2_' + temp_sig + '.fasta'
    with open(temp_file, 'w') as f:
        f.write(header + '\n' + con)

    status = commands.getstatusoutput(blasr_exe + ' ' + temp_file + ' ' +
                                      e_coli_genome + ' ' + blasr_options)[1]
    if len(status) != 0:
        print status

    collected_h = set()
    ktmers = []
    # NOTE(review): this guard does nothing (its body is ``pass``); the loop
    # below indexes creads[header] regardless of the outcome.
    if header not in creads or len(creads[header]) == 1:
        pass
    # Entries at odd positions of creads[header] are collected as kt-mers.
    for i in range(len(creads[header])):
        if i % 2 == 1:
            ktmers.append(creads[header][i])
    # Gather every header listed under any of those kt-mers.
    for kt in ktmers:
        for h in headers[kt]:
            collected_h.add(h)

    to_con = []
    to_gen = []
    # For each collected read: run blasr against the consensus file and
    # against the genome, keeping the command output of each run. temp2_file
    # is overwritten on every iteration.
    for ch in collected_h:
        with open(temp2_file, 'w') as f:
            f.write(ch + '\n' + rr[hr.index(ch)])
        status = commands.getstatusoutput(blasr_exe + ' ' + temp2_file + ' ' +
                                          temp_file + ' ' + blasr_options)[1]
        to_con.append(status)
        status = commands.getstatusoutput(blasr_exe + ' ' + temp2_file + ' ' +
                                          e_coli_genome + ' ' +
                                          blasr_options)[1]
        to_gen.append(status)

    # Count the reads whose blasr run against the consensus produced any output.
    print sum([1 for s in to_con
               if len(s) > 0]), 'used in consensus out of', len(to_con)
    for tg in to_gen:
        print tg