Example #1
def split(inp_fn, out_nm):
    inp_fn_numlines = util.line_count(inp_fn)

    num_splits = 60
    split_size = int(inp_fn_numlines / num_splits)
    if num_splits * split_size < inp_fn_numlines:
        split_size += 1
    while split_size % 4 != 0:
        split_size += 1
    # print 'Using split size %s' % (split_size)

    split_num = 0
    timer = util.Timer(total=num_splits)
    for idx in range(1, inp_fn_numlines, split_size):
        start = idx
        end = start + split_size
        out_fn = out_dir + out_nm + '_%s.fq' % (split_num)

        skip = False
        if os.path.isfile(out_fn):
            size_mb = os.path.getsize(out_fn) / 1e6
            if size_mb > 0:
                skip = True

        if not skip:
            command = 'tail -n +%s %s | head -n %s > %s' % (
                start, inp_fn, end - start, out_fn)
            subprocess.check_output(command, shell=True)

        split_num += 1
        # print(command)
        timer.update()

    return
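The splitter above rounds each chunk up to a multiple of 4 lines so that no 4-line FASTQ record is cut in half, then extracts each chunk with tail -n +START | head -n COUNT. As a rough illustration only, here is a minimal pure-Python sketch of the same idea (the split_fastq name and file paths are hypothetical, not part of the original pipeline):

def split_fastq(inp_fn, out_prefix, num_splits=60):
    # Count lines, then compute a chunk size that is a multiple of 4
    # (one FASTQ record = 4 lines), mirroring the logic above.
    with open(inp_fn) as f:
        num_lines = sum(1 for _ in f)
    split_size = -(-num_lines // num_splits)   # ceiling division
    split_size += (-split_size) % 4            # round up to a multiple of 4

    with open(inp_fn) as f:
        for split_num in range(num_splits):
            chunk = [line for _, line in zip(range(split_size), f)]
            if not chunk:
                break
            with open('%s_%s.fq' % (out_prefix, split_num), 'w') as out:
                out.writelines(chunk)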
Example #2
def split(inp_fn, out_nm):
    #print inp_fn
    inp_fn_numlines = util.line_count(inp_fn)

    #print out_nm

    #print inp_fn
    num_splits = 15
    split_size = int(inp_fn_numlines / num_splits)
    if num_splits * split_size < inp_fn_numlines:
        split_size += 1
    while split_size % 4 != 0:
        split_size += 1
    #print 'Using split size %s' % (split_size)

    split_num = 0
    for idx in range(1, inp_fn_numlines, split_size):
        start = idx
        end = start + split_size
        out_fn = out_dir + out_nm + '_%s.fastq' % (split_num)
        command = 'tail -n +%s %s | head -n %s > %s' % (start, inp_fn,
                                                        end - start, out_fn)
        split_num += 1
        print(command)

    return
def predict(inp_fn):
    # Calculate statistics on df, saving to alldf_dict
    # Deletion positions

    _predict2.init_model(run_iter='aay', param_iter='aae')
    df_buffer = init_df_buffer()
    df_buffer_nm = ''

    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 2 == 0:
                header = line.strip()
                if df_buffer_nm == '':
                    df_buffer_nm = header

            if i % 2 == 1:
                sequence = line.strip()
                if len(sequence) < 60:
                    continue
                df_buffer = add_del_profiles(header, sequence, df_buffer)

                print(len(df_buffer))
                if len(df_buffer) > 100000:
                    flush_df_buffer(df_buffer, df_buffer_nm)
                    df_buffer_nm = ''
                    df_buffer = init_df_buffer()
            timer.update()
    return
Example #4
def demultiplex(split):
  inp_fn = inp_dir + '%s.fq' % (split)
  for name in list(exp_design['Name']) + ['other']:
    util.ensure_dir_exists(out_dir + name)
    util.exists_empty_fn(out_dir + name + '/%s.fa' % (split))

  lc = util.line_count(inp_fn)
  num_bad_q, num_tot = 0, 0
  timer = util.Timer(total = lc)
  with open(inp_fn) as f:
    for i, line in enumerate(f):
      if i % 4 == 0:
        header = line.strip()
      if i % 4 == 1:
        read = line.strip()
      if i % 4 == 3:
        num_tot += 1
        qs = line.strip()
        quals = [ord(s)-33 for s in qs]
        if np.mean(quals) < 30:
          num_bad_q += 1
          continue

        demultiplex_id, trimmed_read = match(read, header)
        
        out_fn = out_dir + '%s/%s.fa' % (demultiplex_id, split)
        with open(out_fn, 'a') as out_f:  # avoid shadowing the input handle f
          out_f.write('>' + header[1:] + '\n' + trimmed_read + '\n')
      
      timer.update()

  print('Rejected %s fraction of reads' % (num_bad_q / num_tot))

  return
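The quality filter in demultiplex decodes the FASTQ quality string as Phred+33 (score = ord(char) - 33) and rejects reads whose mean score is below 30. A self-contained sketch of just that check (the helper name is made up for illustration):

import numpy as np

def mean_phred_quality(quality_string):
    # Phred+33 encoding: ASCII code minus 33 gives the quality score.
    return np.mean([ord(c) - 33 for c in quality_string])

# 'I' is ASCII 73, so 'IIII' decodes to four Q40 bases and passes a Q30 cutoff.
assert mean_phred_quality('IIII') == 40.0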
Example #5
def split_by_lines(inp_dir):
    # Splits a folder into groups by lines within each file
    # Used for scripts that operate line-by-line on all files

    for fn in os.listdir(inp_dir):
        if fnmatch.fnmatch(fn, _parallel_config.REGEX_FILTER):
            nl = util.line_count(inp_dir + fn)
            # Integer division keeps the sed line numbers as ints
            jump = nl // _parallel_config.SPLITS
            jump = (jump // _parallel_config.LINES_DIVISOR
                    ) * _parallel_config.LINES_DIVISOR

            for i in range(_parallel_config.SPLITS):
                if i < _parallel_config.SPLITS - 1:
                    arg = str(jump *  i + 1) + ',' + \
                    str(jump * (i + 1)) + 'p;' + \
                    str(jump * (i + 1) + 1) + 'q'
                else:
                    arg = str(jump *  i + 1) + ',' + \
                    str(nl) + 'p'

                # sed grabs a range of lines in a file
                subprocess.call('sed -n \'' + arg + '\' ' + inp_dir + \
                  fn + ' > ' + inp_dir + 'split' + str(i) + '/' + fn,
                  shell = True)
    return
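The sed -n argument built above has the form 'START,ENDp;NEXTq': print lines START through END, then quit at line END+1 so sed does not scan the rest of the file. A small illustration of the string it produces (values are made up):

jump, i = 100, 2
arg = str(jump * i + 1) + ',' + str(jump * (i + 1)) + 'p;' + str(jump * (i + 1) + 1) + 'q'
print(arg)  # 201,300p;301q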
Example #6
def call_mutations(nm):
    inp_fn = inp_dir + f'{nm}.sam'

    mut_dd = defaultdict(list)
    n_d = defaultdict(lambda: 0)
    n_d2 = defaultdict(list)
    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            timer.update()

            if line[0] == '@':
                continue

            w = line.split()
            sam = {
                'read_nm': w[0],
                'target': w[2],
                '1-based pos': int(w[3]),
                'cigar': w[5],
                'seq': w[9],
            }

            if sam['target'] != 'SP055-rpoZ-cMyc-Cry1Ac1-d123':
                continue

            if sam['cigar'] == '*':
                continue

            # Call mutation and Track total readcount per position
            add_mutations(mut_dd, n_d, n_d2, sam)

    mut_df = pd.DataFrame(mut_dd)
    mut_df.to_csv(out_dir + f'{nm}.csv')

    n_dd = defaultdict(list)
    for pos in range(len(ref)):
        n_dd['Position (0 based)'].append(pos)
        n_dd['Read count'].append(n_d[pos])

    n_df = pd.DataFrame(n_dd)
    n_df.to_csv(out_dir + f'{nm}_readcounts.csv')

    ndf2 = pd.DataFrame(n_d2)
    ndf2.to_csv(out_dir + f'{nm}_read_idxs.csv')
    '''
    Important note on ndf2:
    - Many read pairs sequence the same molecule. Mutations observed on both
      reads of a pair are combined, and overlapping paired reads are expected
      to be merged as well. This is done in ill_b2_merge_n_paired_reads.py.
    '''

    return
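For reference, the indices pulled out of each SAM line above correspond to the standard SAM columns: w[0] is QNAME (read name), w[2] RNAME (target), w[3] POS (1-based), w[5] CIGAR, w[9] SEQ, and w[10] QUAL. A toy line shows the mapping (the record itself is invented):

toy_sam_line = 'read1\t0\ttargetA\t17\t60\t10M\t*\t0\t0\tACGTACGTAC\tIIIIIIIIII'
w = toy_sam_line.split()
sam = {
    'read_nm': w[0],           # QNAME
    'target': w[2],            # RNAME
    '1-based pos': int(w[3]),  # POS
    'cigar': w[5],             # CIGAR
    'seq': w[9],               # SEQ
}
print(sam['target'], sam['1-based pos'])  # targetA 17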
def run_align_needleman_wunsch(srr, nm):
    inp_fn = inp_dir + f'{srr}.fastq'
    genome_fn = inp_dir + 'SP055-rpoZ-cMyc-Cry1Ac1-d123.fa'

    target = open(genome_fn).readlines()[1].strip()

    seq_align_tool = '/ahg/regevdata/projects/CRISPR-libraries/tools/seq-align/bin/needleman_wunsch'

    out_fn = out_dir + f'{nm}.fa'
    with open(out_fn, 'w') as f:
        pass

    alignment_buffer = []

    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 4 == 0:
                header = line.strip()
                read_nm = header.split()[0].replace('@', '')
            if i % 4 == 1:
                read = line.strip()
            if i % 4 == 3:
                qs = [ord(s) - 33 for s in line.strip()]
                if np.mean(qs) >= 30:

                    read = compbio.reverse_complement(read)

                    command = f'{seq_align_tool} --match 1 --mismatch -1 --gapopen -5 --gapextend -1 --freestartgap --freeendgap {read} {target}'
                    align = subprocess.check_output(command,
                                                    shell=True).decode('utf-8')
                    align = align[:-2]

                    alignment_buffer.append(f'>{read_nm}\n{align}\n')

                    if len(alignment_buffer) > 100:
                        print(f'Dumping alignment buffer...')
                        with open(out_fn, 'a') as f:
                            for item in alignment_buffer:
                                f.write(item)
                        alignment_buffer = []

            timer.update()

    print(f'Dumping alignment buffer...')
    with open(out_fn, 'a') as f:
        for item in alignment_buffer:
            f.write(item)
    alignment_buffer = []

    return
def convert_sam_to_text(ref, sample_id):
    inp_fn = inp_dir + f'{ref}/{sample_id}.sam'

    ref_fn = _config.DATA_DIR + f'{ref}.fa'
    ref_seq = open(ref_fn).readlines()[-1].strip()

    # Parse SAM
    mut_dd = defaultdict(list)
    nd = {idx: 0 for idx in range(len(ref_seq))}
    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            timer.update()

            if line[0] == '@':
                continue

            w = line.split()
            sam = {
                'read_nm': w[0],
                'target': w[2],
                '1-based pos': int(w[3]),
                'cigar': w[5],
                'seq': w[9],
                'qs': w[10],
            }

            if sam['cigar'] == '*':
                continue

            get_alignment(mut_dd, sam, nd, ref_seq)

    mut_df = pd.DataFrame(mut_dd)
    ref_out_dir = out_dir + f'{ref}/'
    util.ensure_dir_exists(ref_out_dir)
    mut_df.to_csv(ref_out_dir + f'{sample_id}.csv')

    ndd = {
        'Position (0 based)': list(nd.keys()),
        'Read count': list(nd.values()),
    }
    ndf = pd.DataFrame(ndd)
    ndf.to_csv(ref_out_dir + f'n_{sample_id}.csv')

    return
def convert_alignment(srr_id, out_dir):
    print(srr_id)
    if srr_id not in _config.d.RUNS_SET:
        return 'Bad srr_id %s' % (srr_id)
    sam_fn = _config.d.sam_fn(srr_id)
    genome_build, exp_chrm, exp_pos = get_expected_chrm_pos(srr_id)

    num_aligns, num_distant = 0, 0
    align_collection = defaultdict(lambda: 0)

    timer = util.Timer(total=util.line_count(sam_fn))
    with open(sam_fn) as f:
        for i, line in enumerate(f):
            if not line.startswith('@'):
                num_aligns += 1
                chrm = line.split()[2]
                start = int(line.split()[3])
                cigar = line.split()[5]
                read = line.split()[9]

                if abs(exp_pos - start) > 1000:
                    num_distant += 1
                    continue

                align_len = get_align_len(read, cigar)
                genome = query_genome(genome_build, chrm, start, align_len,
                                      srr_id)
                align = construct_align(read, genome, cigar, start)
                align_collection[align] += 1
            timer.update()

    sorted_aligns = sorted(align_collection,
                           key=align_collection.get,
                           reverse=True)

    out_fn = out_dir + '%s.txt' % (srr_id)
    with open(out_fn, 'w') as f:
        for align in sorted_aligns:
            count = align_collection[align]
            f.write('>%s_%s' % (count, align))

    print('%s distant out of %s alignments: %s' % (num_distant, num_aligns,
                                                   num_distant / num_aligns))
    print('Done')
    return
def find_cutsites_and_predict(inp_fn, data_nm, split):
    # Calculate statistics on df, saving to alldf_dict
    # Deletion positions

    _predict.init_model(run_iter='aax', param_iter='aag')
    dd = defaultdict(list)
    dd_shuffled = defaultdict(list)

    if data_nm == 'exons':
        df_out_dir = exon_dfs_out_dir
    elif data_nm == 'introns':
        df_out_dir = intron_dfs_out_dir

    num_flushed = 0
    timer = util.Timer(total=util.line_count(inp_fn))
    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 2 == 0:
                header = line.strip()
            if i % 2 == 1:
                sequence = line.strip()

                if len(sequence) < 60:
                    continue
                if len(sequence) > 500000:
                    continue

                bulk_predict(header, sequence, dd, dd_shuffled, df_out_dir)
                dd, dd_shuffled, num_flushed = maybe_flush(
                    dd, dd_shuffled, data_nm, split, num_flushed)

            if (i - 1) % 50 == 0 and i > 1:
                print('%s pct, %s' % (i / 500, datetime.datetime.now()))

            timer.update()

    maybe_flush(dd, dd_shuffled, data_nm, split, num_flushed, force=True)
    return
Example #11
def count_grna(exp, lib, split):

    reads_fn = inp_dir + exp + '/%s.fa' % (split)

    # Handle potential duplicates in designed gRNAs by placing counts only in the first occurrence
    grna_set = set(lib['gRNA sequence'])
    grna_list = list(lib['gRNA sequence'])
    idxs = dict()
    for grna in grna_set:
        idxs[grna] = grna_list.index(grna)

    # Init list to be joined to lib dataframe in same order as gRNA sequence
    counts = [0] * len(grna_list)

    tot = 0
    num_reads_matched = 0
    timer = util.Timer(total=util.line_count(reads_fn))
    with open(reads_fn) as f:
        for i, line in enumerate(f):
            if i % 2 == 0:
                header = line.strip()
            else:
                read = line.strip()
                matched_grna = find_grna(exp, grna_set, read)
                if matched_grna is not False:
                    counts[idxs[matched_grna]] += 1
                    num_reads_matched += 1
                tot += 1
            timer.update()

    try:
        pct_reads_matched = float(num_reads_matched) / tot
    except ZeroDivisionError:
        pct_reads_matched = np.nan
    print(num_reads_matched, '/', tot)
    print(pct_reads_matched * 100, '%', 'reads matched')

    return counts
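count_grna handles duplicate designed gRNAs by mapping every sequence to the index of its first occurrence, so all reads matching a duplicated gRNA accumulate in a single slot. A toy illustration with made-up sequences:

grna_list = ['ACGT', 'TTTT', 'ACGT']          # 'ACGT' is designed twice
idxs = {g: grna_list.index(g) for g in set(grna_list)}
counts = [0] * len(grna_list)
for matched in ['ACGT', 'ACGT', 'TTTT']:      # pretend these reads matched
    counts[idxs[matched]] += 1
print(counts)  # [2, 1, 0]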
Example #12
def divide():
    inp_fn = inp_dir + 'SHE2655.fq'

    inp_fn_numlines = util.line_count(inp_fn)

    num_splits = 60
    split_size = int(inp_fn_numlines / num_splits)
    if num_splits * split_size < inp_fn_numlines:
        split_size += 1
    while split_size % 4 != 0:
        split_size += 1
    print('Using split size %s' % (split_size))

    split_num = 0
    for idx in range(1, inp_fn_numlines, split_size):
        start = idx
        end = start + split_size
        out_fn = out_dir + '%s.fq' % (split_num)
        command = 'tail -n +%s %s | head -n %s > %s' % (start, inp_fn,
                                                        end - start, out_fn)
        split_num += 1
        print(command)

    return
def matchmaker(nm, split):
  print(nm, split)
  stdout_fn = _config.SRC_DIR + 'nh_c_%s_%s.out' % (nm, split)
  util.exists_empty_fn(stdout_fn)
  out_dir = out_place + nm + '/' + split + '/'
  util.ensure_dir_exists(out_dir)

  inp_fn = inp_dir + '%s_r2_%s.fq' % (nm, split)

  lsh_dict = build_targets_better_lsh()
  alignment_buffer = init_alignment_buffer()

  prepare_outfns(out_dir)

  qf = 0

  tot_reads = util.line_count(inp_fn)
  timer = util.Timer(total = tot_reads)
  with open(inp_fn) as f:
    for i, line in enumerate(f):
      if i % 4 == 0:
        pass
      if i % 4 == 1:
        l2 = line.strip()
      if i % 4 == 3:
        # Quality filter
        q2 = line.strip()
        qs = [ord(s)-33 for s in q2]
        if np.mean(qs) < 28:
          qf += 1
          continue

        l2 = compbio.reverse_complement(l2)
        align_header = '>1'

        # Try to find designed target from LSH
        cand_idxs = find_best_designed_target(l2, lsh_dict)
        if len(cand_idxs) == 0:
          continue

        # Run alignment
        best_idx, align = alignment(l2, cand_idxs)

        # Store alignment into buffer
        store_alignment(alignment_buffer, best_idx, align_header, align)

      if i % int(tot_reads / 100) == 1 and i > 1:
        # Flush alignment buffer
        flush_alignments(alignment_buffer, out_dir)
        alignment_buffer = init_alignment_buffer()

        # Stats for the curious
        with open(stdout_fn, 'a') as outf:
          outf.write('Time: %s\n' % (datetime.datetime.now()))
          outf.write('Progress: %s\n' % (i / int(tot_reads / 100)) )
          outf.write('Quality filtered pct: %s\n' % (qf / (i/4)))
      timer.update()
  
  # Final flush
  flush_alignments(alignment_buffer, out_dir)

  return
def matchmaker(nm, split):
  print(split)
  stdout_fn = _config.SRC_DIR + f'nh_c_{nm}_{split}.out'
  util.exists_empty_fn(stdout_fn)
  out_dir = f'{out_place}{nm}/{split}/'
  util.ensure_dir_exists(out_dir)

  # Parse condition-specific settings
  exp_row = exp_design[exp_design['Name'] == nm].iloc[0]
  parent_fn = exp_row['Parent file']
  lib_nm = exp_row['Library']
  target_nm = exp_row['Target']

  # Library design
  global lib_design
  lib_design = pd.read_csv(_config.DATA_DIR + f'lib_{lib_nm}_design.csv')

  global prefixes
  global peptide_nms
  global prefix_to_peptide
  global suffixes
  global suffix_to_peptide
  prefixes = [s[:prefix_len] for s in lib_design['Sequence']]
  peptide_nms = list(lib_design['Name'])
  prefix_to_peptide = {prefix: nm for prefix, nm in zip(prefixes, peptide_nms)}
  suffixes = [compbio.reverse_complement(s[-suffix_len:]) for s in lib_design['Sequence']]
  suffix_to_peptide = {suffix: nm for suffix, nm in zip(suffixes, peptide_nms)}

  # Target 
  target_row = target_design[target_design['Target'] == target_nm].iloc[0]
  target = target_row['Sequence']
  target_strand = target_row['gRNA orientation']

  zf_split = str(split).zfill(3)
  read1_fn = inp_dir + f'{parent_fn}_R1_{zf_split}.fq'
  read2_fn = inp_dir + f'{parent_fn}_R2_{zf_split}.fq'

  count_stats = defaultdict(lambda: 0)
  count_stats['Success'] = 0

  alignment_buffer = init_alignment_buffer()
  prepare_outfns(out_dir, peptide_nms)

  tot_lines = util.line_count(read1_fn)
  timer = util.Timer(total = tot_lines)
  with open(read1_fn) as f1, open(read2_fn) as f2:
    for i, (line1, line2) in enumerate(zip(f1, f2)):
      if i % 4 == 0:
        h1 = line1.strip()
        h2 = line2.strip()
      if i % 4 == 1:
        read1 = line1.strip()
        read2 = line2.strip()
      if i % 4 == 3:
        q1, q2 = line1.strip(), line2.strip()
        count_stats['Read count'] += 1

        qs = [ord(s)-33 for s in q1 + q2]
        if np.mean(qs) < 25:
          count_stats['1a. Quality fail'] += 1
          continue

        res, msg = find_peptide1_nm(read2)
        if res is None:
          count_stats[f'2{msg}'] += 1
          continue
        p1_nm = res

        res, msg = find_peptide2_nm(read1)
        if res is None:
          count_stats[f'2{msg}'] += 1
          continue
        p2_nm = res

        peptide_nm = f'{p1_nm}-{p2_nm}'

        read1 = read1[6:]
        q1 = q1[6:]
        if target_strand == '-':
          read1 = compbio.reverse_complement(read1)
          q1 = q1[::-1]

        # Run alignment and store in buffer
        align_header = f'>1'
        align = alignment(read1, target)
        store_alignment(alignment_buffer, peptide_nm, align_header, align, q1)
        count_stats['Success'] += 1

      # flush_interval = 2000
      flush_interval = 200
      if i % int(tot_lines / flush_interval) == 1 and i > 1:
        # Flush alignment buffer
        flush_alignments(alignment_buffer, out_dir)
        alignment_buffer = init_alignment_buffer()

        # Stats for the curious
        with open(stdout_fn, 'a') as outf:
          outf.write(f'Time: {datetime.datetime.now()}\n')
          outf.write(f'Progress: {i / int(tot_lines / 100)}\n')
          outf.write(f'Line: {i}\n')
          for key in sorted(list(count_stats.keys())):
            outf.write(f'{key}, {count_stats[key]}\n')
        # break

      timer.update()
  
  # Final flush
  flush_alignments(alignment_buffer, out_dir)

  stats_df = pd.DataFrame(count_stats, index = [0])
  sorted_cols = sorted([s for s in stats_df.columns])
  stats_df = stats_df[sorted_cols]
  stats_df.to_csv(out_dir + f'stats_{nm}_{split}.csv')

  return
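This matchmaker walks two FASTQ files in lockstep with zip(f1, f2), treating every 4 lines as one paired record (header, sequence, separator, quality). A minimal sketch of that record walk, assuming local R1/R2 file names:

def iter_paired_fastq(r1_fn, r2_fn):
    # Yield (R1 record, R2 record) tuples; each record is the 4 FASTQ lines.
    with open(r1_fn) as f1, open(r2_fn) as f2:
        rec1, rec2 = [], []
        for line1, line2 in zip(f1, f2):
            rec1.append(line1.rstrip('\n'))
            rec2.append(line2.rstrip('\n'))
            if len(rec1) == 4:
                yield rec1, rec2
                rec1, rec2 = [], []

# Usage sketch (hypothetical files):
# for (h1, s1, _, q1), (h2, s2, _, q2) in iter_paired_fastq('R1.fq', 'R2.fq'):
#     ...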
Example #15
def matchmaker(nm, split):
    print(nm, split)
    stdout_fn = _config.SRC_DIR + 'nh_c_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)

    read1_fn = inp_dir + '%s_R1_%s.fq' % (nm, split)
    read2_fn = inp_dir + '%s_R2_%s.fq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()

    prepare_outfns(out_dir)

    num_bad_matches = 0
    quality_pass = 0

    tot_lines = util.line_count(read1_fn)
    timer = util.Timer(total=tot_lines)
    with open(read1_fn) as f1, open(read2_fn) as f2:
        for i, (line1, line2) in enumerate(zip(f1, f2)):
            if i % 4 == 0:
                h1 = line1.strip()
                h2 = line2.strip()
            if i % 4 == 1:
                # RC of l1 contains target
                line1 = line1.strip()
                target_read = compbio.reverse_complement(line1[:61])
                ulmi, ulmi_idx = find_ulmi(line1)

                # l2 contains gRNA
                grna_read = line2.strip()

            if i % 4 == 3:

                q1, q2 = line1.strip(), line2.strip()
                read_q = q1[:61][::-1]
                ulmi_q = q1[ulmi_idx:ulmi_idx + len(ulmi)][::-1]
                grna_q = q2[18:22 + 20]

                qs = [ord(s) - 33 for s in read_q + ulmi_q + grna_q]
                if np.mean(qs) >= 28:
                    quality_pass += 1

                    align_header = '>1_%s_%s' % (ulmi, ulmi_q)

                    # Try to find designed target from LSH
                    cand_idxs = find_best_designed_target(
                        target_read, lsh_dict)
                    if len(cand_idxs) > 0:

                        bad_match = compare_target_to_grna(
                            cand_idxs, grna_read)
                        if bad_match == 'ok':
                            # Run alignment and store in buffer
                            best_idx, align = alignment(target_read, cand_idxs)
                            if align is None:
                                continue
                            store_alignment(alignment_buffer, best_idx,
                                            align_header, align, read_q)
                        else:
                            num_bad_matches += 1
                    else:
                        num_bad_matches += 1

            if i % int(tot_lines / 200) == 1 and i > 1:
                # Flush alignment buffer
                flush_alignments(alignment_buffer, out_dir)
                alignment_buffer = init_alignment_buffer()

                # Stats for the curious
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i / int(tot_lines / 100)))
                    outf.write('Num. mismatched gRNA/target pairs: %s\n' %
                               (num_bad_matches))
                    outf.write('Frac. mismatched gRNA/target pairs: %s\n' %
                               (num_bad_matches / quality_pass))

            timer.update()

    # Final flush
    flush_alignments(alignment_buffer, out_dir)

    return
Example #16
def matchmaker(nm, split):

    ##CUSTOM CODE FOR DICTIONARY CREATION
    from Bio import pairwise2
    from Bio.pairwise2 import format_alignment

    from Bio.Seq import Seq
    # Bio.Alphabet (generic_dna) was removed in Biopython 1.78 and is not needed here

    def rc(inp):
        d = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
        return "".join([d[e] for e in inp.strip()[::-1]])

    #UNSPLICED DATA PROCESSING
    READ1_TEMPLATE = "NNNtaccagctgccctcgTCGaCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGNNNNNNNNNNNNNNNNNNNNNNNNtgattacacatatagacacgcGAGCAGCCATCTTTTATAGAATGGGtagaacccgtcctaaggactcagattgagcatcgtttgcttctcgagtactacctgg"
    READ2_TEMPLATE = "NNNaaccgctgtgttctgcACGCGTNNNNNNNNNNNNNNNNNNACCGGTgcaggtaatgggccttactatcagtctcagtccttgtacagctcgtccatgccgagagtgatcccggcggcggtcacgaactccagcaggaccatgtgatcgcgcttctcgttggggtctttgctca"

    r1_seq = Seq(READ1_TEMPLATE)
    r2_seq = Seq(READ2_TEMPLATE)

    def quality(line):
        q_1 = line.strip()
        qs = [ord(s) - 33 for s in q_1]
        return np.mean(qs)

    i = -1

    qc_rejection_count = 0
    read1_rejection_count = 0
    constant_region_rejection_count = 0
    accepted_count = 0
    nolib_rejection_count = 0

    print(nm, split)

    #fq_unspliced_1 = open("/cluster/bh0085/prj/exons/data/{0}_1_sequence.fastq".format(nm))
    #fq_unspliced_2 = open("/cluster/bh0085/prj/exons/data/{0}_2_sequence.fastq".format(nm))

    stdout_fn = _config.SRC_DIR + 'b3_status_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)

    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)

    inp_fn1 = inp_dir + '%s_1_sequence_%s.fastq' % (nm, split)
    inp_fn2 = inp_dir + '%s_2_sequence_%s.fastq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    umis_alignments_buffer = init_umis_alignments_buffer()
    short_outputs = []

    prepare_outfns(out_dir)

    qf = 0

    print(inp_fn1)
    tot_reads = util.line_count(inp_fn1)

    timer = util.Timer(total=tot_reads)

    i = -1

    print("OPENING FILES")
    with open(inp_fn1) as f1:
        with open(inp_fn2) as f2:
            while True:
                i += 1

                try:
                    r2_l = next(f2)
                    r1_l = next(f1)
                except StopIteration:
                    break

                if i % 4 == 1:
                    read1 = r1_l
                    read2 = r2_l

                if i % 4 == 3:
                    if quality(r2_l) < 28 or quality(r1_l) < 28:
                        qc_rejection_count += 1
                        continue

                    r1_library_constant = "TACCAGCTGCCCTCGTCGAC".upper()
                    r1_library_start = len(r1_library_constant)
                    r1_library_format = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGNNNNNNNNNNNNNNNNNNNNNNNN"
                    r1_library_intron_format = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAG"
                    r1_library_ag_pos = len(
                        "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")
                    r1_library_exon_format = "NNNNNNNNNNNNNNNNNNNNNNNN"

                    r2_library_constant = "ggggtgttctgctggtagtggtc".upper()
                    r2_library_start = len(r2_library_constant)
                    r2_umi_format = "NNNNNNNNNNNNNNN"

                    try:
                        a1_offset = read1.upper().index(
                            r1_library_constant.upper())
                    except ValueError:
                        a1_offset = None
                    try:
                        a2_offset = read2.upper().index(r2_library_constant)
                    except ValueError:
                        a2_offset = None

                    if a1_offset is None or a2_offset is None:
                        constant_region_rejection_count += 1
                        continue

                    read1_const = read1[a1_offset:a1_offset + r1_library_start]
                    read1_content = read1[
                        a1_offset + r1_library_start:][:len(r1_library_format)]
                    read1_extended_content = read1[a1_offset +
                                                   r1_library_start:]
                    read2_const = read2[a2_offset:a2_offset + r2_library_start]
                    read2_content = read2[a2_offset +
                                          r2_library_start:][:len(r2_umi_format
                                                                  )]
                    read2_extended_content = read2[a2_offset +
                                                   r2_library_start:]

                    r1_ag = read1_content[len(r1_library_intron_format) -
                                          2:len(r1_library_intron_format)]

                    #check to see that the splice acceptor is in the right position
                    #and that the read1 constant sequence aligned

                    #if a1_tag_score <20:
                    #    read1_rejection_count+=1
                    #    continue

                    tag = "TACCANCTGCCCTCGTCGAC"
                    umi = read2_content[:len(r2_umi_format)]
                    lib = read1_content[:len(r1_library_format)]
                    lib_extended = read1_extended_content[:len(
                        r1_library_format) + 20]

                    if umi.count("N") != 0 or lib.count("N") != 0: continue

                    #no longer check for perfect matches. Just align
                    exp = target_names.get(lib, None)

                    cand_idxs = find_best_designed_target(lib, lsh_dict)
                    if len(cand_idxs) == 0:
                        print("rejecting for no good match")
                        nolib_rejection_count += 1
                        continue

                    best_idx = cand_idxs[0]
                    #extends a target alignment region to include an extra 20 bases to anchor the alignment for long r1 deletions
                    target_alignment_region = names_targets[
                        best_idx] + "tgattacacatatagacacg".upper()
                    align = pairwise2.align.localms(target_alignment_region,
                                                    read1_extended_content, 2,
                                                    -1, -5, -.1)[0]

                    output_complete = """>1\n{0}\n{1}\n{2}\n{3}\n""".format(
                        umi, best_idx, align[2],
                        "\n".join(format_alignment(*align).splitlines()[:3]))
                    output_short = (umi, best_idx)

                    umis_alignments_buffer[umi].append(output_complete)
                    short_outputs.append(output_short)
                    accepted_count += 1

                    if i % int(tot_reads / 1000) < 4 and i > 1:
                        print(i)
                        print("FLUSHING!")
                        # Flush alignment buffer
                        flush_tuples(umis_alignments_buffer, out_dir)
                        print(len(umis_alignments_buffer.keys()))

                        # Stats for the curious
                        with open(stdout_fn, 'a') as outf:
                            outf.write('Time: %s\n' %
                                       (datetime.datetime.now()))
                            outf.write('Progress: %s\n' %
                                       (i / int(tot_reads / 100)))
                            outf.write('Quality filtered pct: %s\n' %
                                       (qf / (i / 4)))
                            outf.write(
                                "accepted {0}, rejected {1} bad read1, {2} bad lib\n"
                                .format(accepted_count, read1_rejection_count,
                                        nolib_rejection_count))

                    timer.update()
def matchmaker(nm, split):

    read_constant_rejection_count = 0
    qc_rejection_count = 0
    accepted_count = 0
    grna_failure_count = 0
    read1_rejection_count = 0
    ##CUSTOM CODE FOR DICTIONARY CREATION
    from Bio import pairwise2
    from Bio.pairwise2 import format_alignment

    from Bio.Seq import Seq
    # Bio.Alphabet (generic_dna) was removed in Biopython 1.78 and is not needed here

    def rc(inp):
        d = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
        return "".join([d[e] for e in inp.strip()[::-1]])

    #UNSPLICED DATA PROCESSING
    READ1_TEMPLATE = "NNNtaccagctgccctcgTCGaCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGNNNNNNNNNNNNNNNNNNNNNNNNtgattacacatatagacacgcGAGCAGCCATCTTTTATAGAATGGGtagaacccgtcctaaggactcagattgagcatcgtttgcttctcgagtactacctgg"
    READ2_TEMPLATE = "NNNaaccgctgtgttctgcACGCGTNNNNNNNNNNNNNNNNNNACCGGTgcaggtaatgggccttactatcagtctcagtccttgtacagctcgtccatgccgagagtgatcccggcggcggtcacgaactccagcaggaccatgtgatcgcgcttctcgttggggtctttgctca"

    r1_seq = Seq(READ1_TEMPLATE)
    r2_seq = Seq(READ2_TEMPLATE)

    def quality(line):
        q_1 = line.strip()
        qs = [ord(s) - 33 for s in q_1]
        return np.mean(qs)

    i = -1

    print(nm, split)

    umis_alignments_buffer = init_umis_alignments_buffer()
    stdout_fn = _config.SRC_DIR + 'b7_status_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)

    inp_fn1 = inp_dir + '%s_1_sequence_%s.fastq' % (nm, split)
    inp_fn2 = inp_dir + '%s_2_sequence_%s.fastq' % (nm, split)

    short_outputs = []

    prepare_outfns(out_dir)
    qf = 0
    tot_reads = util.line_count(inp_fn1)
    timer = util.Timer(total=tot_reads)

    #raise Exception()
    i = -1
    with open(inp_fn1) as f1:
        with open(inp_fn2) as f2:
            while True:
                i += 1
                try:
                    r2_l = next(f2)
                    r1_l = next(f1)
                except StopIteration:
                    break
                if i % 4 == 1:
                    read1 = r1_l
                    read2 = r2_l
                if i % 4 == 3:
                    if quality(r2_l) < 28 or quality(r1_l) < 28:
                        qc_rejection_count += 1
                        continue

                    print(read1)
                    print(read2)
                    print(len(read2))
                    r1_grna19_format = "N" * 19
                    r1_grna20_format = "N" * 20
                    r2_umi_format = "N" * 15

                    r1_prefix_constant = "GACGAAACACCG".upper()
                    r1_grna_start = len(r1_prefix_constant)

                    r2_prefix_constant = "tcaaacaggacggcagcgtgcagctcgcc".upper(
                    )
                    r2_umi_start = len(r2_prefix_constant)
                    r2_umi_format = "N" * 15
                    r2_post_umi_format = "gaccactaccagcagaacacccc".upper()

                    print("working")
                    try:
                        print(r1_prefix_constant)
                        a1_offset = read1.upper().index(
                            r1_prefix_constant.upper())
                    except Exception:
                        read1_rejection_count += 1
                        a1_offset = None
                        print("A1 EXCEPTION")
                        continue
                    try:
                        a2_offset = read2.upper().index(
                            r2_prefix_constant.upper())
                    except Exception:
                        a2_offset = None
                        read_constant_rejection_count += 1
                        print("A2 REJECTION")
                        continue

                    read1_grna19 = read1[a1_offset +
                                         r1_grna_start:][:len(r1_grna19_format
                                                              )]
                    read1_grna20 = read1[a1_offset +
                                         r1_grna_start:][:len(r1_grna20_format
                                                              )]
                    read2_umi_content = read2[a2_offset +
                                              r2_umi_start:][:len(r2_umi_format
                                                                  )]

                    print(a2_offset)
                    print(r2_umi_start)
                    print(len(r2_umi_format))
                    print(len(read2_umi_content))

                    #raise Exception()

                    design_row = exp_design.loc[exp_design[
                        "Designed gRNA (NGG orientation, 19 and 20)"] ==
                                                read1_grna20]
                    if len(design_row) == 0:
                        design_row = exp_design.loc[exp_design[
                            "Designed gRNA (NGG orientation, 19 and 20)"] ==
                                                    read1_grna19]
                    if len(design_row) == 0:
                        grna_failure_count += 1
                        continue

                    design_row = design_row.iloc[0]

                    output_complete = """>1\n{0}\n{1}""".format(
                        read2_umi_content, design_row["Identifier number"])
                    output_short = (read2_umi_content,
                                    design_row["Identifier number"])

                    print(output_short)

                    umis_alignments_buffer[read2_umi_content].append(
                        output_complete)
                    short_outputs.append(output_short)
                    accepted_count += 1

                    if i % int(tot_reads / 10) < 4 and i > 1:

                        print("FLUSHING!")
                        print(accepted_count)
                        # Flush alignment buffer
                        flush_tuples(umis_alignments_buffer, out_dir)
                        print(len(umis_alignments_buffer.keys()))

                        # Stats for the curious
                        with open(stdout_fn, 'a') as outf:
                            outf.write('Time: %s\n' %
                                       (datetime.datetime.now()))
                            outf.write('Progress: %s\n' %
                                       (i / int(tot_reads / 100)))
                            outf.write('Quality filtered pct: %s\n' %
                                       (qf / (i / 4)))
                            outf.write(
                                "accepted {0}, rejected {1} bad read1\n{2} rc rejection\n"
                                .format(accepted_count, read1_rejection_count,
                                        read_constant_rejection_count))

                    timer.update()