Esempio n. 1
0
def _oriented_slice(seq, start, end):
    """Return the 1-based, inclusive interval between start and end of seq.

    When end < start the interval is taken as [end, start] and the result is
    reverse-complemented with fastqparser.revcomp_seq.
    """
    if end >= start:
        # -1 converts the 1-based start to a 0-based index; the inclusive end
        # needs no adjustment because Python slices exclude their upper bound.
        return seq[(start - 1):end]
    return fastqparser.revcomp_seq(seq[(end - 1):start])


def extract_seqs_for_edlib(ref_seq, contig_seq, rstart, rend, qstart, qend):
    """Extract the reference and contig regions that should be aligned.

    Coordinates are 1-based and inclusive. For each sequence, an end coordinate
    smaller than the start coordinate selects the interval [end, start] and
    returns its reverse complement.

    A single-base interval (start == end) is returned as-is; it used to fall
    into the reverse-complement branch because the comparison was strict.

    Args:
        ref_seq: Full reference sequence.
        contig_seq: Full contig sequence.
        rstart: Start coordinate on the reference.
        rend: End coordinate on the reference.
        qstart: Start coordinate on the contig.
        qend: End coordinate on the contig.

    Returns:
        A two-element list [nw_ref, nw_contig] with the extracted regions.
    """
    nw_ref = _oriented_slice(ref_seq, rstart, rend)
    nw_contig = _oriented_slice(contig_seq, qstart, qend)
    return [nw_ref, nw_contig]
Esempio n. 2
0
def get_circular_score(ref_path, contig_path, temp_folder):
    """Align contigs against doubled ("circularized") reference sequences.

    Every sequence read from ref_path is written twice back-to-back into two
    FASTA files inside temp_folder: 'circ-fwd.fa' holds the sequence as read,
    'circ-rev.fa' holds its reverse complement. Edlib (EDLIB_PATH) is then run
    in HW mode with contig_path against the reverse file, and each parsed
    score is printed to stdout.

    Args:
        ref_path: Path of the reference file, read with fastqparser.read_fastq.
        contig_path: Path of the contig file handed to Edlib.
        temp_folder: Folder for the intermediate FASTA files; created if it
            does not exist.

    Returns:
        None. The scores are only written to stdout.
    """
    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder)

    # Quality values are not needed: the output is always written as FASTA.
    [headers_ref, seqs_ref, _quals_ref] = fastqparser.read_fastq(ref_path)

    circularized_fwd_path = '%s/circ-fwd.fa' % (temp_folder)
    circularized_rev_path = '%s/circ-rev.fa' % (temp_folder)

    # 'with' guarantees both files are closed even if writing or
    # reverse-complementing raises part-way through.
    with open(circularized_fwd_path, 'w') as fp_fwd, \
            open(circularized_rev_path, 'w') as fp_rev:
        for header, seq in zip(headers_ref, seqs_ref):
            rev_seq = fastqparser.revcomp_seq(seq)
            fp_fwd.write('>%s\n%s%s\n' % (header, seq, seq))
            fp_rev.write('>%s\n%s%s\n' % (header, rev_seq, rev_seq))

    # NOTE: the forward-orientation file is written above, but only the
    # reverse orientation is aligned here; the forward alignment step is
    # disabled in this function.
    sys.stdout.write('Aligning the rev orientation...\n')
    command = '%s %s %s -m HW' % (EDLIB_PATH, contig_path,
                                  circularized_rev_path)
    [_rc_rev, rstdout_rev,
     _rstderr_rev] = execute_command_with_ret(DRY_RUN, command)
    scores_rev = parse_edlib_scores(rstdout_rev)
    for i, score in enumerate(scores_rev):
        sys.stdout.write('[%d] %d %s\n' % (i, score, 'rev'))
    sys.stdout.write('\n')
Esempio n. 3
0
def extract_seqs_for_edlib(temp_folder,
                           temp_suffix,
                           ref_path,
                           contig_path,
                           rstart,
                           rend,
                           qstart,
                           qend,
                           is_fwd,
                           rname,
                           qname,
                           generate_kmer_spectrum=False):
    """Cut out a reference region and a contig region and align them with Edlib.

    The selected regions are written to 'nw-ref<suffix>.fasta' and
    'nw-contig<suffix>.fasta' in temp_folder, Edlib (EDLIB_PATH) is run on them
    in NW mode, and a summary of the edit distance is written to stderr.
    Optionally KMERCOMP_PATH is run on the same two files.

    Coordinates are 1-based and inclusive.

    Args:
        temp_folder: Folder for intermediate files; created if missing.
        temp_suffix: String inserted into the intermediate file names.
        ref_path: Reference file, read with fastqparser.read_fastq.
        contig_path: Contig file, read with fastqparser.read_fastq.
        rstart: Start coordinate on the reference.
        rend: End coordinate on the reference; must not be smaller than rstart.
        qstart: Start coordinate on the contig.
        qend: End coordinate on the contig.
        is_fwd: If true the contig region is used as-is; otherwise it is
            reverse-complemented.
        rname: Reference header to look up via hash_headers.
        qname: Contig header to look up via hash_headers.
        generate_kmer_spectrum: If true, also run KMERCOMP_PATH and write its
            output to 'nw-kmers<suffix>.spect'.

    Returns:
        None.

    Raises:
        SystemExit: With code 1 if rend < rstart, or if rname / qname cannot
            be found in their files.
    """
    # Validate the cheap argument invariant before any filesystem work.
    if rend < rstart:
        sys.stderr.write(
            'ERROR: Reference start should come before reference end (it is expected that the ref is forward oriented), but ref_start = %d, ref_end = %d. Exiting.\n'
            % (rstart, rend))
        sys.exit(1)

    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder)

    [headers_ref, seqs_ref, _quals_ref] = fastqparser.read_fastq(ref_path)
    [headers_contig, seqs_contig,
     _quals_contig] = fastqparser.read_fastq(contig_path)

    ref_hash = hash_headers(headers_ref)
    contig_hash = hash_headers(headers_contig)

    # Diagnostic dump of the reference header lookup table (goes to stdout).
    print(ref_hash)

    if rname not in ref_hash:
        sys.stderr.write(
            'ERROR: Reference name "%s" not found in file "%s"! Exiting.\n' %
            (rname, ref_path))
        sys.exit(1)
    if qname not in contig_hash:
        sys.stderr.write(
            'ERROR: Contig name "%s" not found in file "%s"! Exiting.\n' %
            (qname, contig_path))
        sys.exit(1)

    rid = ref_hash[rname]
    ref_header = headers_ref[rid]
    full_ref = seqs_ref[rid]
    # -1 converts the 1-based start; the inclusive end equals the exclusive
    # slice bound.
    ref_seq = full_ref[(rstart - 1):rend]

    qid = contig_hash[qname]
    contig_header = headers_contig[qid]
    full_contig = seqs_contig[qid]
    if is_fwd:
        if qend >= qstart:
            contig_seq = full_contig[(qstart - 1):qend]
        else:
            # The interval runs past the end of the contig and continues from
            # its beginning (presumably a circular contig -- confirm).
            contig_seq = full_contig[(qstart - 1):] + full_contig[0:qend]
    else:
        if qend > qstart:
            # Same wrap-around case, with start/end swapped for the reverse
            # strand.
            contig_seq = full_contig[(qend - 1):] + full_contig[0:qstart]
        else:
            contig_seq = full_contig[(qend - 1):qstart]

        contig_seq = fastqparser.revcomp_seq(contig_seq)

    nw_ref_path = '%s/nw-ref%s.fasta' % (temp_folder, temp_suffix)
    nw_contig_path = '%s/nw-contig%s.fasta' % (temp_folder, temp_suffix)
    nw_kmer_comp_path = '%s/nw-kmers%s.spect' % (temp_folder, temp_suffix)

    # 'with' closes the files (and flushes them) before Edlib reads them,
    # even if a write fails.
    with open(nw_ref_path, 'w') as fp_nw_ref:
        fp_nw_ref.write('>%s\n%s\n' % (ref_header, ref_seq))
    with open(nw_contig_path, 'w') as fp_nw_contig:
        fp_nw_contig.write('>%s\n%s\n' % (contig_header, contig_seq))

    sys.stderr.write('Running Edlib to determine the edit distance...\n')
    command = '%s %s %s -m NW' % (EDLIB_PATH, nw_contig_path, nw_ref_path)

    [rc, rstdout, rstderr] = execute_command_with_ret(DRY_RUN, command)
    scores = parse_edlib_scores(rstdout)
    # Reference bases outside the selected region.
    unaligned_len = len(full_ref) - len(ref_seq)
    if len(scores) == 0:
        sys.stderr.write(
            'ERROR: len(scores) == 0!\nreturn code: %d\nrstdout:\n%s\n' %
            (rc, rstdout))
    else:
        # Only report distances when Edlib actually produced a score;
        # indexing an empty list here used to raise IndexError right after
        # the error message above.
        sys.stderr.write(
            'Final edit distance: %d, aligned edit distance: %d, unaligned ref len: %d, aligned ref len: %d, aligned contig len: %d\n'
            % ((scores[0] + unaligned_len), scores[0], unaligned_len,
               len(ref_seq), len(contig_seq)))

    sys.stdout.write('\n')

    if generate_kmer_spectrum:
        sys.stderr.write('Generating the kmer spectrum.\n')
        command = '%s -o %s %s %s' % (KMERCOMP_PATH, nw_kmer_comp_path,
                                      nw_contig_path, nw_ref_path)
        [rc, rstdout, rstderr] = execute_command_with_ret(DRY_RUN, command)
        sys.stderr.write('Stdout:\n%s\nStderr:\n%s\n' % (rstdout, rstderr))
        sys.stderr.write('Done generating the kmer spectrum!\n')
Esempio n. 4
0
			# In this case, something weird happened. Most likely the header got messed up.
			# Another option is that someone changed the reads file. In any case, if the original read
			# cannot be found, we will call this alignment unmapped.
			seq = '*';
			flag = 4;

		qual = '*';
		if (len(read_quals) > 0):
			try:
				qual = read_quals[read_header_hash[qseqid]];
			except:
				qual = '*';

		sam_start = (int(sstart) - 1) if (sstrand == 'plus') else (int(send) - 1);
		sam_end = (int(send) - 1) if (sstrand == 'plus') else (int(sstart) - 1);
		sam_seq = (seq) if (sstrand == 'plus' or seq == '*') else (fastqparser.revcomp_seq(seq));			# Reverse the seq field if necessary.
		sam_qual = (qual) if (sstrand == 'plus' or qual == '*') else (qual[::-1]);							# Reverse the quality values if necessary.
		num_clip_front = int(qstart) - 1;
		num_clip_back = int(qlen) - (int(qend));
		sam_cigar = convert_btop_to_cigar(btop, num_clip_front, num_clip_back, sstrand);

		sam_line = '';
		sam_line += '%s\t' % (qseqid);						# 1. qname
		sam_line += '%d\t' % (flag);						# 2. flag
		sam_line += '%s\t' % (sseqid);						# 3. rname
		sam_line += '%d\t' % (sam_start + 1);				# 4. pos
		sam_line += '255\t';								# 5. mapq
		sam_line += '%s\t' % (sam_cigar);					# 6. CIGAR
		sam_line += '*\t';									# 7. rnext
		sam_line += '0\t';									# 8. pnext
		sam_line += '0\t';									# 9. tlen