Exemple #1
0
def eval_contigs(ref_path,
                 contig_path,
                 temp_folder,
                 generate_kmer_spectrum=False):
    """Align every contig to the reference with MUMmer and extract the hit.

    Each sequence read from ``contig_path`` is written on its own to
    ``<temp_folder>/singlecontig.fasta``; the nucmer / delta-filter /
    show-coords command line is executed against ``ref_path``; the resulting
    ``<temp_folder>/nucmer.filt.coords`` file is parsed with
    ``parse_coords_lines`` and the parsed coordinates are handed to
    ``extract_seqs_for_edlib``.

    Args:
        ref_path: Path of the reference sequence file.
        contig_path: Path of the file holding the contigs.
        temp_folder: Directory for intermediate files; created when missing.
        generate_kmer_spectrum: Forwarded unchanged to
            ``extract_seqs_for_edlib``.
    """
    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder)

    [headers_contigs, seqs_contigs,
     quals_contigs] = fastqparser.read_fastq(contig_path)
    [headers_ref, seqs_ref, quals_ref] = fastqparser.read_fastq(ref_path)

    ref_hash = hash_headers(headers_ref)
    contig_hash = hash_headers(headers_contigs)

    # None of these depend on the contig being processed, so build them once.
    single_contig_path = '%s/singlecontig.fasta' % (temp_folder)
    nucmer_out_prefix = '%s/nucmer' % (temp_folder)
    coords_path = '%s.filt.coords' % (nucmer_out_prefix)
    # NOTE(review): the paths are interpolated into the command line unquoted;
    # paths containing spaces or shell metacharacters will break the command.
    command = '%s --maxmatch --extend -p %s %s %s; delta-filter -r -q %s.delta > %s.filt.delta; show-coords -r -c %s.filt.delta > %s.filt.coords' % \
        (NUCMER_PATH, nucmer_out_prefix, ref_path, single_contig_path,
         nucmer_out_prefix, nucmer_out_prefix, nucmer_out_prefix,
         nucmer_out_prefix)

    for i, contig_seq in enumerate(seqs_contigs):
        contig_name = headers_contigs[i].split()[0]

        # The scratch FASTA file is overwritten for every contig.
        with open(single_contig_path, 'w') as fp_contig:
            fp_contig.write('>%s\n%s\n' % (contig_name, contig_seq))

        sys.stderr.write('\n')
        sys.stderr.write('Running MUMmer on contig: "%s"\n' % (contig_name))
        # The return code and captured output are not inspected here.
        [rc, rstdout, rstderr] = execute_command_with_ret(DRY_RUN, command)

        sys.stderr.write('\n')
        sys.stderr.write('Parsing the coords file.\n')
        with open(coords_path, 'r') as fp:
            lines = fp.readlines()
        coords = parse_coords_lines(lines, contig_name, seqs_ref, ref_hash,
                                    seqs_contigs, contig_hash)

        # Diagnostic dump on stdout. Wrapping ``coords`` in a 1-tuple keeps
        # the formatting valid even if a tuple is returned.
        sys.stdout.write('\n')
        sys.stdout.write('coords: "%s"\n' % (coords,))
        sys.stdout.write('lines:\n')
        for line in lines:
            # Lines keep their own terminator, so this emits a blank line
            # after each one, exactly like the former ``print line``.
            sys.stdout.write(line + '\n')
        sys.stdout.flush()

        [rstart, rend, qstart, qend, is_fwd, rname, qname] = coords
        extract_seqs_for_edlib(temp_folder,
                               '.%d' % (i),
                               ref_path,
                               contig_path,
                               rstart,
                               rend,
                               qstart,
                               qend,
                               is_fwd,
                               rname,
                               qname,
                               generate_kmer_spectrum=generate_kmer_spectrum)
        sys.stderr.write('\n')
def extractFromFAST(fast_fname, qnames_fname):
    """Write to stdout the records of a FASTA/FASTQ file named in a qnames file.

    A record is selected when either its full header or the first
    whitespace-delimited token of its header appears in the qnames file.
    The output format follows the extension of ``fast_fname``; with any other
    extension nothing is written.

    Args:
        fast_fname: Path of the FASTA/FASTQ file to extract from.
        qnames_fname: Path of a text file with one query name per line.
    """
    import os  # local import: only needed to derive the file extension

    sys.stderr.write('\nLoading qnames file!')
    # A set gives constant-time lookups. Terminators are stripped explicitly
    # so the last name survives even without a trailing newline.
    with open(qnames_fname, 'r') as qnames_f:
        wanted = set(line.rstrip('\r\n') for line in qnames_f)

    sys.stderr.write('\nLoading FASTA/FASTQ file!')
    [headers, seqs, quals] = read_fastq(fast_fname)

    # The output format is decided once from the input file's extension.
    fext = os.path.splitext(fast_fname)[1].lower()
    is_fasta = fext in ('.fa', '.fna', '.faa', '.fasta')
    is_fastq = fext in ('.fq', '.fastq')

    sys.stderr.write('\nExtracting ...')
    for i, header in enumerate(headers):
        fields = header.split()
        qname = fields[0] if fields else header
        if header not in wanted and qname not in wanted:
            continue

        seq = seqs[i]
        if is_fasta:
            sys.stdout.write('>' + header + '\n')
            sys.stdout.write(seq + '\n')
        elif is_fastq:
            sys.stdout.write('@' + header + '\n')
            sys.stdout.write(seq + '\n')
            sys.stdout.write('+' + header + '\n')
            sys.stdout.write(quals[i] + '\n')

    sys.stderr.write('\nFinished!')
def extractFromFAST(fast_fname, qnames_fname):
    """Print the FASTA/FASTQ records whose name is listed in ``qnames_fname``.

    Matching is done against both the complete header and its first
    whitespace-delimited token. Records are written to stdout in the format
    implied by the extension of ``fast_fname``; an unrecognised extension
    produces no record output.

    Args:
        fast_fname: Path of the FASTA/FASTQ file to extract from.
        qnames_fname: Path of a text file with one query name per line.
    """
    import os  # local import: only needed to derive the file extension

    sys.stderr.write('\nLoading qnames file!')
    # Strip line terminators explicitly (instead of dropping the last
    # character) so an unterminated final line is not truncated.
    with open(qnames_fname, 'r') as qnames_f:
        wanted = set(line.rstrip('\r\n') for line in qnames_f)

    sys.stderr.write('\nLoading FASTA/FASTQ file!')
    [headers, seqs, quals] = read_fastq(fast_fname)

    # Derive the extension locally; no ``fext`` name is defined in this scope.
    fext = os.path.splitext(fast_fname)[1].lower()
    is_fasta = fext in ('.fa', '.fna', '.faa', '.fasta')
    is_fastq = fext in ('.fq', '.fastq')

    sys.stderr.write('\nExtracting ...')
    for i, header in enumerate(headers):
        fields = header.split()
        qname = fields[0] if fields else header
        if header not in wanted and qname not in wanted:
            continue

        seq = seqs[i]
        if is_fasta:
            sys.stdout.write('>' + header + '\n')
            sys.stdout.write(seq + '\n')
        elif is_fastq:
            sys.stdout.write('@' + header + '\n')
            sys.stdout.write(seq + '\n')
            sys.stdout.write('+' + header + '\n')
            sys.stdout.write(quals[i] + '\n')

    sys.stderr.write('\nFinished!')
Exemple #4
0
def prepare_human_genome(genome_file):
    """Extract the chromosome 19 primary-assembly record as ``chr19``.

    The first record whose header contains both ``chromosome 19`` and
    ``Primary Assembly`` is written, with the header replaced by ``chr19``,
    to a file named like ``genome_file`` with ``_P`` inserted before the
    extension. The output format follows the extension of ``genome_file``.

    Args:
        genome_file: Path of the FASTA/FASTQ genome file.

    Raises:
        Exception: If a matching record is found but the extension is neither
            a FASTA nor a FASTQ extension. An error marker is written to the
            output file first.
    """
    filename, file_extension = os.path.splitext(genome_file)
    processed_genome_file = filename + "_P" + file_extension
    [headers, seqs, quals] = read_fastq(genome_file)

    extension = file_extension.lower()
    new_header = "chr19"

    with open(processed_genome_file, "w") as pgfile:
        for i, header in enumerate(headers):
            if ("chromosome 19" not in header
                    or "Primary Assembly" not in header):
                continue

            seq = seqs[i]
            if extension in (".fa", ".fna", ".faa", ".fasta"):
                pgfile.write(">" + new_header + "\n")
                pgfile.write(seq + "\n")
            elif extension in (".fq", ".fastq"):
                pgfile.write("@" + new_header + "\n")
                pgfile.write(seq + "\n")
                pgfile.write("+" + new_header + "\n")
                # Qualities are only looked up when they are written.
                pgfile.write(quals[i] + "\n")
            else:
                pgfile.write(r"@ERROR occured. File is NOT COMPLETE!")
                raise Exception("Invalid file extension: %s" %
                                file_extension)

            # Only the first matching record is wanted.
            break
Exemple #5
0
def prepare_human_genome(genome_file):
    """Write the chromosome 19 primary-assembly record under the name ``chr19``.

    Output goes to ``<name>_P<ext>`` next to ``genome_file``. Only the first
    record whose header contains both ``chromosome 19`` and
    ``Primary Assembly`` is copied; the extension of ``genome_file`` selects
    FASTA or FASTQ output.

    Args:
        genome_file: Path of the FASTA/FASTQ genome file.

    Raises:
        Exception: If a matching record is found but the extension is not a
            recognised FASTA/FASTQ extension. An error marker is written to
            the output file first.
    """
    filename, file_extension = os.path.splitext(genome_file)
    processed_genome_file = filename + '_P' + file_extension
    [headers, seqs, quals] = read_fastq(genome_file)

    extension = file_extension.lower()
    new_header = 'chr19'

    with open(processed_genome_file, 'w') as pgfile:
        for i, header in enumerate(headers):
            if 'chromosome 19' not in header or 'Primary Assembly' not in header:
                continue

            seq = seqs[i]
            if extension in ('.fa', '.fna', '.faa', '.fasta'):
                pgfile.write('>' + new_header + '\n')
                pgfile.write(seq + '\n')
            elif extension in ('.fq', '.fastq'):
                pgfile.write('@' + new_header + '\n')
                pgfile.write(seq + '\n')
                pgfile.write('+' + new_header + '\n')
                # Qualities are only looked up when they are written.
                pgfile.write(quals[i] + '\n')
            else:
                pgfile.write(r'@ERROR occured. File is NOT COMPLETE!')
                raise Exception('Invalid file extension: %s' % file_extension)

            # Only the first matching record is wanted.
            break
def prepare_human_genome(genome_file):
    """Copy the chromosome 19 primary assembly to ``<name>_P<ext>`` as ``chr19``.

    The records of ``genome_file`` are scanned in order; the first one whose
    header contains both ``chromosome 19`` and ``Primary Assembly`` is
    written with the header ``chr19`` and the scan stops. The extension of
    ``genome_file`` selects FASTA or FASTQ output.

    Args:
        genome_file: Path of the FASTA/FASTQ genome file.

    Raises:
        Exception: If a matching record is found but the extension is not a
            recognised FASTA/FASTQ extension. An error marker is written to
            the output file first.
    """
    filename, file_extension = os.path.splitext(genome_file)
    processed_genome_file = filename + '_P' + file_extension
    [headers, seqs, quals] = read_fastq(genome_file)

    extension = file_extension.lower()
    new_header = 'chr19'

    with open(processed_genome_file, 'w') as pgfile:
        for i, header in enumerate(headers):
            if 'chromosome 19' not in header or 'Primary Assembly' not in header:
                continue

            seq = seqs[i]
            if extension in ('.fa', '.fna', '.faa', '.fasta'):
                pgfile.write('>' + new_header + '\n')
                pgfile.write(seq + '\n')
            elif extension in ('.fq', '.fastq'):
                pgfile.write('@' + new_header + '\n')
                pgfile.write(seq + '\n')
                pgfile.write('+' + new_header + '\n')
                # Qualities are only looked up when they are written.
                pgfile.write(quals[i] + '\n')
            else:
                pgfile.write(r'@ERROR occured. File is NOT COMPLETE!')
                raise Exception('Invalid file extension: %s' % file_extension)

            # Only the first matching record is wanted.
            break
def extractFromFAST(fast_fname, qnames_fname):
    """Write to stdout the FASTA/FASTQ records selected by a qnames file.

    A record is selected when its full header, or the first
    whitespace-delimited token of the header, is listed in ``qnames_fname``.
    The extension of ``fast_fname`` decides whether FASTA or FASTQ records
    are written; for any other extension no record is written.

    Args:
        fast_fname: Path of the FASTA/FASTQ file to extract from.
        qnames_fname: Path of a text file with one query name per line.
    """
    import os  # local import: only needed to derive the file extension

    sys.stderr.write("\nLoading qnames file!")
    # A set gives constant-time lookups. Terminators are stripped explicitly
    # so the last name survives even without a trailing newline.
    with open(qnames_fname, "r") as qnames_f:
        wanted = set(line.rstrip("\r\n") for line in qnames_f)

    sys.stderr.write("\nLoading FASTA/FASTQ file!")
    [headers, seqs, quals] = read_fastq(fast_fname)

    # Derive the extension locally; no ``fext`` name is defined in this scope.
    fext = os.path.splitext(fast_fname)[1].lower()
    is_fasta = fext in (".fa", ".fna", ".faa", ".fasta")
    is_fastq = fext in (".fq", ".fastq")

    sys.stderr.write("\nExtracting ...")
    for i, header in enumerate(headers):
        fields = header.split()
        qname = fields[0] if fields else header
        if header not in wanted and qname not in wanted:
            continue

        seq = seqs[i]
        if is_fasta:
            sys.stdout.write(">" + header + "\n")
            sys.stdout.write(seq + "\n")
        elif is_fastq:
            sys.stdout.write("@" + header + "\n")
            sys.stdout.write(seq + "\n")
            sys.stdout.write("+" + header + "\n")
            sys.stdout.write(quals[i] + "\n")

    sys.stderr.write("\nFinished!")
Exemple #8
0
def prepare_dm_genome(genome_file):
    """Rewrite a genome file with short ``chr*`` headers.

    Records whose header contains any entry of ``bad_strings_genomes`` are
    dropped. For the others the new header is chosen as follows:

    * header contains ``chromosome``: ``chr`` plus the text between the first
      and second space that follow that word;
    * otherwise header contains ``chr``: the header is kept unchanged;
    * otherwise header contains ``mitochondrion``: ``chrM``.

    The result is written to ``<name>_P<ext>``; the extension of
    ``genome_file`` selects FASTA or FASTQ output.

    Args:
        genome_file: Path of the FASTA/FASTQ genome file.

    Raises:
        Exception: If a kept header matches none of the rules above, or if
            the extension is not a recognised FASTA/FASTQ extension (an error
            marker is written to the output file first in that case).
    """
    filename, file_extension = os.path.splitext(genome_file)
    processed_genome_file = filename + "_P" + file_extension
    [headers, seqs, quals] = read_fastq(genome_file)

    extension = file_extension.lower()

    with open(processed_genome_file, "w") as pgfile:
        for i, header in enumerate(headers):
            # Skip records that contain any disqualifying entries.
            if any(badstring in header for badstring in bad_strings_genomes):
                continue

            pos = header.find("chromosome")
            if pos > -1:
                # Start of the token that follows the first space after
                # "chromosome"; the token ends at the second space (if any).
                start = pos + header[pos:].find(" ") + 1
                pos3 = header[start:].find(" ")
                if pos3 == -1:
                    new_header = "chr" + header[start:]
                else:
                    new_header = "chr" + header[start:start + pos3]
            elif "chr" in header:
                # "chr" without "chromosome": assume the header is already
                # in its final form.
                new_header = header
            elif "mitochondrion" in header:
                new_header = "chrM"
            else:
                # Format the message before raising so the intended
                # exception is the one that propagates.
                raise Exception("Invalid DM genome header: %s!" % header)

            seq = seqs[i]
            if extension in (".fa", ".fna", ".faa", ".fasta"):
                pgfile.write(">" + new_header + "\n")
                pgfile.write(seq + "\n")
            elif extension in (".fq", ".fastq"):
                pgfile.write("@" + new_header + "\n")
                pgfile.write(seq + "\n")
                pgfile.write("+" + new_header + "\n")
                pgfile.write(quals[i] + "\n")
            else:
                pgfile.write(r"@ERROR occured. File is NOT COMPLETE!")
                raise Exception("Invalid file extension: %s" %
                                file_extension)
Exemple #9
0
def TEST_SAM_TO_CONTIG(single_contig_file, contig_sam, output_alt_contig_fasta):
    """Build an alternate contig from SAM alignments and save it as FASTA.

    The contig sequences read from ``single_contig_file`` and the alignments
    loaded from ``contig_sam`` are passed to
    ``construct_contig_from_overlapping_sams``; the contig it returns is
    written to ``output_alt_contig_fasta`` under the header
    ``Alternate contig``.

    Args:
        single_contig_file: Path of the file holding the contig sequence(s).
        contig_sam: Path of the SAM file with alignments to the contig.
        output_alt_contig_fasta: Path of the FASTA file to create.
    """
    [ctg_headers, ctg_seqs, ctg_quals] = fastqparser.read_fastq(single_contig_file)
    [headers, contig_sams] = utility_sam.LoadSAM(contig_sam)
    [new_contig, non_clipped_len, new_contig_cigar] = construct_contig_from_overlapping_sams(ctg_seqs, contig_sams)

    # The context manager closes the file even if a write fails.
    with open(output_alt_contig_fasta, 'w') as fp:
        fp.write('>Alternate contig\n')
        fp.write('%s\n' % new_contig)
Exemple #10
0
def prepare_dm_genome(genome_file):
    """Write a copy of a genome file whose headers are reduced to ``chr*``.

    Records whose header contains any entry of ``bad_strings_genomes`` are
    skipped. The new header of a kept record is:

    * ``chr`` plus the text between the first and second space after the
      word ``chromosome``, when the header contains ``chromosome``;
    * the unchanged header, when it contains ``chr`` but not ``chromosome``;
    * ``chrM``, when it contains ``mitochondrion``.

    Output goes to ``<name>_P<ext>``; the extension of ``genome_file``
    selects FASTA or FASTQ output.

    Args:
        genome_file: Path of the FASTA/FASTQ genome file.

    Raises:
        Exception: If a kept header matches none of the rules above, or if
            the extension is not a recognised FASTA/FASTQ extension (an error
            marker is written to the output file first in that case).
    """
    filename, file_extension = os.path.splitext(genome_file)
    processed_genome_file = filename + '_P' + file_extension
    [headers, seqs, quals] = read_fastq(genome_file)

    extension = file_extension.lower()

    with open(processed_genome_file, 'w') as pgfile:
        for i, header in enumerate(headers):
            # Skip records that contain any disqualifying entries.
            if any(badstring in header for badstring in bad_strings_genomes):
                continue

            pos = header.find('chromosome')
            if pos > -1:
                # Start of the token that follows the first space after
                # 'chromosome'; the token ends at the second space (if any).
                start = pos + header[pos:].find(' ') + 1
                pos3 = header[start:].find(' ')
                if pos3 == -1:
                    new_header = 'chr' + header[start:]
                else:
                    new_header = 'chr' + header[start:start + pos3]
            elif 'chr' in header:
                # 'chr' without 'chromosome': assume the header is already
                # in its final form.
                new_header = header
            elif 'mitochondrion' in header:
                new_header = 'chrM'
            else:
                # Format the message before raising so the intended
                # exception is the one that propagates.
                raise Exception('Invalid DM genome header: %s!' % header)

            seq = seqs[i]
            if extension in ('.fa', '.fna', '.faa', '.fasta'):
                pgfile.write('>' + new_header + '\n')
                pgfile.write(seq + '\n')
            elif extension in ('.fq', '.fastq'):
                pgfile.write('@' + new_header + '\n')
                pgfile.write(seq + '\n')
                pgfile.write('+' + new_header + '\n')
                pgfile.write(quals[i] + '\n')
            else:
                pgfile.write(r'@ERROR occured. File is NOT COMPLETE!')
                raise Exception('Invalid file extension: %s' %
                                file_extension)
Exemple #11
0
def split(readsfile, namesfile):
    """Split a reads file in two according to a list of read names.

    A read's name is its header up to the first space. Reads whose name is
    listed in ``namesfile`` are written to ``<name>1<ext>``, all other reads
    to ``<name>2<ext>``; headers are shortened to the name in the output.
    The numbers of reads written to each file are reported on stderr.

    Args:
        readsfile: Path of the FASTA/FASTQ reads file.
        namesfile: Path of a text file with one read name per line.

    Raises:
        Exception: If a read has to be written and the extension of
            ``readsfile`` is neither a FASTA nor a FASTQ extension.
    """
    fname, fext = os.path.splitext(readsfile)
    extension = fext.lower()
    is_fasta = extension in ('.fa', '.fna', '.faa', '.fasta')
    is_fastq = extension in ('.fq', '.fastq')

    readsfile1 = fname + '1' + fext
    readsfile2 = fname + '2' + fext

    count1 = count2 = 0
    with open(readsfile1, 'w') as file1, open(readsfile2, 'w') as file2:
        [headers, seqs, quals] = read_fastq(readsfile)

        # A set gives constant-time lookups. Terminators are stripped
        # explicitly so an unterminated last line is not truncated.
        with open(namesfile, 'r') as nfile:
            names = set(line.rstrip('\r\n') for line in nfile)

        for i, full_header in enumerate(headers):
            # Keep everything before the first space; a header without any
            # space is kept whole.
            header = full_header.split(' ', 1)[0]
            seq = seqs[i]

            selected = header in names
            target = file1 if selected else file2

            if is_fasta:
                target.write('>' + header + '\n')
                target.write(seq + '\n')
            elif is_fastq:
                target.write('@' + header + '\n')
                target.write(seq + '\n')
                target.write('+' + header + '\n')
                target.write(quals[i] + '\n')
            else:
                # Raised for both outputs so that reads are never silently
                # dropped while still being counted.
                raise Exception('Invalid extension for reads file: %s' % fext)

            if selected:
                count1 += 1
            else:
                count2 += 1

    sys.stderr.write('\n%d reads in file1; %d reads n file2\n' %
                     (count1, count2))
Exemple #12
0
def split(readsfile, namesfile):
    """Partition the reads of ``readsfile`` by membership in ``namesfile``.

    The name of a read is the part of its header before the first space.
    Reads whose name appears in ``namesfile`` go to ``<name>1<ext>``, the
    rest go to ``<name>2<ext>``; in both outputs the header is reduced to the
    name. A summary with both counts is written to stderr.

    Args:
        readsfile: Path of the FASTA/FASTQ reads file.
        namesfile: Path of a text file with one read name per line.

    Raises:
        Exception: If a read has to be written and the extension of
            ``readsfile`` is neither a FASTA nor a FASTQ extension.
    """
    fname, fext = os.path.splitext(readsfile)
    extension = fext.lower()
    is_fasta = extension in (".fa", ".fna", ".faa", ".fasta")
    is_fastq = extension in (".fq", ".fastq")

    readsfile1 = fname + "1" + fext
    readsfile2 = fname + "2" + fext

    count1 = count2 = 0
    with open(readsfile1, "w") as file1, open(readsfile2, "w") as file2:
        [headers, seqs, quals] = read_fastq(readsfile)

        # A set gives constant-time lookups. Terminators are stripped
        # explicitly so an unterminated last line is not truncated.
        with open(namesfile, "r") as nfile:
            names = set(line.rstrip("\r\n") for line in nfile)

        for i, full_header in enumerate(headers):
            # Keep everything before the first space; a header without any
            # space is kept whole.
            header = full_header.split(" ", 1)[0]
            seq = seqs[i]

            selected = header in names
            target = file1 if selected else file2

            if is_fasta:
                target.write(">" + header + "\n")
                target.write(seq + "\n")
            elif is_fastq:
                target.write("@" + header + "\n")
                target.write(seq + "\n")
                target.write("+" + header + "\n")
                target.write(quals[i] + "\n")
            else:
                # Raised for both outputs so that reads are never silently
                # dropped while still being counted.
                raise Exception("Invalid extension for reads file: %s" % fext)

            if selected:
                count1 += 1
            else:
                count2 += 1

    sys.stderr.write("\n%d reads in file1; %d reads n file2\n" %
                     (count1, count2))
Exemple #13
0
def get_sam_header(reference_file):
    """Return the ``@SQ`` SAM header lines for every sequence in a reference.

    One line ``@SQ<TAB>SN:<name><TAB>LN:<length>`` is produced per sequence.
    The name is the first whitespace-delimited token of the sequence header,
    because a SAM reference name must not contain whitespace; a header with
    no tokens is used as is.

    Args:
        reference_file: Path of the reference FASTA/FASTQ file.

    Returns:
        The header lines joined into one string (empty for no sequences).
    """
    [headers, seqs, quals] = fastqparser.read_fastq(reference_file)

    lines = []
    for i, header in enumerate(headers):
        fields = header.split()
        name = fields[0] if fields else header
        lines.append('@SQ\tSN:%s\tLN:%d\n' % (name, len(seqs[i])))

    # Joining once avoids repeated string concatenation.
    return ''.join(lines)
def get_sam_header(reference_file):
    """Build the ``@SQ`` SAM header lines for a reference file.

    Each sequence yields ``@SQ<TAB>SN:<name><TAB>LN:<length>`` where the name
    is the first whitespace-delimited token of the sequence header.

    Args:
        reference_file: Path of the reference FASTA/FASTQ file.

    Returns:
        All header lines concatenated into a single string.
    """
    headers, seqs, quals = fastqparser.read_fastq(reference_file)

    sq_lines = []
    for idx, full_name in enumerate(headers):
        short_name = full_name.split()[0]
        seq_length = len(seqs[idx])
        sq_lines.append('@SQ\tSN:%s\tLN:%d\n' % (short_name, seq_length))

    return ''.join(sq_lines)
def prepare_dm_genome(genome_file):
    """Produce ``<name>_P<ext>`` from a genome file, renaming headers to ``chr*``.

    Records whose header contains any entry of ``bad_strings_genomes`` are
    left out. Kept records get a new header:

    * if the header contains ``chromosome``: ``chr`` followed by the text
      between the first and second space after that word;
    * else if it contains ``chr``: the header unchanged;
    * else if it contains ``mitochondrion``: ``chrM``.

    The extension of ``genome_file`` selects FASTA or FASTQ output.

    Args:
        genome_file: Path of the FASTA/FASTQ genome file.

    Raises:
        Exception: If a kept header matches none of the rules above, or if
            the extension is not a recognised FASTA/FASTQ extension (an error
            marker is written to the output file first in that case).
    """
    filename, file_extension = os.path.splitext(genome_file)
    processed_genome_file = filename + '_P' + file_extension
    [headers, seqs, quals] = read_fastq(genome_file)

    extension = file_extension.lower()

    with open(processed_genome_file, 'w') as pgfile:
        for i, header in enumerate(headers):
            # Skip records that contain any disqualifying entries.
            if any(badstring in header for badstring in bad_strings_genomes):
                continue

            pos = header.find('chromosome')
            if pos > -1:
                # Start of the token that follows the first space after
                # 'chromosome'; the token ends at the second space (if any).
                start = pos + header[pos:].find(' ') + 1
                pos3 = header[start:].find(' ')
                if pos3 == -1:
                    new_header = 'chr' + header[start:]
                else:
                    new_header = 'chr' + header[start:start + pos3]
            elif 'chr' in header:
                # 'chr' without 'chromosome': assume the header is already
                # in its final form.
                new_header = header
            elif 'mitochondrion' in header:
                new_header = 'chrM'
            else:
                # Format the message before raising so the intended
                # exception is the one that propagates.
                raise Exception('Invalid DM genome header: %s!' % header)

            seq = seqs[i]
            if extension in ('.fa', '.fna', '.faa', '.fasta'):
                pgfile.write('>' + new_header + '\n')
                pgfile.write(seq + '\n')
            elif extension in ('.fq', '.fastq'):
                pgfile.write('@' + new_header + '\n')
                pgfile.write(seq + '\n')
                pgfile.write('+' + new_header + '\n')
                pgfile.write(quals[i] + '\n')
            else:
                pgfile.write(r'@ERROR occured. File is NOT COMPLETE!')
                raise Exception('Invalid file extension: %s' % file_extension)
Exemple #16
0
def TEST_SAM_TO_CONTIG(single_contig_file, contig_sam,
                       output_alt_contig_fasta):
    """Construct an alternate contig from SAM alignments and write it as FASTA.

    The sequences read from ``single_contig_file`` and the alignments loaded
    from ``contig_sam`` are given to
    ``construct_contig_from_overlapping_sams``; the returned contig is saved
    to ``output_alt_contig_fasta`` with the header ``Alternate contig``.

    Args:
        single_contig_file: Path of the file holding the contig sequence(s).
        contig_sam: Path of the SAM file with alignments to the contig.
        output_alt_contig_fasta: Path of the FASTA file to create.
    """
    [ctg_headers, ctg_seqs,
     ctg_quals] = fastqparser.read_fastq(single_contig_file)
    [headers, contig_sams] = utility_sam.LoadSAM(contig_sam)
    [new_contig, non_clipped_len, new_contig_cigar
     ] = construct_contig_from_overlapping_sams(ctg_seqs, contig_sams)

    # The context manager closes the file even if a write fails.
    with open(output_alt_contig_fasta, 'w') as fp:
        fp.write('>Alternate contig\n')
        fp.write('%s\n' % new_contig)
def split(readsfile, namesfile):
    """Write listed reads to ``<name>1<ext>`` and the others to ``<name>2<ext>``.

    A read counts as listed when its name, i.e. its header up to the first
    space, appears in ``namesfile``. Headers are reduced to the name in both
    output files, and the two read counts are reported on stderr.

    Args:
        readsfile: Path of the FASTA/FASTQ reads file.
        namesfile: Path of a text file with one read name per line.

    Raises:
        Exception: If a read has to be written and the extension of
            ``readsfile`` is neither a FASTA nor a FASTQ extension.
    """
    fname, fext = os.path.splitext(readsfile)
    extension = fext.lower()
    is_fasta = extension in ('.fa', '.fna', '.faa', '.fasta')
    is_fastq = extension in ('.fq', '.fastq')

    readsfile1 = fname + '1' + fext
    readsfile2 = fname + '2' + fext

    count1 = count2 = 0
    with open(readsfile1, 'w') as file1, open(readsfile2, 'w') as file2:
        [headers, seqs, quals] = read_fastq(readsfile)

        # A set gives constant-time lookups. Terminators are stripped
        # explicitly so an unterminated last line is not truncated.
        with open(namesfile, 'r') as nfile:
            names = set(line.rstrip('\r\n') for line in nfile)

        for i, full_header in enumerate(headers):
            # Keep everything before the first space; a header without any
            # space is kept whole.
            header = full_header.split(' ', 1)[0]
            seq = seqs[i]

            selected = header in names
            target = file1 if selected else file2

            if is_fasta:
                target.write('>' + header + '\n')
                target.write(seq + '\n')
            elif is_fastq:
                target.write('@' + header + '\n')
                target.write(seq + '\n')
                target.write('+' + header + '\n')
                target.write(quals[i] + '\n')
            else:
                # Raised for both outputs so that reads are never silently
                # dropped while still being counted.
                raise Exception('Invalid extension for reads file: %s' % fext)

            if selected:
                count1 += 1
            else:
                count2 += 1

    sys.stderr.write('\n%d reads in file1; %d reads n file2\n' % (count1, count2))
Exemple #18
0
def ProcessFromFiles(reference_file, sam_path, out_accuracy_counts_path, count_indels_as_events=False):
    """Load the reference sequences and run ``ProcessSAM`` on a SAM file.

    Every reference sequence is registered under two keys: its full header
    and the first whitespace-delimited token of that header.

    Args:
        reference_file: Path of the reference FASTA/FASTQ file.
        sam_path: Forwarded unchanged to ``ProcessSAM``.
        out_accuracy_counts_path: Forwarded unchanged to ``ProcessSAM``.
        count_indels_as_events: Forwarded unchanged to ``ProcessSAM``.
    """
    ref_headers, ref_seqs, ref_quals = fastqparser.read_fastq(reference_file)

    references = {}
    for idx, full_name in enumerate(ref_headers):
        sequence = ref_seqs[idx]
        references[full_name] = sequence
        references[full_name.split()[0]] = sequence

    ProcessSAM(references, sam_path, out_accuracy_counts_path, count_indels_as_events)
def convert_blast_to_sam(reference_file, reads_file, blast_out_file, sam_file):
    """Load the reference and read indexes and open the BLAST output file.

    Two lookup tables are built, one for the reference and one for the reads,
    each mapping a full header and the first whitespace-delimited token of
    that header to the position of the sequence in its file. The BLAST output
    is then opened for reading; on failure a message is written to stderr and
    the process exits with status 1.

    NOTE(review): in the code visible here the opened handle is neither read
    nor closed, and ``sam_file`` is not used.

    Args:
        reference_file: Path of the reference FASTA/FASTQ file.
        reads_file: Path of the reads FASTA/FASTQ file.
        blast_out_file: Path of the BLAST output to open.
        sam_file: Not used in the visible body.
    """
    sys.stderr.write('[%s wrapper] Converting BLAST output to SAM file...\n' % (MAPPER_NAME))

    ref_headers, ref_seqs, ref_quals = fastqparser.read_fastq(reference_file)
    ref_header_hash = {}
    for ref_idx, ref_name in enumerate(ref_headers):
        ref_header_hash[ref_name] = ref_idx
        ref_header_hash[ref_name.split()[0]] = ref_idx

    read_headers, read_seqs, read_quals = fastqparser.read_fastq(reads_file)
    read_header_hash = {}
    for read_idx, read_name in enumerate(read_headers):
        read_header_hash[read_name] = read_idx
        read_header_hash[read_name.split()[0]] = read_idx

    try:
        fp_in = open(blast_out_file, 'r')
    except Exception:
        sys.stderr.write('ERROR: Could not open file "%s" for reading!\n' % blast_out_file)
        exit(1)
Exemple #20
0
def load_and_process_sam(in_sam, ref_file, fp_out):
    """Run ``stablyLeftAlign`` on every mapped alignment of a SAM file.

    Reference sequences are indexed both by full header and by the first
    whitespace-delimited token of the header; each mapped SAM line is then
    passed to ``stablyLeftAlign`` together with the sequence registered under
    its ``RefName``.

    Args:
        in_sam: Passed to ``parse_sam``.
        ref_file: Path of the reference FASTA/FASTQ file.
        fp_out: Not used by this function.
    """
    sam_headers, sam_lines = parse_sam(in_sam)
    headers_ref, seqs_ref, quals_ref = fastqparser.read_fastq(ref_file)

    ref_by_name = {}
    for idx, ref_seq in enumerate(seqs_ref):
        ref_name = headers_ref[idx]
        ref_by_name[ref_name] = ref_seq
        ref_by_name[ref_name.split()[0]] = ref_seq

    for record in sam_lines:
        # Unmapped alignments are skipped.
        if record.IsMapped() == False:
            continue
        stablyLeftAlign(record, ref_by_name[record.RefName], 1, False)
Exemple #21
0
def load_and_process_sam(in_sam, ref_file, fp_out):
    """Apply ``stablyLeftAlign`` to each mapped alignment parsed from ``in_sam``.

    The reference sequences are made available under their full header and
    under the first whitespace-delimited token of that header; the sequence
    looked up by an alignment's ``RefName`` is passed along with it.

    Args:
        in_sam: Passed to ``parse_sam``.
        ref_file: Path of the reference FASTA/FASTQ file.
        fp_out: Not used by this function.
    """
    sam_headers, sam_lines = parse_sam(in_sam)
    headers_ref, seqs_ref, quals_ref = fastqparser.read_fastq(ref_file)

    reference_lookup = {}
    for position, sequence in enumerate(seqs_ref):
        full_header = headers_ref[position]
        reference_lookup[full_header] = sequence
        reference_lookup[full_header.split()[0]] = sequence

    for alignment in sam_lines:
        # Unmapped alignments are skipped.
        if alignment.IsMapped() == False:
            continue
        matching_ref = reference_lookup[alignment.RefName]
        stablyLeftAlign(alignment, matching_ref, 1, False)
Exemple #22
0
def fixAfterRacon(consensus_file, original_file, output_file=sys.stdout):
    """Merge consensus sequences with the originals that have no consensus.

    For every sequence of ``original_file``, in order, each consensus
    sequence whose header with its first 10 characters removed equals the
    original header is written as FASTA; when there is none the original
    sequence is written instead. A warning goes to stderr when an original
    has more than one consensus, and a summary with both sequence counts is
    written to stdout at the end.

    NOTE(review): the 10 stripped characters are presumably a fixed prefix
    added by the consensus tool — confirm.

    Args:
        consensus_file: Path of the FASTA/FASTQ file with consensus sequences.
        original_file: Path of the FASTA/FASTQ file with original sequences.
        output_file: Where the FASTA records are written: an object with a
            ``write`` method, or a path that is opened for writing.
    """
    [cheaders, cseqs, cquals] = read_fastq(consensus_file)
    [oheaders, oseqs, oquals] = read_fastq(original_file)

    clen = len(cheaders)
    olen = len(oheaders)

    # Index the consensus sequences once by the name they are matched on,
    # keeping file order, instead of rescanning them for every original.
    consensus_by_name = {}
    for cidx, cheader in enumerate(cheaders):
        consensus_by_name.setdefault(cheader[10:], []).append(cidx)

    opened_here = not hasattr(output_file, 'write')
    out = open(output_file, 'w') if opened_here else output_file
    try:
        for oidx, oheader in enumerate(oheaders):
            matches = consensus_by_name.get(oheader, [])
            for cidx in matches:
                # Write consensus sequence to output
                out.write('>%s\n' % cheaders[cidx])
                out.write('%s\n' % cseqs[cidx])

            if not matches:
                # Write original sequence to output
                out.write('>%s\n' % oheader)
                out.write('%s\n' % oseqs[oidx])

            if len(matches) > 1:
                sys.stderr.write(
                    '\nFound an original with %d corresponding consensuses' %
                    len(matches))
                sys.stderr.write('\n%s' % oheader)
    finally:
        # Only close what this function opened itself.
        if opened_here:
            out.close()

    sys.stdout.write('\n')
    sys.stdout.write('\nNumber of sequences in original file: %d' % olen)
    sys.stdout.write('\nNumber of sequences in consensus file: %d' % clen)
    sys.stdout.write('\n')
Exemple #23
0
def get_kmers_from_positions(fastq_file, pos_list, k):
    """Collect and print the sequence context around a list of positions.

    For each item of ``pos_list`` the base at the given position is taken
    together with up to ``k`` bases on either side (fewer at the sequence
    ends). A line with the observed context and the same context carrying
    the item's ref and alt values is written to stdout. Items whose sequence
    name is unknown, or whose position lies outside the sequence, are
    reported on stderr and skipped.

    Args:
        fastq_file: Path of the FASTA/FASTQ file holding the sequences.
        pos_list: Iterable of items indexed as ``[name, position, ref, alt,
            info]``; the position is one-based.
        k: Number of flanking bases to take on each side.

    Returns:
        A list of ``[kmer, name, position]`` entries with zero-based
        positions, where ``kmer`` is ``<before>_<base>_<after>``.
    """
    kmers = []

    [headers, seqs, quals] = fastqparser.read_fastq(fastq_file)
    # Sequences can be looked up by full header or by its first token.
    header_hash = {}
    for idx, header in enumerate(headers):
        header_hash[header] = idx
        header_hash[header.split()[0]] = idx

    # Skipped items still advance the counter printed with each line.
    for i, pos_item in enumerate(pos_list, 1):
        chrom = pos_item[0]
        pos = pos_item[1] - 1
        ref = pos_item[2]
        alt = pos_item[3]
        info = pos_item[4]
        try:
            seq = seqs[header_hash[chrom]]
        except (KeyError, IndexError) as e:
            sys.stderr.write(str(e) + '\n')
            continue

        if pos < 0 or pos >= len(seq):
            sys.stderr.write(
                'Position %d is outside of sequence "%s" (length %d)\n' %
                (pos + 1, chrom, len(seq)))
            continue

        # Clamp the start: a negative slice start would wrap around to the
        # end of the sequence instead of stopping at its beginning.
        kmer_before = seq[max(0, pos - k):pos]
        kmer_after = seq[(pos + 1):(pos + 1 + k)]
        kmer = kmer_before + '_' + seq[pos] + '_' + kmer_after
        kmers.append([kmer, chrom, pos])

        kmer_ref = kmer_before + '_' + ref + '_' + kmer_after
        kmer_alt = kmer_before + '_' + alt + '_' + kmer_after

        sys.stdout.write('[%d] %s\t%s\t%s\t%d\t%s\n' %
                         (i, kmer, kmer_ref, kmer_alt, (pos + 1), info))

    return kmers
def fixAfterRacon(consensus_file, original_file, output_file = sys.stdout):
    """Write consensus sequences, falling back to originals without consensus.

    The originals are processed in file order. A consensus sequence belongs
    to an original when its header, with the first 10 characters removed,
    equals the original header. All matching consensus sequences are written
    as FASTA; an original without any match is written itself. Originals
    with several matches are reported on stderr, and a summary with both
    sequence counts is written to stdout at the end.

    NOTE(review): the 10 stripped characters are presumably a fixed prefix
    added by the consensus tool — confirm.

    Args:
        consensus_file: Path of the FASTA/FASTQ file with consensus sequences.
        original_file: Path of the FASTA/FASTQ file with original sequences.
        output_file: Where the FASTA records are written: an object with a
            ``write`` method, or a path that is opened for writing.
    """
    [cheaders, cseqs, cquals] = read_fastq(consensus_file)
    [oheaders, oseqs, oquals] = read_fastq(original_file)

    clen = len(cheaders)
    olen = len(oheaders)

    # Index the consensus sequences once by the name they are matched on,
    # keeping file order, instead of rescanning them for every original.
    consensus_by_name = {}
    for cidx, cheader in enumerate(cheaders):
        consensus_by_name.setdefault(cheader[10:], []).append(cidx)

    opened_here = not hasattr(output_file, 'write')
    out = open(output_file, 'w') if opened_here else output_file
    try:
        for oidx, oheader in enumerate(oheaders):
            matches = consensus_by_name.get(oheader, [])
            for cidx in matches:
                # Write consensus sequence to output
                out.write('>%s\n' % cheaders[cidx])
                out.write('%s\n' % cseqs[cidx])

            if not matches:
                # Write original sequence to output
                out.write('>%s\n' % oheader)
                out.write('%s\n' % oseqs[oidx])

            if len(matches) > 1:
                sys.stderr.write('\nFound an original with %d corresponding consensuses' % len(matches))
                sys.stderr.write('\n%s' % oheader)
    finally:
        # Only close what this function opened itself.
        if opened_here:
            out.close()

    sys.stdout.write('\n')
    sys.stdout.write('\nNumber of sequences in original file: %d' % olen)
    sys.stdout.write('\nNumber of sequences in consensus file: %d' % clen)
    sys.stdout.write('\n')
Exemple #25
0
def get_kmers_from_positions(fastq_file, pos_list, k):
    """Print and return the k-base context around each listed position.

    Every item of ``pos_list`` names a sequence and a one-based position.
    The base at that position is combined with up to ``k`` bases before and
    after it (fewer at the sequence ends), and a line showing the observed
    context next to the same context carrying the item's ref and alt values
    is written to stdout. Items with an unknown sequence name or a position
    outside the sequence are reported on stderr and skipped.

    Args:
        fastq_file: Path of the FASTA/FASTQ file holding the sequences.
        pos_list: Iterable of items indexed as ``[name, position, ref, alt,
            info]``; the position is one-based.
        k: Number of flanking bases to take on each side.

    Returns:
        A list of ``[kmer, name, position]`` entries with zero-based
        positions, where ``kmer`` is ``<before>_<base>_<after>``.
    """
    kmers = []

    [headers, seqs, quals] = fastqparser.read_fastq(fastq_file)
    # Sequences can be looked up by full header or by its first token.
    header_hash = {}
    for idx, header in enumerate(headers):
        header_hash[header] = idx
        header_hash[header.split()[0]] = idx

    # Skipped items still advance the counter printed with each line.
    for i, pos_item in enumerate(pos_list, 1):
        chrom = pos_item[0]
        pos = pos_item[1] - 1
        ref = pos_item[2]
        alt = pos_item[3]
        info = pos_item[4]
        try:
            seq = seqs[header_hash[chrom]]
        except (KeyError, IndexError) as e:
            sys.stderr.write(str(e) + "\n")
            continue

        if pos < 0 or pos >= len(seq):
            sys.stderr.write(
                "Position %d is outside of sequence \"%s\" (length %d)\n"
                % (pos + 1, chrom, len(seq))
            )
            continue

        # Clamp the start: a negative slice start would wrap around to the
        # end of the sequence instead of stopping at its beginning.
        kmer_before = seq[max(0, pos - k) : pos]
        kmer_after = seq[(pos + 1) : (pos + 1 + k)]
        kmer = kmer_before + "_" + seq[pos] + "_" + kmer_after
        kmers.append([kmer, chrom, pos])

        kmer_ref = kmer_before + "_" + ref + "_" + kmer_after
        kmer_alt = kmer_before + "_" + alt + "_" + kmer_after

        sys.stdout.write("[%d] %s\t%s\t%s\t%d\t%s\n" % (i, kmer, kmer_ref, kmer_alt, (pos + 1), info))

    return kmers
Exemple #26
0
def ProcessFromFiles(reference_file,
                     sam_path,
                     out_accuracy_counts_path,
                     count_indels_as_events=False):
    """Load the reference sequences and pass them, keyed by name, to ProcessSAM.

    Each sequence read from ``reference_file`` is registered twice in the
    lookup dictionary: under its full header and under the first
    whitespace-delimited token of that header.  The remaining arguments are
    forwarded to ``ProcessSAM`` unchanged.
    """
    ref_headers, ref_seqs, _ref_quals = fastqparser.read_fastq(reference_file)

    references = {}
    for idx, full_header in enumerate(ref_headers):
        sequence = ref_seqs[idx]
        references[full_header] = sequence
        references[full_header.split()[0]] = sequence

    ProcessSAM(references, sam_path, out_accuracy_counts_path,
               count_indels_as_events)
Exemple #27
0
def split_transcriptome(transcriptome_file):
    """Randomly distribute the records of a FASTA/FASTQ file over three files.

    The output files are created next to the input and are named
    ``<name>_G1<ext>``, ``<name>_G2<ext>`` and ``<name>_G3<ext>``.  For every
    record a random integer in ``[0, total]`` is drawn and compared with the
    module-level thresholds ``limits_sc``: below ``limits_sc[0]`` the record
    goes to G1, below ``limits_sc[1]`` to G2, below ``limits_sc[2]`` to G3,
    otherwise it is dropped.  ``total`` is the larger of
    ``sum(split_sc.values())`` and the number of records.

    Records are written as FASTA or FASTQ according to the input extension;
    for any other extension the three files are created but stay empty.
    """
    split = split_sc
    limits = limits_sc

    filename, file_extension = os.path.splitext(transcriptome_file)
    g1_filename = filename + "_G1" + file_extension
    g2_filename = filename + "_G2" + file_extension
    g3_filename = filename + "_G3" + file_extension
    [headers, seqs, quals] = read_fastq(transcriptome_file)

    total = max(sum(split.values()), len(headers))

    random.seed()

    # os.path.splitext keeps the leading dot, so every entry must start with
    # one.  The format does not change per record, so decide it once.
    extension = file_extension.lower()
    is_fasta = extension in (".fa", ".fna", ".faa", ".fasta")
    is_fastq = extension in (".fq", ".fastq")

    with open(g1_filename, "w") as g1file, \
            open(g2_filename, "w") as g2file, \
            open(g3_filename, "w") as g3file:
        for i, header in enumerate(headers):
            seq = seqs[i]
            qual = quals[i]
            rnum = random.randint(0, total)  # Generate random number
            if rnum < limits[0]:
                gfile = g1file
            elif rnum < limits[1]:
                gfile = g2file
            elif rnum < limits[2]:
                gfile = g3file
            else:
                continue  # Skip this sequence

            if is_fasta:
                gfile.write(">" + header + "\n")
                gfile.write(seq + "\n")
            elif is_fastq:
                gfile.write("@" + header + "\n")
                gfile.write(seq + "\n")
                gfile.write("+" + header + "\n")
                gfile.write(qual + "\n")
Exemple #28
0
def split_transcriptome(transcriptome_file):
    """Randomly distribute the records of a FASTA/FASTQ file over three files.

    The output files are created next to the input and are named
    ``<name>_G1<ext>``, ``<name>_G2<ext>`` and ``<name>_G3<ext>``.  For every
    record a random integer in ``[0, total]`` is drawn and compared with the
    module-level thresholds ``limits_sc``: below ``limits_sc[0]`` the record
    goes to G1, below ``limits_sc[1]`` to G2, below ``limits_sc[2]`` to G3,
    otherwise it is dropped.  ``total`` is the larger of
    ``sum(split_sc.values())`` and the number of records.

    Records are written as FASTA or FASTQ according to the input extension;
    for any other extension the three files are created but stay empty.
    """
    split = split_sc
    limits = limits_sc

    filename, file_extension = os.path.splitext(transcriptome_file)
    g1_filename = filename + '_G1' + file_extension
    g2_filename = filename + '_G2' + file_extension
    g3_filename = filename + '_G3' + file_extension
    [headers, seqs, quals] = read_fastq(transcriptome_file)

    total = max(sum(split.values()), len(headers))

    random.seed()

    # os.path.splitext keeps the leading dot, so every entry must start with
    # one.  The format does not change per record, so decide it once.
    extension = file_extension.lower()
    is_fasta = extension in ('.fa', '.fna', '.faa', '.fasta')
    is_fastq = extension in ('.fq', '.fastq')

    with open(g1_filename, 'w') as g1file, \
            open(g2_filename, 'w') as g2file, \
            open(g3_filename, 'w') as g3file:
        for i, header in enumerate(headers):
            seq = seqs[i]
            qual = quals[i]
            rnum = random.randint(0, total)  # Generate random number
            if rnum < limits[0]:
                gfile = g1file
            elif rnum < limits[1]:
                gfile = g2file
            elif rnum < limits[2]:
                gfile = g3file
            else:
                continue  # Skip this sequence

            if is_fasta:
                gfile.write('>' + header + '\n')
                gfile.write(seq + '\n')
            elif is_fastq:
                gfile.write('@' + header + '\n')
                gfile.write(seq + '\n')
                gfile.write('+' + header + '\n')
                gfile.write(qual + '\n')
Exemple #29
0
def get_circular_score(ref_path, contig_path, temp_folder):
    """Align contigs against doubled reverse-complemented references and print scores.

    Every reference sequence read from ``ref_path`` is written twice in a row
    (``seq + seq``) to ``<temp_folder>/circ-fwd.fa`` and its reverse complement,
    likewise doubled, to ``<temp_folder>/circ-rev.fa``.  ``temp_folder`` is
    created when it does not exist.  The tool at ``EDLIB_PATH`` is then run
    with ``-m HW`` on ``contig_path`` and the reverse file, and each score
    parsed from its stdout is printed to stdout as ``[index] score rev``.

    Only the reverse orientation is aligned; the forward file is written but
    not used by this function.
    """
    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder)

    [headers_ref, seqs_ref, quals_ref] = fastqparser.read_fastq(ref_path)

    circularized_fwd_path = '%s/circ-fwd.fa' % (temp_folder)
    circularized_rev_path = '%s/circ-rev.fa' % (temp_folder)

    # Context managers make sure both files are closed (and flushed) before
    # the aligner is started, even if writing a record raises.
    with open(circularized_fwd_path, 'w') as fp_fwd, \
            open(circularized_rev_path, 'w') as fp_rev:
        for i, seq in enumerate(seqs_ref):
            rev_seq = fastqparser.revcomp_seq(seq)
            fp_fwd.write('>%s\n%s%s\n' % (headers_ref[i], seq, seq))
            fp_rev.write('>%s\n%s%s\n' % (headers_ref[i], rev_seq, rev_seq))

    sys.stdout.write('Aligning the rev orientation...\n')
    command = '%s %s %s -m HW' % (EDLIB_PATH, contig_path,
                                  circularized_rev_path)
    [rc_rev, rstdout_rev,
     rstderr_rev] = execute_command_with_ret(DRY_RUN, command)
    scores_rev = parse_edlib_scores(rstdout_rev)
    for i, score in enumerate(scores_rev):
        sys.stdout.write('[%d] %d %s\n' % (i, score, 'rev'))
    sys.stdout.write('\n')
def adjustFqHeaders(fastqfile, findStr, replaceStr):
    """Rewrite ``fastqfile`` in place, replacing a prefix of its headers.

    Every header that starts with ``findStr`` gets that prefix replaced by
    ``replaceStr``; all other headers are kept.  The file is then rewritten as
    FASTA or FASTQ depending on its extension.

    Returns:
        A tuple ``(replaced, notreplaced)`` with the number of headers that
        were and were not changed.

    Raises:
        Exception: if the file contains records and its extension is neither
            a FASTA nor a FASTQ one.  At that point the file has already been
            truncated and an error marker has been written into it.
    """
    # Reading fastq file
    [headers, seqs, quals] = read_fastq(fastqfile)
    filename, file_extension = os.path.splitext(fastqfile)

    findLen = len(findStr)
    replaced = 0
    notreplaced = 0

    for i, header in enumerate(headers):
        if header.startswith(findStr):
            headers[i] = replaceStr + header[findLen:]
            replaced += 1
        else:
            notreplaced += 1

    # os.path.splitext keeps the leading dot, so every entry must start with
    # one.  The format does not change per record, so decide it once.
    extension = file_extension.lower()
    is_fasta = extension in ('.fa', '.fna', '.faa', '.fasta')
    is_fastq = extension in ('.fq', '.fastq')

    with open(fastqfile, 'w') as ffile:
        for i, header in enumerate(headers):
            seq = seqs[i]
            qual = quals[i]

            if is_fasta:
                ffile.write('>' + header + '\n')
                ffile.write(seq + '\n')
            elif is_fastq:
                ffile.write('@' + header + '\n')
                ffile.write(seq + '\n')
                ffile.write('+' + header + '\n')
                ffile.write(qual + '\n')
            else:
                ffile.write(r'@ERROR occured. File is NOT COMPLETE!')
                raise Exception('Invalid file extension: %s' % file_extension)

    return replaced, notreplaced
def split_transcriptome(transcriptome_file):
    """Randomly distribute the records of a FASTA/FASTQ file over three files.

    The output files are created next to the input and are named
    ``<name>_G1<ext>``, ``<name>_G2<ext>`` and ``<name>_G3<ext>``.  For every
    record a random integer in ``[0, total]`` is drawn and compared with the
    module-level thresholds ``limits_sc``: below ``limits_sc[0]`` the record
    goes to G1, below ``limits_sc[1]`` to G2, below ``limits_sc[2]`` to G3,
    otherwise it is dropped.  ``total`` is the larger of
    ``sum(split_sc.values())`` and the number of records.

    Records are written as FASTA or FASTQ according to the input extension;
    for any other extension the three files are created but stay empty.
    """
    split = split_sc
    limits = limits_sc

    filename, file_extension = os.path.splitext(transcriptome_file)
    g1_filename = filename + '_G1' + file_extension
    g2_filename = filename + '_G2' + file_extension
    g3_filename = filename + '_G3' + file_extension
    [headers, seqs, quals] = read_fastq(transcriptome_file)

    total = max(sum(split.values()), len(headers))

    random.seed()

    # os.path.splitext keeps the leading dot, so every entry must start with
    # one.  The format does not change per record, so decide it once.
    extension = file_extension.lower()
    is_fasta = extension in ('.fa', '.fna', '.faa', '.fasta')
    is_fastq = extension in ('.fq', '.fastq')

    with open(g1_filename, 'w') as g1file, \
            open(g2_filename, 'w') as g2file, \
            open(g3_filename, 'w') as g3file:
        for i, header in enumerate(headers):
            seq = seqs[i]
            qual = quals[i]
            rnum = random.randint(0, total)     # Generate random number
            if rnum < limits[0]:
                gfile = g1file
            elif rnum < limits[1]:
                gfile = g2file
            elif rnum < limits[2]:
                gfile = g3file
            else:
                continue       # Skip this sequence

            if is_fasta:
                gfile.write('>' + header + '\n')
                gfile.write(seq + '\n')
            elif is_fastq:
                gfile.write('@' + header + '\n')
                gfile.write(seq + '\n')
                gfile.write('+' + header + '\n')
                gfile.write(qual + '\n')
def adjustFqHeaders(fastqfile, findStr, replaceStr):
    """Rewrite ``fastqfile`` in place, replacing a prefix of its headers.

    Every header that starts with ``findStr`` gets that prefix replaced by
    ``replaceStr``; all other headers are kept.  The file is then rewritten as
    FASTA or FASTQ depending on its extension.

    Returns:
        A tuple ``(replaced, notreplaced)`` with the number of headers that
        were and were not changed.

    Raises:
        Exception: if the file contains records and its extension is neither
            a FASTA nor a FASTQ one.  At that point the file has already been
            truncated and an error marker has been written into it.
    """
    # Reading fastq file
    [headers, seqs, quals] = read_fastq(fastqfile)
    filename, file_extension = os.path.splitext(fastqfile)

    findLen = len(findStr)
    replaced = 0
    notreplaced = 0

    for i, header in enumerate(headers):
        if header.startswith(findStr):
            headers[i] = replaceStr + header[findLen:]
            replaced += 1
        else:
            notreplaced += 1

    # os.path.splitext keeps the leading dot, so every entry must start with
    # one.  The format does not change per record, so decide it once.
    extension = file_extension.lower()
    is_fasta = extension in ('.fa', '.fna', '.faa', '.fasta')
    is_fastq = extension in ('.fq', '.fastq')

    with open(fastqfile, 'w') as ffile:
        for i, header in enumerate(headers):
            seq = seqs[i]
            qual = quals[i]

            if is_fasta:
                ffile.write('>' + header + '\n')
                ffile.write(seq + '\n')
            elif is_fastq:
                ffile.write('@' + header + '\n')
                ffile.write(seq + '\n')
                ffile.write('+' + header + '\n')
                ffile.write(qual + '\n')
            else:
                ffile.write(r'@ERROR occured. File is NOT COMPLETE!')
                raise Exception('Invalid file extension: %s' % file_extension)

    return replaced, notreplaced
def adjustFqHeaders(fastqfile, findStr, replaceStr):
    """Rewrite ``fastqfile`` in place, replacing a prefix of its headers.

    Every header that starts with ``findStr`` gets that prefix replaced by
    ``replaceStr``; all other headers are kept.  The file is then rewritten as
    FASTA or FASTQ depending on its extension.

    Returns:
        A tuple ``(replaced, notreplaced)`` with the number of headers that
        were and were not changed.

    Raises:
        Exception: if the file contains records and its extension is neither
            a FASTA nor a FASTQ one.  At that point the file has already been
            truncated and an error marker has been written into it.
    """
    # Reading fastq file
    [headers, seqs, quals] = read_fastq(fastqfile)
    filename, file_extension = os.path.splitext(fastqfile)

    findLen = len(findStr)
    replaced = 0
    notreplaced = 0

    for i, header in enumerate(headers):
        if header.startswith(findStr):
            headers[i] = replaceStr + header[findLen:]
            replaced += 1
        else:
            notreplaced += 1

    # os.path.splitext keeps the leading dot, so every entry must start with
    # one.  The format does not change per record, so decide it once.
    extension = file_extension.lower()
    is_fasta = extension in (".fa", ".fna", ".faa", ".fasta")
    is_fastq = extension in (".fq", ".fastq")

    with open(fastqfile, "w") as ffile:
        for i, header in enumerate(headers):
            seq = seqs[i]
            qual = quals[i]

            if is_fasta:
                ffile.write(">" + header + "\n")
                ffile.write(seq + "\n")
            elif is_fastq:
                ffile.write("@" + header + "\n")
                ffile.write(seq + "\n")
                ffile.write("+" + header + "\n")
                ffile.write(qual + "\n")
            else:
                ffile.write(r"@ERROR occured. File is NOT COMPLETE!")
                raise Exception("Invalid file extension: %s" % file_extension)

    return replaced, notreplaced
Exemple #34
0
def process_sam_on_the_fly(in_sam, ref_file, fp_out):
    """Validate the inputs, index the reference sequences and open the SAM file.

    Args:
        in_sam: Path to the SAM file to read.
        ref_file: Path to the reference file, read with
            ``fastqparser.read_fastq``.
        fp_out: Not used in the visible part of the function.

    The process exits with status 1 (after a message on stderr) when either
    path does not exist or when the SAM file cannot be opened.

    NOTE(review): the visible code ends right after opening ``in_sam``;
    ``fp_in``, ``seqs_ref_hash`` and ``fp_out`` are never used and ``fp_in`` is
    never closed here -- the remainder of the function appears to be missing.
    """
    for path in (in_sam, ref_file):
        if not os.path.exists(path):
            sys.stderr.write('ERROR: File "%s" does not exist!\n' % (path))
            # sys.exit is always available; the bare ``exit`` helper is only
            # injected by the ``site`` module.
            sys.exit(1)

    [headers_ref, seqs_ref, quals_ref] = fastqparser.read_fastq(ref_file)
    # Each sequence is reachable by its full header and by the first
    # whitespace-delimited token of that header.
    seqs_ref_hash = {}
    for i, seq in enumerate(seqs_ref):
        header = headers_ref[i]
        seqs_ref_hash[header] = seq
        seqs_ref_hash[header.split()[0]] = seq

    try:
        fp_in = open(in_sam, 'r')
    except IOError as e:
        sys.stderr.write('ERROR: Could not open file %s for reading! Exiting.\n' % (in_sam))
        sys.stderr.write(str(e))
        sys.exit(1)
Exemple #35
0
def process_sam_on_the_fly(in_sam, ref_file, fp_out):
    """Validate the inputs, index the reference sequences and open the SAM file.

    Args:
        in_sam: Path to the SAM file to read.
        ref_file: Path to the reference file, read with
            ``fastqparser.read_fastq``.
        fp_out: Not used in the visible part of the function.

    The process exits with status 1 (after a message on stderr) when either
    path does not exist or when the SAM file cannot be opened.

    NOTE(review): the visible code ends right after opening ``in_sam``;
    ``fp_in``, ``seqs_ref_hash`` and ``fp_out`` are never used and ``fp_in`` is
    never closed here -- the remainder of the function appears to be missing.
    """
    for path in (in_sam, ref_file):
        if not os.path.exists(path):
            sys.stderr.write('ERROR: File "%s" does not exist!\n' % (path))
            # sys.exit is always available; the bare ``exit`` helper is only
            # injected by the ``site`` module.
            sys.exit(1)

    [headers_ref, seqs_ref, quals_ref] = fastqparser.read_fastq(ref_file)
    # Each sequence is reachable by its full header and by the first
    # whitespace-delimited token of that header.
    seqs_ref_hash = {}
    for i, seq in enumerate(seqs_ref):
        header = headers_ref[i]
        seqs_ref_hash[header] = seq
        seqs_ref_hash[header.split()[0]] = seq

    try:
        fp_in = open(in_sam, 'r')
    except IOError as e:
        sys.stderr.write(
            'ERROR: Could not open file %s for reading! Exiting.\n' % (in_sam))
        sys.stderr.write(str(e))
        sys.exit(1)
Exemple #36
0
def expandHeader(fastfile, sstring):
    """Rewrite ``fastfile`` in place with ``sstring`` prepended to every header.

    The records are read with ``read_fastq`` and written back as FASTA or
    FASTQ depending on the file extension.

    Raises:
        Exception: if the file contains records and its extension is neither
            a FASTA nor a FASTQ one.  At that point the file has already been
            truncated and an error marker has been written into it.
    """
    filename, file_extension = os.path.splitext(fastfile)
    [headers, seqs, quals] = read_fastq(fastfile)

    # os.path.splitext keeps the leading dot, so every entry must start with
    # one.  The format does not change per record, so decide it once.
    extension = file_extension.lower()
    is_fasta = extension in ('.fa', '.fna', '.faa', '.fasta')
    is_fastq = extension in ('.fq', '.fastq')

    with open(fastfile, 'w') as ffile:
        for i, header in enumerate(headers):
            new_header = sstring + header
            seq = seqs[i]
            qual = quals[i]

            if is_fasta:
                ffile.write('>' + new_header + '\n')
                ffile.write(seq + '\n')
            elif is_fastq:
                ffile.write('@' + new_header + '\n')
                ffile.write(seq + '\n')
                ffile.write('+' + new_header + '\n')
                ffile.write(qual + '\n')
            else:
                ffile.write(r'@ERROR occured. File is NOT COMPLETE!')
                raise Exception('Invalid file extension: %s' % file_extension)
Exemple #37
0
def expandHeader(fastfile, sstring):
    """Rewrite ``fastfile`` in place with ``sstring`` prepended to every header.

    The records are read with ``read_fastq`` and written back as FASTA or
    FASTQ depending on the file extension.

    Raises:
        Exception: if the file contains records and its extension is neither
            a FASTA nor a FASTQ one.  At that point the file has already been
            truncated and an error marker has been written into it.
    """
    filename, file_extension = os.path.splitext(fastfile)
    [headers, seqs, quals] = read_fastq(fastfile)

    # os.path.splitext keeps the leading dot, so every entry must start with
    # one.  The format does not change per record, so decide it once.
    extension = file_extension.lower()
    is_fasta = extension in (".fa", ".fna", ".faa", ".fasta")
    is_fastq = extension in (".fq", ".fastq")

    with open(fastfile, "w") as ffile:
        for i, header in enumerate(headers):
            new_header = sstring + header
            seq = seqs[i]
            qual = quals[i]

            if is_fasta:
                ffile.write(">" + new_header + "\n")
                ffile.write(seq + "\n")
            elif is_fastq:
                ffile.write("@" + new_header + "\n")
                ffile.write(seq + "\n")
                ffile.write("+" + new_header + "\n")
                ffile.write(qual + "\n")
            else:
                ffile.write(r"@ERROR occured. File is NOT COMPLETE!")
                raise Exception("Invalid file extension: %s" % file_extension)
def expandHeader(fastfile, sstring):
    """Rewrite ``fastfile`` in place with ``sstring`` prepended to every header.

    The records are read with ``read_fastq`` and written back as FASTA or
    FASTQ depending on the file extension.

    Raises:
        Exception: if the file contains records and its extension is neither
            a FASTA nor a FASTQ one.  At that point the file has already been
            truncated and an error marker has been written into it.
    """
    filename, file_extension = os.path.splitext(fastfile)
    [headers, seqs, quals] = read_fastq(fastfile)

    # os.path.splitext keeps the leading dot, so every entry must start with
    # one.  The format does not change per record, so decide it once.
    extension = file_extension.lower()
    is_fasta = extension in ('.fa', '.fna', '.faa', '.fasta')
    is_fastq = extension in ('.fq', '.fastq')

    with open(fastfile, 'w') as ffile:
        for i, header in enumerate(headers):
            new_header = sstring + header
            seq = seqs[i]
            qual = quals[i]

            if is_fasta:
                ffile.write('>' + new_header + '\n')
                ffile.write(seq + '\n')
            elif is_fastq:
                ffile.write('@' + new_header + '\n')
                ffile.write(seq + '\n')
                ffile.write('+' + new_header + '\n')
                ffile.write(qual + '\n')
            else:
                ffile.write(r'@ERROR occured. File is NOT COMPLETE!')
                raise Exception('Invalid file extension: %s' % file_extension)
Exemple #39
0
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Run the assembler on ``reads_file`` from within ``output_path``.

    Args:
        reads_file: Reads passed to the assembler via its ``f`` option.
        reference_file: Reference read with ``fastqparser.read_fastq``; the
            length of its first sequence is used as the genome size.
        machine_name: Must be one of ``basicdefines.TECH``; otherwise a
            message is written to stderr and nothing is run.
        output_path: Directory the command changes into; the ``.memtime``
            file is placed there as well.
        output_suffix: Not used by this function.

    The assembler is started through the shell with hard-coded parameters
    (``k 21`` and ``GS`` set to ten times the genome size).
    """
    # Sparse also runs only on fasta
    # Atm parameters are hardcoded.
    # TODO: if fastq is given convert it to fasta
    #       callculate estimated genome size (GS) from reference and/or reads files

    # Half of the available cores, but never fewer than one; floor division
    # keeps the value an integer on Python 3 as well.
    num_threads = max(1, multiprocessing.cpu_count() // 2)

    # ATM using the same set of parametars for all sequencers
    if machine_name in basicdefines.TECH:

        # Calculating reference size
        reference_fastq = fastqparser.read_fastq(reference_file)
        reference_seq = reference_fastq[1][0]
        genomesize = len(reference_seq)

        memtime_path = os.path.join(output_path, ASSEMBLER_NAME + '.memtime')
        command = 'cd %s; %s %s -t %d k 21 GS %d f %s' % (output_path, basicdefines.measure_command(memtime_path), ASSEMBLER_BIN, num_threads, 10*genomesize, reads_file)
        # The command relies on `cd ...;`, so it has to go through the shell.
        subprocess.call(command, shell=True)
    else:
        sys.stderr.write('\\}\nInvalid machine_name parameter for assembler %s' % ASSEMBLER_NAME)
        sys.stderr.write('\nSkipping ....')
# Looking through MSA
# Groups the sequence names by the base each aligned sequence carries at
# column `position`.  The name of sequence i is taken from lines[i * 2 + 1]
# and its alignment from lines[i * 2 + 2]; presumably lines[0] is a separate
# leading record -- TODO confirm against the code that fills `lines`.
for i in xrange(numseq):
    seqname = lines[i * 2 + 1][1:-1]  # first and last character dropped (presumably '>' and the newline)
    seqalign = lines[i * 2 + 2][:-1]  # last character dropped (presumably the newline)

    base = seqalign[position]
    # Gaps are filed under 'N'.
    if base == '-':
        base = 'N'
    if base in reads_dict:
        reads_dict[base].append(seqname)
    else:
        reads_dict[base] = [seqname]

# Loading reads
[headers, seqs, quals] = read_fastq(reads_filename)
# Base name and extension of the reads file, reused for the output file names.
r_fname, r_fext = os.path.splitext(os.path.basename(reads_filename))

# Separating reads into files, but only if there is a sufficient number of them!
# A base is kept only when at least `factor` (20%) of the `numseq` aligned
# sequences carry it at `position`.
max_coverage = numseq
factor = 0.2
for base, readname_list in reads_dict.iteritems():
    if len(readname_list) < factor * max_coverage:
        continue
    # Output file: <results_folder>/<reads base name>_<base><reads extension>
    sep_filename = os.path.join(results_folder, r_fname + '_' + base + r_fext)
    # NOTE(review): `file` shadows the Python 2 builtin and is not closed in
    # the visible code.
    file = open(sep_filename, 'w')
    # NOTE(review): the loop below only unpacks each read; the code that would
    # select reads and write them to `file` is not visible here.
    for i in xrange(len(headers)):
        header = headers[i]
        seq = seqs[i]
        qual = quals[i]
Exemple #41
0
def processData(datafolder, resultfile, annotationfile, Array, SS_list,
                csv_path):
    sys.stderr.write(
        '\n(%s) Loading and processing SAM file with mappings ... ' %
        datetime.now().time().isoformat())
    all_sam_lines = load_and_process_SAM(resultfile, BBMapFormat=True)

    # Reading annotation file
    annotations = Annotation_formats.Load_Annotation_From_File(annotationfile)

    # Hashing annotations according to name
    annotation_dict = {}
    for annotation in annotations:
        if annotation.transcriptname in annotation_dict:
            pass
            #sys.stderr.write('\nWARNING: anotation with name %s already in the dictionary!' % annotation.genename)
        else:
            #annotation_dict[annotation.genename] = annotation
            annotation_dict[annotation.transcriptname] = annotation

    #***********************************
    #***********************************
    static_dict = {}
    #"A": with exon < 30 "B": exon > 30
    #"C": single splicing "D": alternative splicing
    # key = ["All", "A", "B", "C", "D", "E", "F", "G"]
    key = ["All", "A", "B", "C", "D"]
    for i in range(len(key)):
        static_dict[key[i]] = Static()

    ss_array = list()
    with open(SS_list, 'r') as f_ss:
        for line in f_ss:
            ss_array.append(line.strip())
    #**********************************

    allowed_inacc = Annotation_formats.DEFAULT_ALLOWED_INACCURACY  # Allowing some shift in positions
    # Setting allowed inaccuracy
    # allowed_inacc = 25

    # All samlines in a list should have the same query name
    for samline_list in all_sam_lines:
        qname = samline_list[0].qname

        # Checking the SAM file if all samlines in a list have the same qname
        for samline in samline_list[1:]:
            if samline.qname != qname:
                sys.stderr.write(
                    '\nWARNING: two samlines in the same list with different query names (%s/%s)'
                    % (qname, samline.qname))

        # Look for the first underscore in query name
        # Everything before that is the simulation folder name
        # Everything after that is simulated query name
        pos = qname.find('_')
        if pos < 0:
            raise Exception('Invalid query name in results file (%s)!' % qname)

        simFolderKey = qname[:pos]
        if simFolderKey not in simFolderDict:
            # import pdb
            # pdb.set_trace()
            raise Exception('Bad simulation folder short name (%s)!' %
                            simFolderKey)
        simFolder = simFolderDict[simFolderKey]
        simQName = qname[pos + 1:]
        # print(simFolderKey)
        # print(simFolder)
        # print(simQName)

        simFileSuffix = 'SimG2_S'

        pos = simQName.find('_')
        pos2 = simQName.find('_part')
        if pos < 0:
            raise Exception(
                'Invalid simulated query name in results file (%s)!' %
                simQName)

        # BBMap separates a query into smaller parts he can manage
        # Extends query with '_part_#', which has to be ignored
        if pos2 != -1:
            simQName = simQName[:pos2]

        simRefNumber = int(simQName[1:pos])
        simFileName = simFileSuffix + '_%04d' % simRefNumber
        simRefFileName = simFileName + '.ref'
        simSeqFileName = simFileName + '.fastq'
        simMafFileName = simFileName + '.maf'

        simFilePath = os.path.join(datafolder, simFolder)
        simRefFilePath = os.path.join(simFilePath, simRefFileName)
        # simSeqFilePath = os.path.join(simFilePath, simSeqFileName)
        simMafFilePath = os.path.join(simFilePath, simMafFileName)

        if not os.path.exists(simRefFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception(
                'Reference file for simulated read %s does not exist!' % qname)
        #if not os.path.exists(simSeqFilePath):
        #    raise Exception('Sequence file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simMafFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception(
                'Sequence alignment (MAF) for simulated read %s does not exist!'
                % qname)

        # Reading reference file
        [headers, seqs, quals] = read_fastq(simRefFilePath)
        simGeneName = headers[0]
        # if "transcript" in simGeneName:
        #     simGeneName = simGeneName.split(':')[1]
        annotation = annotation_dict[
            simGeneName]  # Getting the correct annotation

        #---------------------
        #for i in range(len(annotation.items)):
        #    print "(%d,%d)" %(annotation.items[i].start, annotation.items[i].end)

        # Reading MAF file to get original position and length of the simulated read
        # Query name should be a second item
        maf_startpos = maf_length = 0
        maf_reflen = 0
        i = 0
        with open(simMafFilePath, 'rU') as maffile:
            i += 1
            for line in maffile:
                if line[0] == 's':
                    elements = line.split()
                    maf_qname = elements[1]
                    if maf_qname == 'ref':  # Have to remember data for the last reference before the actual read
                        maf_startpos = int(elements[2])
                        maf_length = int(elements[3])
                        maf_strand = elements[4]
                        maf_reflen = int(int(elements[5]) / 3)
                    if maf_qname == simQName:
                        # maf_startpos = int(elements[2])
                        # maf_length = int(elements[3])
                        break

        if maf_qname != simQName:
            # import pdb
            # pdb.set_trace()
            print("maf_qname = %s, simQName = %s" % (maf_qname, simQName))
            raise Exception('ERROR: could not find query %s in maf file %s' %
                            (qname, simMafFileName))

        # IMPORTANT: If the reads were generated from an annotation on reverse strand
        #            expected partial alignments must be reversed
        if annotation.strand == Annotation_formats.GFF_STRANDRV:
            maf_startpos = maf_reflen * 3 - maf_length - maf_startpos
            if maf_startpos > maf_reflen * 2:
                maf_startpos = maf_startpos - maf_reflen * 2
            elif maf_startpos > maf_reflen:
                maf_startpos = maf_startpos - maf_reflen

        # Calculating expected partial alignmetns from MAF and annotations
        sigA = False
        sigB = True
        sigC = False
        sigD = False

        # 1. Calculating the index of the first exon
        # i - the index of exon currently being considered
        i = 0
        flag_wrong = 0
        while annotation.items[i].getLength() <= maf_startpos:
            maf_startpos -= annotation.items[i].getLength()
            i += 1
            if len(annotation.items) == i:
                flag_wrong = 1
                break
        if flag_wrong == 1:
            continue

        # Calculating expected partial alignments by filling up exons using maf_length
        maf_length = int(maf_length / 3)
        expected_partial_alignments = []
        while maf_length > 0:
            start = annotation.items[i].start + maf_startpos
            end = annotation.items[i].end
            assert start <= end

            #    print "(%d, %d)" %(start, end)

            # OLD: length = end-start+1
            # KK: End is already indicating position after the last base, so adding one when callculating length is not correct
            length = end - start
            if length <= maf_length:
                expected_partial_alignments.append((start, end))
                maf_length -= length
                i += 1
                if len(annotation.items) == i:
                    maf_length = 0
            else:
                expected_partial_alignments.append((start, start + maf_length))
                maf_length = 0
                i += 1

            # Start position should only be considered for the first exon
            maf_startpos = 0
        #*****************************************
        #*****************************************
        #level2
        for ele in expected_partial_alignments[1:-1]:
            if ele[1] - ele[0] < 30:
                sigA = True
                sigB = False
                break

        #level4
        n = len(expected_partial_alignments)

        #level3
        if simGeneName in ss_array:
            sigC = True
        else:
            sigD = True

        if DEBUG:
            print("exon in expected alignment---------------")
            for i in range(len(expected_partial_alignments)):
                print("(%d, %d)" % (expected_partial_alignments[i][0],
                                    expected_partial_alignments[i][1]))
            print("exon in real alignment-------------")

        numparts = len(expected_partial_alignments)
        # For each part of expected partial alignments, these maps will count
        # how many real partial alignments overlap or equal it
        parteqmap = {(i + 1): 0 for i in range(numparts)}
        parthitmap = {(i + 1): 0 for i in range(numparts)}

        if getChromName(samline_list[0].rname) != getChromName(
                annotation.seqname):
            static_dict["All"].Total_aligned_reads += 1
            part_cal.cal(static_dict, sigA, sigC, "Total_aligned_reads", 1)
        else:
            for samline in samline_list:
                # sl_startpos = samline.pos - 1   # SAM positions are 1-based
                sl_startpos = samline.pos
                reflength = samline.CalcReferenceLengthFromCigar()
                readlength = samline.CalcReadLengthFromCigar()
                #************************
                #************************
                sl_endpos = sl_startpos + reflength

                if DEBUG:
                    print("(%d, %d)" % (sl_startpos, sl_endpos))

                # Comparing a samline to all expected partial alignments
                tmp_aln = 0
                for i in range(len(expected_partial_alignments)):
                    expected_alignement = expected_partial_alignments[i]
                    maf_startpos = expected_alignement[0]
                    maf_endpos = expected_alignement[1]

                    if numparts > 2 and i == 0 and abs(
                            sl_endpos - maf_endpos) < allowed_inacc:
                        parteqmap[i + 1] += 1
                        parthitmap[i + 1] += 1
                    elif numparts > 2 and (
                            i == len(expected_partial_alignments) - 1
                    ) and abs(sl_startpos - maf_startpos) < allowed_inacc:
                        parteqmap[i + 1] += 1
                        parthitmap[i + 1] += 1
                    elif interval_equals((sl_startpos, sl_endpos),
                                         (maf_startpos, maf_endpos),
                                         allowed_inacc):
                        parteqmap[i + 1] += 1
                        parthitmap[i + 1] += 1
                    elif interval_overlaps((sl_startpos, sl_endpos),
                                           (maf_startpos, maf_endpos), 5):
                        parthitmap[i + 1] += 1

                    if interval_overlaps((sl_startpos, sl_endpos),
                                         (maf_startpos, maf_endpos), 5):
                        l = basesInside(sl_startpos, sl_endpos, maf_startpos,
                                        maf_endpos)
                        if tmp_aln < l:
                            tmp_aln = l
                if tmp_aln > readlength:
                    tmp_aln = readlength
                static_dict["All"].Total_aligned_bases += tmp_aln
                part_cal.cal(static_dict, sigA, sigC, "Total_aligned_bases",
                             tmp_aln)

            #*************************************************************************************
            #*************************************************************************************
            num_recover_exons = len([x for x in parteqmap.values() if x == 1])
            num_hit_exons = len([x for x in parthitmap.values() if x == 1])

            if num_hit_exons == numparts:
                static_dict["All"].Hit100 += 1
                part_cal.cal(static_dict, sigA, sigC, "Hit100", 1)
            if num_hit_exons >= int(0.8 * numparts):
                static_dict["All"].Hit80 += 1
                part_cal.cal(static_dict, sigA, sigC, "Hit80", 1)

            sam_l = len(samline_list)
            if num_recover_exons == numparts:
                static_dict["All"].ExR100 += 1
                part_cal.cal(static_dict, sigA, sigC, "ExR100", 1)
                if num_recover_exons == sam_l:
                    static_dict["All"].ExA100 += 1
                    part_cal.cal(static_dict, sigA, sigC, "ExA100", 1)
            if num_recover_exons >= int(0.8 * numparts):
                static_dict["All"].ExR80 += 1
                part_cal.cal(static_dict, sigA, sigC, "ExR80", 1)
                if num_recover_exons >= int(0.8 * sam_l):
                    static_dict["All"].ExA80 += 1
                    part_cal.cal(static_dict, sigA, sigC, "ExA80", 1)
            static_dict["All"].Total_aligned_exons += num_recover_exons
            part_cal.cal(static_dict, sigA, sigC, "Total_aligned_exons",
                         num_recover_exons)
            static_dict["All"].Total_aligned_reads += 1
            part_cal.cal(static_dict, sigA, sigC, "Total_aligned_reads", 1)
            #*************************************************************************************

    #************************************************
    #******************************************write csv
    static_dict["All"].Total_reads = Array.Total_reads
    static_dict["All"].Total_bases = Array.Total_bases
    static_dict["All"].Total_expected_exons = Array.Total_expected_exons
    static_dict["A"].Total_reads = Array.Total_level2_reads
    static_dict["A"].Total_bases = Array.Total_level2_bases
    static_dict["A"].Total_expected_exons = Array.Total_level2_expected_exons
    static_dict["B"].Total_reads = Array.Total_level2_r_reads
    static_dict["B"].Total_bases = Array.Total_level2_r_bases
    static_dict["B"].Total_expected_exons = Array.Total_level2_r_expected_exons
    static_dict["C"].Total_reads = Array.Total_level3_SS_reads
    static_dict["C"].Total_bases = Array.Total_level3_SS_bases
    static_dict[
        "C"].Total_expected_exons = Array.Total_level3_SS_expected_exons
    static_dict["D"].Total_reads = Array.Total_level3_AS_reads
    static_dict["D"].Total_bases = Array.Total_level3_AS_bases
    static_dict[
        "D"].Total_expected_exons = Array.Total_level3_AS_expected_exons
    # static_dict["E"].Total_reads = Array.Total_level4_2_5_reads
    # static_dict["E"].Total_bases = Array.Total_level4_2_5_bases
    # static_dict["E"].Total_expected_exons = Array.Total_level4_2_5_expected_exons
    # static_dict["F"].Total_reads = Array.Total_level4_6_9_reads
    # static_dict["F"].Total_bases = Array.Total_level4_6_9_bases
    # static_dict["F"].Total_expected_exons = Array.Total_level4_6_9_expected_exons
    # static_dict["G"].Total_reads = Array.Total_level4_10_reads
    # static_dict["G"].Total_bases = Array.Total_level4_10_bases
    # static_dict["G"].Total_expected_exons = Array.Total_level4_10_expected_exons

    with open(csv_path, "w") as fw:
        csv_write = csv.writer(fw, dialect='excel')
        header = [" ", resultfile]
        csv_write.writerow(header)
        for item in key:
            level = [
                item,
                str(static_dict[item].Total_reads) + ' reads/' +
                str(static_dict[item].Total_bases) + ' bases/' +
                str(static_dict[item].Total_expected_exons) + ' exons'
            ]
            row1 = [
                "Aligned",
                round(
                    100 * static_dict[item].Total_aligned_reads /
                    float(static_dict[item].Total_reads), 2)
            ]
            row2 = [
                "bases%",
                round(
                    100 * static_dict[item].Total_aligned_bases /
                    float(static_dict[item].Total_bases), 2)
            ]
            #line = str(round(100*static_dict[item].ExR100/float(static_dict[item].Total_reads), 2)) + '/' + str(round(100*static_dict[item].ExR80/float(static_dict[item].Total_reads), 2))
            #row3 = ["ExR100/80%", line]
            line = str(
                round(
                    100 * static_dict[item].ExA100 /
                    float(static_dict[item].Total_reads), 2)) + '/' + str(
                        round(
                            100 * static_dict[item].ExA80 /
                            float(static_dict[item].Total_reads), 2))
            row4 = ["Read100/80%", line]
            #line = str(round(100*static_dict[item].Hit100/float(static_dict[item].Total_reads), 2)) + '/' + str(round(100*static_dict[item].Hit80/float(static_dict[item].Total_reads), 2))
            #row5 = ["Hit100/80%", line]
            row6 = [
                "Exons%",
                round(
                    100 * static_dict[item].Total_aligned_exons /
                    float(static_dict[item].Total_expected_exons), 2)
            ]
            csv_write.writerow(level)
            csv_write.writerow(row1)
            csv_write.writerow(row2)
            #csv_write.writerow(row3)
            csv_write.writerow(row4)
            #csv_write.writerow(row5)
            csv_write.writerow(row6)
Exemple #42
0
def processData(datafolder, resultfile, annotationfile, paramdict):
    """Compare SAM mappings of simulated reads with their expected positions.

    The SAM file is loaded through ``RNAseqEval.load_and_process_SAM`` (with
    ``BBMapFormat=True``) and the annotations through
    ``Annotation_formats.Load_Annotation_From_File``.  For every group of SAM
    lines sharing a query name, the simulation folder and reference number
    are parsed out of the query name, the matching ``.ref`` and ``.maf``
    files are read, the expected per-exon partial alignments are derived and
    compared against the actual SAM lines.  Counters are accumulated and a
    summary is written to stdout.

    Relies on module-level names defined elsewhere in this file:
    ``simFolderDict``, ``P_CHECK_STRAND``, ``read_fastq``,
    ``interval_equals``, ``interval_overlaps``, ``EvalReport`` and
    ``ReportType``.

    Args:
        datafolder: Root directory; joined with ``simFolderDict[<key>]`` to
            locate the simulation files.
        resultfile: SAM file handed to ``RNAseqEval.load_and_process_SAM``.
        annotationfile: Annotation file handed to
            ``Annotation_formats.Load_Annotation_From_File``.
        paramdict: Mapping of option name to a list of values.  Options read
            here: ``--split-qnames`` (file name prefix for query-name lists),
            ``--print_mapping`` (path of the expected/actual mapping dump),
            ``--allowed_inacc`` / ``-ai``, ``--min_overlap`` / ``-mo`` and
            ``--debug`` (drops into ``pdb`` on inconsistent results).

    Returns:
        None.  Results go to stdout and to the optional output files.

    Raises:
        Exception: On a malformed query name, an unknown simulation folder
            key, a missing ``.ref``/``.fastq``/``.maf`` file, or a query that
            is not present in its MAF file.
        KeyError: If the gene named in the ``.ref`` header has no annotation.
    """
    split_qnames = False
    filename = ""
    if "--split-qnames" in paramdict:
        split_qnames = True
        filename = paramdict["--split-qnames"][0]

    filename_correct = filename + "_correct.names"
    filename_hitall = filename + "_hitall.names"
    filename_hitone = filename + "_hitone.names"
    filename_bad = filename + "_incorrect.names"
    filename_unmapped = filename + "_unmapped.names"

    printMap = False
    filename_mapping = ""
    if "--print_mapping" in paramdict:
        filename_mapping = paramdict["--print_mapping"][0]
        printMap = True

    file_correct = None
    file_hitall = None
    file_hitone = None
    file_bad = None
    file_unmapped = None
    folder = os.getcwd()

    # If splitting qnames into files, the files have to be opened first
    if split_qnames:
        file_correct = open(os.path.join(folder, filename_correct), "w+")
        file_hitall = open(os.path.join(folder, filename_hitall), "w+")
        file_hitone = open(os.path.join(folder, filename_hitone), "w+")
        file_bad = open(os.path.join(folder, filename_bad), "w+")

    # Loading results SAM file.
    # The report is only used here for num_alignments and unmapped names.
    report = EvalReport(ReportType.FASTA_REPORT)

    sys.stderr.write(
        "\n(%s) Loading and processing SAM file with mappings ... " %
        datetime.now().time().isoformat())
    all_sam_lines = RNAseqEval.load_and_process_SAM(resultfile,
                                                    paramdict,
                                                    report,
                                                    BBMapFormat=True)

    # Reading annotation file
    annotations = Annotation_formats.Load_Annotation_From_File(annotationfile)

    s_num_multiexon_genes = 0

    mapfile = None
    if printMap:
        mapfile = open(filename_mapping, "w+")

    # Hashing annotations according to gene name (first occurrence wins)
    annotation_dict = {}
    for annotation in annotations:
        if annotation.genename in annotation_dict:
            sys.stderr.write(
                "\nWARNING: anotation with name %s already in the dictionary!"
                % annotation.genename)
        else:
            annotation_dict[annotation.genename] = annotation
        if len(annotation.items) > 1:
            s_num_multiexon_genes += 1

    # Statistical information for evaluating the quality of mapping
    s_gene_hits = 0
    s_gene_misses = 0
    s_whole_alignment_hits = 0
    s_whole_alignment_misses = 0
    s_partial_alignment_hits = 0
    s_partial_alignment_misses = 0
    s_num_start_hits = 0
    s_num_end_hits = 0
    s_num_start_end_hits = 0

    s_num_fw_strand = 0
    s_num_rv_strand = 0

    s_num_split_alignment = 0
    s_num_oversplit_alignment = 0  # Alignments that have more parts than exons

    s_num_good_alignments = 0

    s_num_badchrom_alignments = 0

    s_maf_suspicious_alignments = 0
    s_maf_bad_alignments = 0
    s_maf_good_alignments = 0

    s_maf_split_reads = 0
    s_maf_good_split_alignments = 0
    s_maf_bad_split_alignments = 0

    s_maf_hit_all_parts = 0
    s_maf_hit_one_part = 0
    s_maf_eq_one_part = 0
    s_maf_multihit_parts = 0

    s_maf_split_hit_all_parts = 0
    s_maf_split_hit_one_part = 0
    s_maf_split_eq_one_part = 0

    s_maf_miss_alignment = 0
    s_maf_too_many_alignments = 0

    s_num_potential_bad_strand = 0

    # Allowing some shift in positions
    allowed_inacc = Annotation_formats.DEFAULT_ALLOWED_INACCURACY
    # Minimum overlap that is considered
    min_overlap = Annotation_formats.DEFAULT_MINIMUM_OVERLAP

    # Setting allowed inaccuracy from parameters
    if "--allowed_inacc" in paramdict:
        allowed_inacc = int(paramdict["--allowed_inacc"][0])
    elif "-ai" in paramdict:
        allowed_inacc = int(paramdict["-ai"][0])

    # Setting minimum overlap from parameters.
    # FIX: this used to read "--allowed_inacc" (copy-paste error), so setting
    # the tolerance silently overrode the minimum overlap as well.
    # NOTE(review): "--min_overlap" is assumed to be the long form of "-mo";
    # confirm against the option table of the command-line parser.
    if "--min_overlap" in paramdict:
        min_overlap = int(paramdict["--min_overlap"][0])
    elif "-mo" in paramdict:
        min_overlap = int(paramdict["-mo"][0])

    # All samlines in a list should have the same query name
    for samline_list in all_sam_lines:
        qname = samline_list[0].qname

        isSplitAlignment = False
        if len(samline_list) > 1:
            s_num_split_alignment += 1
            isSplitAlignment = True

        # Checking the SAM file if all samlines in a list have the same qname
        for samline in samline_list[1:]:
            if samline.qname != qname:
                sys.stderr.write(
                    "\nWARNING: two samlines in the same list with different query names (%s/%s)"
                    % (qname, samline.qname))

        # Look for the first underscore in query name
        # Everything before that is the simulation folder name
        # Everything after that is simulated query name
        pos = qname.find("_")
        if pos < 0:
            raise Exception("Invalid query name in results file (%s)!" % qname)

        simFolderKey = qname[:pos]
        if simFolderKey not in simFolderDict:
            raise Exception("Bad simulation folder short name (%s)!" %
                            simFolderKey)
        simFolder = simFolderDict[simFolderKey]
        simQName = qname[pos + 1:]

        # Due to error in data preparation, have to make some extra processing
        if simQName[:6] == "SimG2_":
            simQName = simQName[6:]

        simFileSuffix = "sd"

        pos = simQName.find("_")
        pos2 = simQName.find("_part")
        if pos < 0:
            raise Exception(
                "Invalid simulated query name in results file (%s)!" %
                simQName)

        simQLetter = simQName[0]  # Should always be S

        # BBMap separates a query into smaller parts it can manage and
        # extends the query name with '_part_#', which has to be ignored
        if pos2 != -1:
            simQName = simQName[:pos2]

        simRefNumber = int(simQName[1:pos])
        simQNumber = int(simQName[pos + 1:])
        simFileName = simFileSuffix + "_%04d" % simRefNumber
        simRefFileName = simFileName + ".ref"
        simSeqFileName = simFileName + ".fastq"
        simMafFileName = simFileName + ".maf"

        simFilePath = os.path.join(datafolder, simFolder)
        simRefFilePath = os.path.join(simFilePath, simRefFileName)
        simSeqFilePath = os.path.join(simFilePath, simSeqFileName)
        simMafFilePath = os.path.join(simFilePath, simMafFileName)

        if not os.path.exists(simRefFilePath):
            raise Exception(
                "Reference file for simulated read %s does not exist!" % qname)
        if not os.path.exists(simSeqFilePath):
            raise Exception(
                "Sequence file for simulated read %s does not exist!" % qname)
        if not os.path.exists(simMafFilePath):
            raise Exception(
                "Sequence alignment (MAF) for simulated read %s does not exist!"
                % qname)

        # Reading reference file; its first header names the source gene
        [headers, seqs, quals] = read_fastq(simRefFilePath)
        simGeneName = headers[0]
        # Getting the correct annotation
        annotation = annotation_dict[simGeneName]

        if len(samline_list) > len(annotation.items):
            s_num_oversplit_alignment += 1

        # Reading MAF file to get original position and length of the simulated read
        # Query name should be a second item
        maf_startpos = maf_length = 0
        maf_strand = "0"
        maf_reflen = 0
        # FIX: pre-bind maf_qname so that a MAF file without any "s" line
        # reaches the explicit "could not find query" error below instead of
        # failing with a NameError.
        maf_qname = None
        # FIX: plain "r" instead of "rU"; the "U" flag raises ValueError on
        # Python 3.11+ and lines are only inspected via line[0] and split().
        with open(simMafFilePath, "r") as maffile:
            for line in maffile:
                if line[0] == "s":
                    elements = line.split()
                    maf_qname = elements[1]
                    # Have to remember data for the last reference before
                    # the actual read
                    if maf_qname == "ref":
                        maf_startpos = int(elements[2])
                        maf_length = int(elements[3])
                        maf_strand = elements[4]
                        maf_reflen = int(elements[5])
                    if maf_qname == simQName:
                        break

        if maf_qname != simQName:
            raise Exception("ERROR: could not find query %s in maf file %s" %
                            (qname, simMafFileName))

        # IMPORTANT: If the reads were generated from an annotation on reverse strand
        #            expected partial alignments must be reversed
        if annotation.strand == Annotation_formats.GFF_STRANDRV:
            maf_startpos = maf_reflen - maf_length - maf_startpos

        # Saving "maf_length" and "maf_startpos" to be able to check it later
        t_maf_length = maf_length
        t_maf_startpos = maf_startpos

        # Calculating expected partial alignments from MAF and annotations

        # 1. Calculating the index of the first exon
        # i - the index of exon currently being considered
        # NOTE(review): when maf_startpos equals an exon length exactly, the
        # strict "<" keeps that exon and yields a zero-length first part;
        # confirm whether "<=" was intended.
        i = 0
        while annotation.items[i].getLength() < maf_startpos:
            maf_startpos -= annotation.items[i].getLength()
            i += 1

        # 2. Calculating expected partial alignments by filling up exons
        #    using maf_length
        expected_partial_alignments = []
        while maf_length > 0:
            start = annotation.items[i].start + maf_startpos
            end = annotation.items[i].end
            assert start <= end

            # End already indicates the position after the last base, so no
            # "+ 1" when calculating the length
            length = end - start
            if length <= maf_length:
                expected_partial_alignments.append((start, end))
                maf_length -= length
            else:
                expected_partial_alignments.append((start, start + maf_length))
                maf_length = 0
            i += 1

            # Start position should only be considered for the first exon
            maf_startpos = 0

        numparts = len(expected_partial_alignments)
        # For each part of expected partial alignments, these maps will count
        # how many real partial alignments overlap or equal it
        parthitmap = {(i + 1): 0 for i in range(numparts)}
        parteqmap = {(i + 1): 0 for i in range(numparts)}

        isSplitRead = False
        if len(expected_partial_alignments) > 1:
            s_maf_split_reads += 1
            isSplitRead = True

        oneHit = False
        allHits = False
        oneEq = False
        multiHit = False
        good_alignment = False
        has_miss_alignments = False

        if RNAseqEval.getChromName(
                samline_list[0].rname) != RNAseqEval.getChromName(
                    annotation.seqname):
            s_num_badchrom_alignments += 1
        else:
            if len(samline_list) != len(expected_partial_alignments):
                s_maf_suspicious_alignments += 1

            good_alignment = True
            k = 0
            for samline in samline_list:
                # samline.pos is used as-is here (no 1-based correction)
                sl_startpos = samline.pos
                reflength = samline.CalcReferenceLengthFromCigar()
                sl_endpos = sl_startpos + reflength

                # Comparing a samline to the corresponding expected partial alignment
                if k < len(expected_partial_alignments):
                    expected_alignement = expected_partial_alignments[k]
                    maf_startpos = expected_alignement[0]
                    maf_endpos = expected_alignement[1]
                    if (abs(sl_startpos - maf_startpos) > allowed_inacc
                            or abs(sl_endpos - maf_endpos) > allowed_inacc):
                        good_alignment = False
                else:
                    good_alignment = False
                k += 1

                # Comparing a samline to all expected partial alignments
                for i in range(len(expected_partial_alignments)):
                    expected_alignement = expected_partial_alignments[i]
                    maf_startpos = expected_alignement[0]
                    maf_endpos = expected_alignement[1]

                    if interval_equals(
                        (sl_startpos, sl_endpos),
                        (maf_startpos, maf_endpos),
                            allowed_inacc,
                            min_overlap,
                    ):
                        parteqmap[i + 1] += 1
                    if interval_overlaps(
                        (sl_startpos, sl_endpos),
                        (maf_startpos, maf_endpos),
                            allowed_inacc,
                            min_overlap,
                    ):
                        parthitmap[i + 1] += 1

            # An expected part that no samline overlaps is a miss
            has_miss_alignments = False
            for expected_alignement in expected_partial_alignments:
                maf_startpos = expected_alignement[0]
                maf_endpos = expected_alignement[1]
                overlap = False
                for samline in samline_list:
                    sl_startpos = samline.pos
                    reflength = samline.CalcReferenceLengthFromCigar()
                    sl_endpos = sl_startpos + reflength
                    if interval_overlaps(
                        (sl_startpos, sl_endpos),
                        (maf_startpos, maf_endpos),
                            allowed_inacc,
                            min_overlap,
                    ):
                        overlap = True
                if not overlap:
                    has_miss_alignments = True
                    break

            # FIX: the counter (reported as "More alignments than expected")
            # was incremented on "<", i.e. when there were FEWER alignments.
            if len(samline_list) > len(expected_partial_alignments):
                s_maf_too_many_alignments += 1

            # A different number of parts can never be a correct alignment
            if len(samline_list) != len(expected_partial_alignments):
                good_alignment = False

            if good_alignment:
                s_maf_good_alignments += 1

                # Writing qnames to files
                if split_qnames:
                    file_correct.write(samline_list[0].qname + "\n")

                if isSplitRead:
                    s_maf_good_split_alignments += 1
            else:
                s_maf_bad_alignments += 1
                if isSplitRead:
                    s_maf_bad_split_alignments += 1
                # TODO: check which alignments are bad and why
                # If the chromosome is different it is obviously a bad
                # alignment; "samline" is the last line of samline_list here
                if RNAseqEval.getChromName(
                        samline.rname) == RNAseqEval.getChromName(
                            annotation.seqname):
                    pass
                else:
                    s_num_badchrom_alignments += 1

            # Analyzing parthitmap and parteqmap
            oneHit = False
            allHits = True
            oneEq = False
            multiHit = False
            for i in range(numparts):
                if parthitmap[i + 1] > 0:
                    oneHit = True
                if parthitmap[i + 1] == 0:
                    allHits = False
                if parthitmap[i + 1] > 1:
                    multiHit = True
                if parteqmap[i + 1] > 0:
                    oneEq = True

        if printMap:
            status = "INCORRECT"
            if good_alignment:
                status = "CORRECT"
            elif allHits:
                status = "HITALL"
            elif oneHit:
                status = "HITONE"
            mapfile.write("QNAME: %s, STATUS: %s\n\n" %
                          (samline_list[0].qname, status))
            mapfile.write("EXPECTED (%s, %s):\t" % (RNAseqEval.getChromName(
                annotation.seqname), annotation.strand))
            for epa in expected_partial_alignments:
                mapfile.write("(%d, %d)\t" % (epa[0], epa[1]))
            mapfile.write("\n")
            # SAM flag bit 16 distinguishes forward from reverse strand
            if samline_list[0].flag & 16 == 0:
                readstrand = Annotation_formats.GFF_STRANDFW
            else:
                readstrand = Annotation_formats.GFF_STRANDRV
            mapfile.write(
                "ACTUAL   (%s, %s):\t" %
                (RNAseqEval.getChromName(samline_list[0].rname), readstrand))
            for samline in samline_list:
                mapfile.write("(%d, %d)\t" % (
                    samline.pos,
                    samline.pos + samline.CalcReferenceLengthFromCigar(),
                ))
            mapfile.write("\n\n")

        if oneHit:
            s_maf_hit_one_part += 1
            if isSplitRead:
                s_maf_split_hit_one_part += 1

            # Writing qnames to files
            if split_qnames:
                file_hitone.write(samline_list[0].qname + "\n")

            # Debugging aid: stop on reads that hit some but not all parts
            if not allHits:
                if "--debug" in paramdict:
                    import pdb

                    pdb.set_trace()

            # Misses are calculated only for alignments that have at least one hit
            if has_miss_alignments:
                s_maf_miss_alignment += 1

        else:
            # Writing qnames to files
            if split_qnames:
                file_bad.write(samline_list[0].qname + "\n")

        if allHits:
            s_maf_hit_all_parts += 1
            if isSplitRead:
                s_maf_split_hit_all_parts += 1

            # Writing qnames to files
            if split_qnames:
                file_hitall.write(samline_list[0].qname + "\n")

        # Sanity check: a correct alignment must have hit all parts
        if "--debug" in paramdict and good_alignment and not allHits:
            import pdb

            pdb.set_trace()

        if oneEq:
            s_maf_eq_one_part += 1
            if isSplitRead:
                s_maf_split_eq_one_part += 1
        if multiHit:
            s_maf_multihit_parts += 1

        num_start_hits = 0
        num_end_hits = 0
        num_hits = 0

        num_partial_alignements = len(samline_list)
        whole_alignment_hit = False
        for samline in samline_list:
            # Unlike the MAF comparison above, this part shifts pos by one
            startpos = samline.pos - 1
            reflength = samline.CalcReferenceLengthFromCigar()
            endpos = startpos + reflength

            if samline.flag & 16 == 0:
                readstrand = Annotation_formats.GFF_STRANDFW
                s_num_fw_strand += 1
            else:
                readstrand = Annotation_formats.GFF_STRANDRV
                s_num_rv_strand += 1

            chromname = RNAseqEval.getChromName(samline.rname)

            if (chromname == RNAseqEval.getChromName(annotation.seqname)
                    and readstrand != annotation.strand
                    and annotation.overlapsGene(startpos, endpos)):
                s_num_potential_bad_strand += 1

            if (chromname == RNAseqEval.getChromName(annotation.seqname)
                    and annotation.overlapsGene(startpos, endpos) and
                (not P_CHECK_STRAND or readstrand == annotation.strand)):
                whole_alignment_hit = True
                s_partial_alignment_hits += 1
            else:
                s_partial_alignment_misses += 1

            # Checking how well partial alignments match exons
            startsItem = False
            endsItem = False
            for item in annotation.items:
                if item.overlapsItem(startpos, endpos):
                    num_hits += 1
                if item.startsItem(startpos, endpos):
                    num_start_hits += 1
                    startsItem = True
                if item.endsItem(startpos, endpos):
                    num_end_hits += 1
                    endsItem = True
                # NOTE(review): the flags are sticky across items, so this
                # increments once for every remaining item after both were
                # set; confirm whether a per-samline count was intended.
                if startsItem and endsItem:
                    s_num_start_end_hits += 1

        s_num_start_hits += num_start_hits
        s_num_end_hits += num_end_hits

        # I'm allowing one start and one end not to match starts and ends of exons
        if (num_hits == num_partial_alignements) and (
                num_start_hits + num_end_hits >=
                2 * num_partial_alignements - 2):
            s_num_good_alignments += 1

        if whole_alignment_hit:
            s_whole_alignment_hits += 1
        else:
            s_whole_alignment_misses += 1

    if printMap:
        mapfile.close()

    # Writing unmapped query names to a file, if so specified
    if split_qnames:
        # The with-statement closes the file; no explicit close() needed
        with open(filename_unmapped, "w+") as file_unmapped:
            file_unmapped.write(report.get_unmapped_names())

    # Printing out results : NEW
    # Variables names matching RNA benchmark paper
    sys.stdout.write("\n\nAnalysis results:")
    sys.stdout.write("\nOriginal Samlines: %d" % report.num_alignments)
    sys.stdout.write(
        "\nUsable whole alignments (with valid CIGAR string): %d" %
        len(all_sam_lines))
    sys.stdout.write("\nAnnotations: %d" % len(annotation_dict))
    sys.stdout.write("\nMultiexon genes: %d" % s_num_multiexon_genes)

    sys.stdout.write("\nNumber of exon start hits: %d" % s_num_start_hits)
    sys.stdout.write("\nNumber of exon end hits: %d" % s_num_end_hits)
    sys.stdout.write("\nNumber of exon start and end hits: %d" %
                     s_num_start_end_hits)
    sys.stdout.write("\nNumber of good whole alignments: %d" %
                     s_num_good_alignments)
    sys.stdout.write(
        "\nNumber of alignments mapped to an incorrect chromosome: %d" %
        s_num_badchrom_alignments)

    sys.stdout.write("\nMAF: Correct alignment: %d" % s_maf_good_alignments)
    sys.stdout.write("\nMAF: Hit all parts: %d" % s_maf_hit_all_parts)
    sys.stdout.write("\nMAF: Hit at least one part: %d" % s_maf_hit_one_part)
    sys.stdout.write("\nMAF: Equals at least one part: %d" % s_maf_eq_one_part)

    sys.stdout.write("\nMAF: Number of split reads: %d" % s_maf_split_reads)
    sys.stdout.write("\nMAF: Correct alignment, SPLIT read: %d" %
                     s_maf_good_split_alignments)
    sys.stdout.write("\nMAF: Hit all parts, SPLIT read: %d" %
                     s_maf_split_hit_all_parts)
    sys.stdout.write("\nMAF: Hit at least one part, SPLIT read: %d" %
                     s_maf_split_hit_one_part)
    sys.stdout.write("\nMAF: Equals at least one part, SPLIT read: %d" %
                     s_maf_split_eq_one_part)

    sys.stdout.write("\nMAF: Partial alignment that misses: %d" %
                     s_maf_miss_alignment)
    sys.stdout.write("\nMAF: More alignments than expected: %d" %
                     s_maf_too_many_alignments)
    sys.stdout.write("\nMAF: Multihit parts (fragmented) alignments: %d" %
                     s_maf_multihit_parts)

    sys.stdout.write("\nDone!\n")

    # Closing files with names
    if split_qnames:
        file_correct.close()
        file_hitall.close()
        file_hitone.close()
        file_bad.close()
Exemple #43
0
def eval_contigs(ref_path, contig_path, temp_folder, generate_kmer_spectrum=False, silent=False):
    """Align every contig to the reference and report average identity/accuracy.

    Each contig is written to a single-sequence FASTA file in ``temp_folder``
    and aligned to ``ref_path`` with the nucmer / delta-filter / show-coords
    pipeline.  Every fragment parsed from the resulting coords file is then
    scored with Edlib (``-m NW``); the edit distance divided by the query
    span gives the normalized edit distance, and accuracy is one minus that.
    Per-fragment rows and one summary row per contig are written to
    ``<temp_folder>/frags.<contig index>.csv`` (tab separated), and the
    overall averages are written to stdout.

    Relies on module-level names defined elsewhere in this file:
    ``fastqparser``, ``hash_headers``, ``parse_coords_lines``,
    ``extract_seqs_for_edlib``, ``parse_edlib_scores``,
    ``execute_command_with_ret``, ``log``, ``NUCMER_PATH``, ``EDLIB_PATH``
    and ``DRY_RUN``.

    Args:
        ref_path: Path to the reference sequences.
        contig_path: Path to the contigs (draft assembly) to evaluate.
        temp_folder: Working directory for intermediate files; created if
            it does not exist.
        generate_kmer_spectrum: Accepted for interface compatibility; not
            used by this function.
        silent: Forwarded to ``log`` for the progress messages.

    Returns:
        None.
    """
    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder)

    [headers_contigs, seqs_contigs, quals_contigs] = fastqparser.read_fastq(contig_path)
    [headers_ref, seqs_ref, quals_ref] = fastqparser.read_fastq(ref_path)

    ref_hash = hash_headers(headers_ref)
    contig_hash = hash_headers(headers_contigs)

    avg_accuracy_overall = 0.0
    avg_id_overall = 0.0
    num_valid_contigs = 0

    single_contig_path = '%s/singlecontig.fasta' % (temp_folder)
    num_contigs = len(seqs_contigs)
    for i, contig_seq in enumerate(seqs_contigs):
        contig_name = headers_contigs[i].split()[0]

        with open(single_contig_path, 'w') as fp_contig:
            fp_contig.write('>%s\n%s\n' % (contig_name, contig_seq))

        ### Run nucmer to align the contig to the reference, also, filter the delta file and generate alignment coordinates.
        nucmer_out_prefix = '%s/nucmer' % (temp_folder)
        log('Running MUMmer on contig %d / %d: "%s"' % ((i + 1), num_contigs, contig_name), sys.stderr, silent=silent)
        log('Contig length: %d' % (len(contig_seq)), sys.stderr, silent=silent)
        command = '%s --maxmatch --extend -p %s %s %s; delta-filter -r -q %s.delta > %s.filt.delta; show-coords -r -c %s.filt.delta > %s.filt.coords' % \
                    (NUCMER_PATH, nucmer_out_prefix, ref_path, single_contig_path, nucmer_out_prefix, nucmer_out_prefix, nucmer_out_prefix, nucmer_out_prefix)
        [rc, rstdout, rstderr] = execute_command_with_ret(DRY_RUN, command, silent=True)

        ### Load the coordinates.
        log('Parsing the coords file.', sys.stderr, silent=silent)
        coords_path = '%s.filt.coords' % (nucmer_out_prefix)
        with open(coords_path, 'r') as fp:
            lines = fp.readlines()

        frags = parse_coords_lines(lines, contig_name, seqs_ref, ref_hash, seqs_contigs, contig_hash)

        avg_accuracy_contig = 0.0
        avg_id_contig = 0.0
        # FIX: only fragments that Edlib actually scored may enter the
        # averages; skipped fragments used to inflate the denominator.
        num_scored_frags = 0
        # FIX: reference name for the summary row.  It used to be the loop
        # variable "rname", which is unbound when the first contig has no
        # fragments and stale from the previous contig otherwise.
        summary_rname = '-'

        # The Edlib input files are reused for every fragment of this contig
        temp_suffix = '.%d' % (i)
        nw_ref_path = '%s/nw-ref%s.fasta' % (temp_folder, temp_suffix)
        nw_contig_path = '%s/nw-contig%s.fasta' % (temp_folder, temp_suffix)

        log('Running Edlib to determine the edit distance for each fragment...', sys.stderr, silent=silent)
        debug_frags_path = '%s/frags.%d.csv' % (temp_folder, i)
        with open(debug_frags_path, 'w') as fp_frags:
            fp_frags.write('frag_or_summary\trstart\trend\tqstart\tqend\tfwd\trname\tqname\tidentity\trlen\tqlen\tedit_dist\tnormalized_edit_dist\taccuracy\n')
            for frag in frags:
                [rstart, rend, qstart, qend, fwd, rname, qname, identity] = frag
                summary_rname = rname
                ref_seq = seqs_ref[ref_hash[rname]]
                [nw_ref, nw_contig] = extract_seqs_for_edlib(ref_seq, contig_seq, rstart, rend, qstart, qend)

                with open(nw_ref_path, 'w') as fp_nw_ref:
                    fp_nw_ref.write('>%s\n%s\n' % (rname, nw_ref))
                with open(nw_contig_path, 'w') as fp_nw_contig:
                    fp_nw_contig.write('>%s\n%s\n' % (qname, nw_contig))

                command = '%s %s %s -m NW' % (EDLIB_PATH, nw_contig_path, nw_ref_path)
                [rc, rstdout, rstderr] = execute_command_with_ret(DRY_RUN, command, silent=True)
                scores = parse_edlib_scores(rstdout)
                if len(scores) == 0:
                    # Always reported, even in silent mode
                    log('ERROR: len(scores) == 0!\nreturn code: %d\nrstdout:\n%s' % (rc, rstdout), sys.stderr)
                    continue

                # NOTE(review): abs(end - start + 1) is one short of the true
                # span if end < start can occur for reverse fragments;
                # confirm that parse_coords_lines orders the coordinates.
                qlen = abs(qend - qstart + 1)
                normalized_edit_dist = float(scores[0]) / float(qlen)
                accuracy = (1.0 - normalized_edit_dist)

                # Extend the fragment record to match the CSV header columns
                frag.append(abs(rend - rstart + 1))
                frag.append(qlen)
                frag.append(scores[0])
                frag.append(100.0 * normalized_edit_dist)
                frag.append(100.0 * accuracy)

                avg_accuracy_contig += accuracy
                avg_id_contig += identity
                num_scored_frags += 1

                fp_frags.write('f\t' + '\t'.join([str(val) for val in frag]) + '\n')

            if num_scored_frags > 0:
                avg_accuracy_contig /= float(num_scored_frags)
                avg_id_contig /= float(num_scored_frags)
                log('Average ID for contig "%s": %f%%' % (contig_name, avg_id_contig), sys.stderr, silent=silent)
                log('Average accuracy for contig "%s": %f%%' % (contig_name, 100.0*avg_accuracy_contig), sys.stderr, silent=silent)
                log('', sys.stderr, silent=silent)

                avg_accuracy_overall += avg_accuracy_contig
                avg_id_overall += avg_id_contig
                num_valid_contigs += 1
            else:
                log('ERROR: There are no frags for contig "%s"! Continuing, will not be taken into account.' % (contig_name), sys.stderr, silent=silent)

            # Summary row for the contig, padded to the same column count
            fp_frags.write('s\t0\t0\t0\t0\t-\t%s\t-\t%f\t0\t0\t0\t0\t%f\n' % (summary_rname, avg_id_contig, 100.0*avg_accuracy_contig))

    if num_valid_contigs > 0:
        avg_accuracy_overall /= float(num_valid_contigs)
        avg_id_overall /= float(num_valid_contigs)
    else:
        log('ERROR: There are no valid contigs in file "%s"! None of the contigs had valid MUMmer alignments.\n' % (contig_path), sys.stderr, silent=silent)

    log('Draft assembly: "%s"' % (contig_path), sys.stderr, silent=silent)
    log('Overall average ID for the draft assembly: %f%%' % (avg_id_overall), sys.stderr, silent=silent)
    log('Overall average accuracy for the draft assembly: %f%%' % (100.0*avg_accuracy_overall), sys.stderr, silent=silent)
    log('', sys.stderr, silent=silent)

    sys.stdout.write('================= Summary ===================\n')
    sys.stdout.write('Draft assembly: "%s"\n' % (contig_path))
    sys.stdout.write('Overall average ID for the draft assembly: %f%%\n' % (avg_id_overall))
    sys.stdout.write('Overall average accuracy for the draft assembly: %f%%\n' % (100.0*avg_accuracy_overall))
    sys.stdout.write('=============================================\n\n')
Exemple #44
0
def processData(datafolder, resultfile, annotationfile):
    """Evaluate BBMap alignments of simulated reads and print summary counts.

    The SAM file is loaded with ``RNAseqEval.load_and_process_SAM``
    (``BBMapFormat=True``) and the annotations are indexed by gene name.
    Every query name is expected to look like
    ``<folderKey>_S<ref>_<num>[_part...]``: ``folderKey`` selects a simulation
    folder through the module-level ``simFolderDict`` and ``<ref>`` selects
    the ``sd_NNNN.ref`` / ``.fastq`` / ``.maf`` files inside that folder.
    The MAF file supplies the original start and length of the read, from
    which the expected per-exon partial alignments are derived and compared
    with the reported SAM lines.  The resulting counters are written to
    stdout; progress and warnings go to stderr.

    Besides ``simFolderDict`` this function relies on the module-level names
    ``read_fastq``, ``interval_equals``, ``interval_overlaps`` and
    ``P_CHECK_STRAND``.

    Args:
        datafolder: Root folder that contains the simulation folders.
        resultfile: Path of the SAM file with the mappings to evaluate.
        annotationfile: Path of the annotation file describing the genes.

    Raises:
        Exception: If a query name is malformed, its folder key is unknown,
            one of the simulation files is missing, or the read cannot be
            found in the MAF file.
        KeyError: If the gene named in the ``.ref`` header has no annotation.
        ValueError: If the reference or query number in a query name is not
            an integer.
    """
    # The report object is passed to the SAM loader and later supplies the
    # total number of alignments for the summary.
    report = EvalReport(ReportType.FASTA_REPORT)
    paramdict = {}

    sys.stderr.write(
        '\n(%s) Loading and processing SAM file with mappings ... ' %
        datetime.now().time().isoformat())
    all_sam_lines = RNAseqEval.load_and_process_SAM(resultfile,
                                                    paramdict,
                                                    report,
                                                    BBMapFormat=True)

    annotations = Annotation_formats.Load_Annotation_From_File(annotationfile)

    s_num_multiexon_genes = 0

    # Index annotations by gene name; on duplicates the first one is kept.
    annotation_dict = {}
    for annotation in annotations:
        if annotation.genename in annotation_dict:
            sys.stderr.write(
                '\nWARNING: anotation with name %s already in the dictionary!'
                % annotation.genename)
        else:
            annotation_dict[annotation.genename] = annotation
        if len(annotation.items) > 1:
            s_num_multiexon_genes += 1

    # Statistics describing the quality of the mapping.  Several of these are
    # accumulated for inspection only and are not part of the printed summary.
    s_whole_alignment_hits = 0
    s_whole_alignment_misses = 0
    s_partial_alignment_hits = 0
    s_partial_alignment_misses = 0
    s_num_start_hits = 0
    s_num_end_hits = 0
    s_num_start_end_hits = 0

    s_num_fw_strand = 0
    s_num_rv_strand = 0

    s_num_split_alignment = 0
    s_num_oversplit_alignment = 0  # Alignments that have more parts than exons

    s_num_good_alignments = 0

    s_num_badchrom_alignments = 0

    s_maf_suspicious_alignments = 0
    s_maf_bad_alignments = 0
    s_maf_good_alignments = 0

    s_maf_split_reads = 0
    s_maf_good_split_alignments = 0
    s_maf_bad_split_alignments = 0

    s_maf_hit_all_parts = 0
    s_maf_hit_one_part = 0
    s_maf_eq_one_part = 0
    s_maf_multihit_parts = 0

    s_maf_split_hit_all_parts = 0
    s_maf_split_hit_one_part = 0
    s_maf_split_eq_one_part = 0

    s_num_potential_bad_strand = 0

    # Allowed shift (in positions) when comparing interval boundaries.
    allowed_inacc = 5

    # All samlines in a list should have the same query name
    for samline_list in all_sam_lines:
        qname = samline_list[0].qname

        if len(samline_list) > 1:
            s_num_split_alignment += 1

        # Warn if the SAM loader grouped lines with different query names.
        for samline in samline_list[1:]:
            if samline.qname != qname:
                sys.stderr.write(
                    '\nWARNING: two samlines in the same list with different query names (%s/%s)'
                    % (qname, samline.qname))

        # Everything before the first underscore is the simulation folder
        # key, everything after it is the simulated query name.
        pos = qname.find('_')
        if pos < 0:
            raise Exception('Invalid query name in results file (%s)!' % qname)

        simFolderKey = qname[:pos]
        if simFolderKey not in simFolderDict:
            raise Exception('Bad simulation folder short name (%s)!' %
                            simFolderKey)
        simFolder = simFolderDict[simFolderKey]
        simQName = qname[pos + 1:]

        # Due to error in data preparation, have to make some extra processing
        if simQName[:6] == 'SimG2_':
            simQName = simQName[6:]

        # All simulation files share the same prefix regardless of the folder.
        simFileSuffix = 'sd'

        pos = simQName.find('_')
        pos2 = simQName.find('_part')
        if pos < 0:
            raise Exception(
                'Invalid simulated query name in results file (%s)!' %
                simQName)

        # BBMap splits long queries into parts and extends the query name
        # with '_part_#', which has to be ignored.
        if pos2 != -1:
            simQName = simQName[:pos2]

        simRefNumber = int(simQName[1:pos])
        # Parsed only to validate the name: a non-numeric query number raises
        # ValueError here, exactly as before.
        int(simQName[pos + 1:])

        simFileName = simFileSuffix + '_%04d' % simRefNumber
        simRefFileName = simFileName + '.ref'
        simSeqFileName = simFileName + '.fastq'
        simMafFileName = simFileName + '.maf'

        simFilePath = os.path.join(datafolder, simFolder)
        simRefFilePath = os.path.join(simFilePath, simRefFileName)
        simSeqFilePath = os.path.join(simFilePath, simSeqFileName)
        simMafFilePath = os.path.join(simFilePath, simMafFileName)

        if not os.path.exists(simRefFilePath):
            raise Exception(
                'Reference file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simSeqFilePath):
            raise Exception(
                'Sequence file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simMafFilePath):
            raise Exception(
                'Sequence alignment (MAF) for simulated read %s does not exist!'
                % qname)

        # The first header of the reference file names the gene the read was
        # simulated from.
        [headers, seqs, quals] = read_fastq(simRefFilePath)
        simGeneName = headers[0]
        annotation = annotation_dict[simGeneName]

        if len(samline_list) > len(annotation.items):
            s_num_oversplit_alignment += 1

        # Read the MAF file to get the original position and length of the
        # simulated read.  The record name is the second item of an 's' line;
        # the values are taken from the last 'ref' record that precedes the
        # record of the read itself.
        maf_startpos = maf_length = 0
        # Initialised so that a MAF file without any 's' record triggers the
        # descriptive exception below instead of a NameError.
        maf_qname = None
        with open(simMafFilePath, 'rU') as maffile:
            for line in maffile:
                if line[0] == 's':
                    elements = line.split()
                    maf_qname = elements[1]
                    if maf_qname == 'ref':
                        maf_startpos = int(elements[2])
                        maf_length = int(elements[3])
                    if maf_qname == simQName:
                        break

        if maf_qname != simQName:
            raise Exception('ERROR: could not find query %s in maf file %s' %
                            (qname, simMafFileName))

        # Calculating expected partial alignments from MAF and annotations.

        # 1. Find the exon in which the read starts; afterwards maf_startpos
        #    is the offset inside that exon.
        exon_idx = 0
        while annotation.items[exon_idx].getLength() < maf_startpos:
            maf_startpos -= annotation.items[exon_idx].getLength()
            exon_idx += 1

        # 2. Fill up consecutive exons until maf_length is used up.
        expected_partial_alignments = []
        while maf_length > 0:
            start = annotation.items[exon_idx].start + maf_startpos
            end = annotation.items[exon_idx].end
            assert start <= end
            length = end - start + 1
            if length <= maf_length:
                expected_partial_alignments.append((start, end))
                maf_length -= length
            else:
                expected_partial_alignments.append((start, start + maf_length))
                maf_length = 0
            exon_idx += 1

            # Start position should only be considered for the first exon
            maf_startpos = 0

        numparts = len(expected_partial_alignments)
        # For each expected partial alignment, count how many reported
        # partial alignments overlap it / equal it.
        part_hits = [0] * numparts
        part_eqs = [0] * numparts

        isSplitRead = numparts > 1
        if isSplitRead:
            s_maf_split_reads += 1

        gene_chrom = RNAseqEval.getChromName(annotation.seqname)

        if RNAseqEval.getChromName(samline_list[0].rname) != gene_chrom:
            s_num_badchrom_alignments += 1
        else:
            if len(samline_list) != numparts:
                s_maf_suspicious_alignments += 1

            good_alignment = True
            for k, samline in enumerate(samline_list):
                # NOTE(review): the start position is used as-is here, while
                # the exon statistics further below use samline.pos - 1.
                sl_startpos = samline.pos
                reflength = samline.CalcReferenceLengthFromCigar()
                sl_endpos = sl_startpos + reflength
                sl_interval = (sl_startpos, sl_endpos)

                # Compare the samline to the expected partial alignment at
                # the same index; a surplus samline makes the read bad.
                if k < numparts:
                    exp_start, exp_end = expected_partial_alignments[k]
                    if (abs(sl_startpos - exp_start) > allowed_inacc
                            or abs(sl_endpos - exp_end) > allowed_inacc):
                        good_alignment = False
                else:
                    good_alignment = False

                # Compare the samline to all expected partial alignments.
                for part, expected in enumerate(expected_partial_alignments):
                    if interval_equals(sl_interval, expected, allowed_inacc):
                        part_eqs[part] += 1
                    if interval_overlaps(sl_interval, expected, allowed_inacc):
                        part_hits[part] += 1

            if good_alignment:
                s_maf_good_alignments += 1
                if isSplitRead:
                    s_maf_good_split_alignments += 1
            else:
                s_maf_bad_alignments += 1
                if isSplitRead:
                    s_maf_bad_split_alignments += 1
                # TODO: check which alignments are bad and why.
                # Only the last samline of the read is checked here (the
                # first one was already checked above).
                if RNAseqEval.getChromName(
                        samline_list[-1].rname) != gene_chrom:
                    s_num_badchrom_alignments += 1

        # Summarise the per-part counters.  With no expected parts at all,
        # allHits stays True, as it did before.
        oneHit = any(count > 0 for count in part_hits)
        allHits = all(count != 0 for count in part_hits)
        multiHit = any(count > 1 for count in part_hits)
        oneEq = any(count > 0 for count in part_eqs)

        if oneHit:
            s_maf_hit_one_part += 1
            if isSplitRead:
                s_maf_split_hit_one_part += 1
        if allHits:
            s_maf_hit_all_parts += 1
            if isSplitRead:
                s_maf_split_hit_all_parts += 1
        if oneEq:
            s_maf_eq_one_part += 1
            if isSplitRead:
                s_maf_split_eq_one_part += 1
        if multiHit:
            s_maf_multihit_parts += 1

        num_start_hits = 0
        num_end_hits = 0
        num_hits = 0

        num_partial_alignements = len(samline_list)
        whole_alignment_hit = False
        for samline in samline_list:
            startpos = samline.pos - 1
            reflength = samline.CalcReferenceLengthFromCigar()
            endpos = startpos + reflength

            # SAM flag bit 16 distinguishes the two strands.
            if (samline.flag & 16) == 0:
                readstrand = Annotation_formats.GFF_STRANDFW
                s_num_fw_strand += 1
            else:
                readstrand = Annotation_formats.GFF_STRANDRV
                s_num_rv_strand += 1

            same_chrom = RNAseqEval.getChromName(samline.rname) == gene_chrom

            if (same_chrom and readstrand != annotation.strand
                    and annotation.overlapsGene(startpos, endpos)):
                s_num_potential_bad_strand += 1

            if (same_chrom and annotation.overlapsGene(startpos, endpos)
                    and (not P_CHECK_STRAND
                         or readstrand == annotation.strand)):
                whole_alignment_hit = True
                s_partial_alignment_hits += 1
            else:
                s_partial_alignment_misses += 1

            # Checking how well partial alignments match exons
            startsItem = False
            endsItem = False
            for item in annotation.items:
                if item.overlapsItem(startpos, endpos):
                    num_hits += 1
                if item.startsItem(startpos, endpos):
                    num_start_hits += 1
                    startsItem = True
                if item.endsItem(startpos, endpos):
                    num_end_hits += 1
                    endsItem = True
                # NOTE(review): this runs once per exon, so the counter also
                # grows for every exon after the one that set both flags.
                # Kept as-is to preserve the reported numbers - confirm
                # whether it should be counted once per samline instead.
                if startsItem and endsItem:
                    s_num_start_end_hits += 1

        s_num_start_hits += num_start_hits
        s_num_end_hits += num_end_hits

        # I'm allowing one start and one end not to match starts and ends of exons
        if (num_hits == num_partial_alignements) and (
                num_start_hits + num_end_hits >=
                2 * num_partial_alignements - 2):
            s_num_good_alignments += 1

        if whole_alignment_hit:
            s_whole_alignment_hits += 1
        else:
            s_whole_alignment_misses += 1

    # Printing out results : NEW
    # Variables names matching RNA benchmark paper
    sys.stdout.write('\n\nAnalysis results:')
    sys.stdout.write('\nOriginal Samlines: %d' % report.num_alignments)
    sys.stdout.write(
        '\nUsable whole alignments (with valid CIGAR string): %d' %
        len(all_sam_lines))
    sys.stdout.write('\nAnnotations: %d' % len(annotation_dict))
    sys.stdout.write('\nMultiexon genes: %d' % s_num_multiexon_genes)

    sys.stdout.write('\nNumber of exon start hits: %d' % s_num_start_hits)
    sys.stdout.write('\nNumber of exon end hits: %d' % s_num_end_hits)
    sys.stdout.write('\nNumber of exon start and end hits: %d' %
                     s_num_start_end_hits)
    sys.stdout.write('\nNumber of good whole alignments: %d' %
                     s_num_good_alignments)

    sys.stdout.write('\nMAF: Correct alignment: %d' % s_maf_good_alignments)
    sys.stdout.write('\nMAF: Hit all parts: %d' % s_maf_hit_all_parts)
    sys.stdout.write('\nMAF: Hit at least one part: %d' % s_maf_hit_one_part)
    sys.stdout.write('\nMAF: Equals at least one part: %d' % s_maf_eq_one_part)

    sys.stdout.write('\nMAF: Number of split reads: %d' % s_maf_split_reads)
    sys.stdout.write('\nMAF: Correct alignment, SPLIT read: %d' %
                     s_maf_good_split_alignments)
    sys.stdout.write('\nMAF: Hit all parts, SPLIT read: %d' %
                     s_maf_split_hit_all_parts)
    sys.stdout.write('\nMAF: Hit at least one part, SPLIT read: %d' %
                     s_maf_split_hit_one_part)
    sys.stdout.write('\nMAF: Equals at least one part, SPLIT read: %d' %
                     s_maf_split_eq_one_part)

    sys.stdout.write('\nDone!\n')
Exemple #45
0
def run_poa_sequentially_v2(seq_path, out_consensus_file):
    """Build a consensus of the sequences in seq_path window by window.

    The sequences are cut into coordinate windows of 5000 positions.  For
    every window the subsequences are extracted with ``fastqfilter.py
    subseqs``, aligned with ``poa`` (global, progressive, blosum80 matrix)
    and a per-column majority vote over the resulting alignment gives the
    consensus of that window.  A trailing remainder shorter than one window
    is merged into the last window instead of being processed on its own.

    Two files are written: ``out_consensus_file`` holds a single FASTA record
    with all window consensuses concatenated, and ``tmp.consensus.chunks.fasta``
    (next to it) holds one record per window.  The intermediate files
    ``tmp.subseq.fasta`` and ``tmp.subseq.fasta.pir`` are created in the same
    directory and are overwritten for every window.

    Args:
        seq_path: Path of the input sequences.
        out_consensus_file: Path of the consensus FASTA file to write.
    """
    # os.path.join keeps the temporary files next to the output even when
    # out_consensus_file has no directory part; plain '%s/...' formatting
    # would then point at the filesystem root.
    out_dir = os.path.dirname(out_consensus_file)
    temp_subseq_file = os.path.join(out_dir, 'tmp.subseq.fasta')
    temp_msa_file = os.path.join(out_dir, 'tmp.subseq.fasta.pir')
    out_consensus_file_chunks = os.path.join(out_dir, 'tmp.consensus.chunks.fasta')

    # The context managers close both outputs even if one of the external
    # commands or the parsing of their output fails.
    with open(out_consensus_file, 'w') as fp_out_all, \
            open(out_consensus_file_chunks, 'w') as fp_out_chunks:
        timestamp = strftime("%Y/%m/%d %H:%M:%S", gmtime())
        fp_out_all.write('>Consensus_with_POA all %s\n' % (timestamp))

        sys.stdout.write('seq_path = "%s"\n' % (seq_path))

        # Only the maximum sequence length is needed to drive the windows.
        [_ret_string, _num_seqs, _total_seq_len, _average_seq_len,
         max_seq_len] = fastqparser.count_seq_length(seq_path)

        window_len = 5000

        start_coord = 0
        while start_coord < max_seq_len:
            end_coord = start_coord + window_len
            # Extend the window to the end when less than one full window
            # would remain after it.
            if end_coord > (max_seq_len - window_len):
                end_coord = max_seq_len

            sys.stderr.write('Window: start = %d, end = %d\n' % (start_coord, end_coord))
            execute_command('%s/fastqfilter.py subseqs %s %d %d %s' % (SAMSCRIPTS_PATH, seq_path, start_coord, end_coord, temp_subseq_file))
            execute_command('%s/poaV2/poa -do_global -do_progressive -read_fasta %s -pir %s %s/poaV2/blosum80.mat' % (TOOLS_PATH, temp_subseq_file, temp_msa_file, TOOLS_PATH))

            timestamp = strftime("%Y/%m/%d %H:%M:%S", gmtime())
            fp_out_chunks.write('>Consensus_with_POA %d-%d %s\n' % (start_coord, end_coord, timestamp))
            [_headers, seqs, _quals] = fastqparser.read_fastq(temp_msa_file)

            cons_seq = _poa_majority_consensus(seqs)

            fp_out_all.write('%s' % (cons_seq))
            fp_out_chunks.write('%s\n' % (cons_seq))

            start_coord = end_coord

        fp_out_all.write('\n')


def _poa_majority_consensus(seqs):
    """Return the per-column majority consensus of the aligned rows in seqs.

    Every row is indexed up to the length of the first row.  Columns in
    which the gap symbol '.' wins the vote contribute nothing to the result.
    Symbols other than A, C, T, G and '.' are counted under their own key
    instead of raising KeyError.
    """
    consensus = []
    for col in xrange(0, len(seqs[0])):
        base_counts = {'A': 0, 'C': 0, 'T': 0, 'G': 0, '.': 0}
        for row in seqs:
            symbol = row[col]
            base_counts[symbol] = base_counts.get(symbol, 0) + 1
        # Same selection as before: sort ascending by count, take the last.
        sorted_base_counts = sorted(base_counts.items(), key=operator.itemgetter(1))
        top_symbol = sorted_base_counts[-1][0]
        if top_symbol != '.':
            consensus.append(top_symbol)
    return ''.join(consensus)
def processData(datafolder, resultfile, annotationfile, paramdict):

    split_qnames = False
    filename = ''
    if '--split-qnames' in paramdict:
        split_qnames = True
        filename = paramdict['--split-qnames'][0]

    filename_correct = filename + '_correct.names'
    filename_hitall = filename + '_hitall.names'
    filename_hitone = filename + '_hitone.names'
    filename_bad = filename + '_incorrect.names'
    filename_unmapped = filename + '_unmapped.names'

    printMap = False
    filename_mapping = ''
    if '--print_mapping' in paramdict:
        filename_mapping = paramdict['--print_mapping'][0]
        printMap = True

    file_correct = None
    file_hitall = None
    file_hitone = None
    file_bad = None
    file_unmapped = None
    folder = os.getcwd()

    # If splittng qnames into files, have to open files first
    if split_qnames:
        file_correct = open(os.path.join(folder, filename_correct), 'w+')
        file_hitall = open(os.path.join(folder, filename_hitall), 'w+')
        file_hitone = open(os.path.join(folder, filename_hitone), 'w+')
        file_bad = open(os.path.join(folder, filename_bad), 'w+')

    # Loading results SAM file
    report = EvalReport(ReportType.FASTA_REPORT)    # not really needed, used for unmapped query names
    # Have to preserve the paramdict
    # paramdict = {}

    sys.stderr.write('\n(%s) Loading and processing SAM file with mappings ... ' % datetime.now().time().isoformat())
    all_sam_lines = RNAseqEval.load_and_process_SAM(resultfile, paramdict, report, BBMapFormat = True)


    # Reading annotation file
    annotations = Annotation_formats.Load_Annotation_From_File(annotationfile)

    s_num_multiexon_genes = 0

    mapfile = None
    if printMap:
        mapfile = open(filename_mapping, 'w+')

    # Hashing annotations according to name
    annotation_dict = {}
    for annotation in annotations:
        if annotation.genename in annotation_dict:
            sys.stderr.write('\nWARNING: anotation with name %s already in the dictionary!' % annotation.genename)
        else:
            annotation_dict[annotation.genename] = annotation
        if len(annotation.items) > 1:
            s_num_multiexon_genes += 1


    # Statistical information for evaluating the qualitiy of mapping
    s_gene_hits = 0
    s_gene_misses = 0
    s_whole_alignment_hits = 0
    s_whole_alignment_misses = 0
    s_partial_alignment_hits = 0
    s_partial_alignment_misses = 0
    s_num_start_hits = 0
    s_num_end_hits = 0
    s_num_start_end_hits = 0

    s_num_fw_strand = 0
    s_num_rv_strand = 0

    s_num_split_alignment = 0
    s_num_oversplit_alignment = 0       # Alignments that have more parts than exons

    s_num_good_alignments = 0

    s_num_badchrom_alignments = 0

    s_maf_suspicious_alignments = 0
    s_maf_bad_alignments = 0
    s_maf_good_alignments = 0

    s_maf_split_reads = 0
    s_maf_good_split_alignments = 0
    s_maf_bad_split_alignments = 0

    s_maf_hit_all_parts = 0
    s_maf_hit_one_part = 0
    s_maf_eq_one_part = 0
    s_maf_multihit_parts = 0

    s_maf_split_hit_all_parts = 0
    s_maf_split_hit_one_part = 0
    s_maf_split_eq_one_part = 0

    s_maf_miss_alignment = 0
    s_maf_too_many_alignments = 0

    s_num_potential_bad_strand = 0

    allowed_inacc = Annotation_formats.DEFAULT_ALLOWED_INACCURACY       # Allowing some shift in positions
    min_overlap = Annotation_formats.DEFAULT_MINIMUM_OVERLAP       		# Minimum overlap that is considered

    # Setting allowed_inaccuracy from parameters
    if '--allowed_inacc' in paramdict:
        allowed_inacc = int(paramdict['--allowed_inacc'][0])
    elif '-ai' in paramdict:
        allowed_inacc = int(paramdict['-ai'][0])

    # Setting minimum overlap from parameters
    if '--allowed_inacc' in paramdict:
        min_overlap = int(paramdict['--allowed_inacc'][0])
    elif '-mo' in paramdict:
        min_overlap = int(paramdict['-mo'][0])

    # All samlines in a list should have the same query name
    for samline_list in all_sam_lines:
        qname = samline_list[0].qname

        isSplitAlignment = False
        if len(samline_list) > 1:
            s_num_split_alignment += 1
            isSplitAlignment = True

        # Checking the SAM file if all samlines in a list have the same qname
        for samline in samline_list[1:]:
            if samline.qname != qname:
                sys.stderr.write('\nWARNING: two samlines in the same list with different query names (%s/%s)' % (qname, samline.qname))

        # Look for the first underscore in query name
        # Everything before that is the simulation folder name
        # Everything after that is simulated query name
        pos = qname.find('_')
        if pos < 0:
            raise Exception('Invalid query name in results file (%s)!' % qname)

        simFolderKey = qname[:pos]
        if simFolderKey not in simFolderDict:
            # import pdb
            # pdb.set_trace()
            raise Exception('Bad simulation folder short name (%s)!' % simFolderKey)
        simFolder = simFolderDict[simFolderKey]
        simQName = qname[pos+1:]

        # Due to error in data preparation, have to make some extra processing
        if simQName[:6] == 'SimG2_':
            simQName = simQName[6:]


#        if simFolderKey == 'SimG1':
#            simFileSuffix = 'g1'
#        elif simFolderKey == 'SimG2':
#            simFileSuffix = 'g2'
#        elif simFolderKey == 'SimG3':
#            simFileSuffix = 'g3'
#        else:
#            simFileSuffix = 'sd'

        simFileSuffix = 'sd'


        pos = simQName.find('_')
        pos2 = simQName.find('_part')
        if pos < 0:
            raise Exception('Invalid simulated query name in results file (%s)!' % simQName)

        simQLetter = simQName[0]       # Should always be S

        # BBMap separates a query into smaller parts he can manage
        # Extends query with '_part_#', which has to be ignored
        if pos2 <> -1:
            simQName = simQName[:pos2]

        simRefNumber = int(simQName[1:pos])
        simQNumber = int(simQName[pos+1:])
        simFileName = simFileSuffix + '_%04d' % simRefNumber
        simRefFileName = simFileName + '.ref'
        simSeqFileName = simFileName + '.fastq'
        simMafFileName = simFileName + '.maf'

        simFilePath = os.path.join(datafolder, simFolder)
        simRefFilePath = os.path.join(simFilePath, simRefFileName)
        simSeqFilePath = os.path.join(simFilePath, simSeqFileName)
        simMafFilePath = os.path.join(simFilePath, simMafFileName)

        if not os.path.exists(simRefFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception('Reference file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simSeqFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception('Sequence file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simMafFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception('Sequence alignment (MAF) for simulated read %s does not exist!' % qname)

        # Reading reference file
        [headers, seqs, quals] = read_fastq(simRefFilePath)
        simGeneName = headers[0]
        annotation = annotation_dict[simGeneName]       # Getting the correct annotation

        if len(samline_list) > len(annotation.items):
            # sys.stderr.write('\nWARNING: A number of partial alignments exceeds the number of exons for query %s! (%d / %d)' % (qname, len(samline_list), len(annotation.items)))
            s_num_oversplit_alignment += 1

        # Reading MAF file to get original position and length of the simulated read
        # Query name should be a second item
        maf_startpos = maf_length = 0
        maf_strand = '0'
        maf_reflen = 0
        i = 0
        with open(simMafFilePath, 'rU') as maffile:
            i += 1
            for line in maffile:
                if line[0] == 's':
                    elements = line.split()
                    maf_qname = elements[1]
                    if maf_qname == 'ref':              # Have to remember data for the last reference before the actual read
                        maf_startpos = int(elements[2])
                        maf_length = int(elements[3])
                        maf_strand = elements[4]
                        maf_reflen = int(elements[5])
                    if maf_qname == simQName:
                        # maf_startpos = int(elements[2])
                        # maf_length = int(elements[3])
                        break

        if maf_qname != simQName:
            # import pdb
            # pdb.set_trace()
            raise Exception('ERROR: could not find query %s in maf file %s' % (qname, simMafFileName))

        # IMPORTANT: If the reads were generated from an annotation on reverse strand
        #            expected partial alignments must be reversed
        if annotation.strand == Annotation_formats.GFF_STRANDRV:
            maf_startpos = maf_reflen - maf_length - maf_startpos

        # Saving "maf_length" and "maf_startpos" to be able to check it later
        t_maf_length = maf_length
        t_maf_startpos = maf_startpos

        # Calculating expected partial alignmetns from MAF and annotations

        # 1. Calculating the index of the first exon
        # i - the index of exon currently being considered
        i = 0
        while annotation.items[i].getLength() < maf_startpos:
            maf_startpos -= annotation.items[i].getLength()
            i += 1

        # Calculating expected partial alignments by filling up exons using maf_length
        expected_partial_alignments = []
        while maf_length > 0:
            start = annotation.items[i].start + maf_startpos
            end = annotation.items[i].end
            assert start <= end
            
            # OLD: length = end-start+1
            # KK: End is already indicating position after the last base, so adding one when callculating length is not correct
            length = end - start
            if length <= maf_length:
                expected_partial_alignments.append((start, end))
                maf_length -= length
                i += 1
            else:
                expected_partial_alignments.append((start, start + maf_length))
                maf_length = 0
                i += 1

            # Start position should only be considered for the first exon
            maf_startpos = 0

        # import pdb
        # pdb.set_trace()

        numparts = len(expected_partial_alignments)
        # For each part of expected partial alignments, these maps will count
        # how many real partial alignments overlap or equal it
        parthitmap = {(i+1):0 for i in xrange(numparts)}
        parteqmap = {(i+1):0 for i in xrange(numparts)}

        isSplitRead = False
        if len(expected_partial_alignments) > 1:
            s_maf_split_reads += 1
            isSplitRead = True

        oneHit = False
        allHits = False
        oneEq = False
        multiHit = False
        good_alignment = False
        has_miss_alignments = False

        if RNAseqEval.getChromName(samline_list[0].rname) != RNAseqEval.getChromName(annotation.seqname):
            # import pdb
            # pdb.set_trace()
            s_num_badchrom_alignments += 1
        else:
            if len(samline_list) != len(expected_partial_alignments):
            # sys.stderr.write('\nWARNING: suspicious number of alignments for query %s!' % qname)
                s_maf_suspicious_alignments += 1
            # import pdb
            # pdb.set_trace()

            good_alignment = True
            k = 0
            for samline in samline_list:
                # sl_startpos = samline.pos - 1   # SAM positions are 1-based
                sl_startpos = samline.pos
                reflength = samline.CalcReferenceLengthFromCigar()
                sl_endpos = sl_startpos + reflength

                # Comparing a samline to the corresponding expected partial alignment
                if k < len(expected_partial_alignments):
                    expected_alignement = expected_partial_alignments[k]
                    maf_startpos = expected_alignement[0]
                    maf_endpos = expected_alignement[1]
                    if abs(sl_startpos - maf_startpos) > allowed_inacc or abs(sl_endpos - maf_endpos) > allowed_inacc:
                        good_alignment = False
                else:
                    good_alignment = False
                k += 1

                # Comparing a samline to all expected partial alignments
                for i in xrange(len(expected_partial_alignments)):
                    expected_alignement = expected_partial_alignments[i]
                    maf_startpos = expected_alignement[0]
                    maf_endpos = expected_alignement[1]

                    if interval_equals((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc, min_overlap):
                        parteqmap[i+1] += 1
                    if interval_overlaps((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc, min_overlap):
                        parthitmap[i+1] += 1

            has_miss_alignments = False
            for expected_alignement in expected_partial_alignments:
                maf_startpos = expected_alignement[0]
                maf_endpos = expected_alignement[1]
                overlap = False
                for samline in samline_list:
                    sl_startpos = samline.pos
                    reflength = samline.CalcReferenceLengthFromCigar()
                    sl_endpos = sl_startpos + reflength
                    if interval_overlaps((sl_startpos, sl_endpos), (maf_startpos, maf_endpos), allowed_inacc, min_overlap):
                        overlap = True
                if not overlap:
                    has_miss_alignments = True
                    break

            if len(samline_list) < len(expected_partial_alignments):
                s_maf_too_many_alignments += 1

            # Testing the evaluation process
            # import pdb
            # pdb.set_trace()
            if len(samline_list) <> len(expected_partial_alignments):
                good_alignment = False

            if good_alignment:
                s_maf_good_alignments += 1

                # Writting qnames to files
                if split_qnames:
                    file_correct.write(samline_list[0].qname + '\n')

                if isSplitRead:
                    s_maf_good_split_alignments += 1
            else:
                # import pdb
                # pdb.set_trace()
                s_maf_bad_alignments += 1
                if isSplitRead:
                    s_maf_bad_split_alignments += 1
                # TODO: check which alignments are bad and why
                # If the choromosome is different its obviously a bad alignment
                if RNAseqEval.getChromName(samline.rname) == RNAseqEval.getChromName(annotation.seqname):
                    # import pdb
                    # pdb.set_trace()
                    pass
                else:
                    s_num_badchrom_alignments += 1


            # Analyzing parthitmap and parteqmap
            oneHit = False
            allHits = True
            oneEq = False
            multiHit = False
            for i in xrange(numparts):
                if parthitmap[i+1] > 0:
                    oneHit = True
                if parthitmap[i+1] == 0:
                    allHits = False
                if parthitmap[i+1] > 1:
                    multiHit = True
                if parteqmap[i+1] > 0:
                    oneEq = True

        if printMap:
            status = 'INCORRECT'
            if good_alignment:
                status = 'CORRECT'
            elif allHits:
                status = 'HITALL'
            elif oneHit:
                status = 'HITONE'
            mapfile.write('QNAME: %s, STATUS: %s\n\n' % (samline_list[0].qname, status))
            mapfile.write('EXPECTED (%s, %s):\t' % (RNAseqEval.getChromName(annotation.seqname), annotation.strand))
            for epa in expected_partial_alignments:
                mapfile.write('(%d, %d)\t' % (epa[0], epa[1]))
            mapfile.write('\n')
            if samline_list[0].flag & 16 == 0:
                readstrand = Annotation_formats.GFF_STRANDFW
            else:
                readstrand = Annotation_formats.GFF_STRANDRV
            mapfile.write('ACTUAL   (%s, %s):\t' % (RNAseqEval.getChromName(samline_list[0].rname), readstrand))
            for samline in samline_list:
                mapfile.write('(%d, %d)\t' % (samline.pos, samline.pos + samline.CalcReferenceLengthFromCigar()))
            mapfile.write('\n\n')


        if oneHit:
            s_maf_hit_one_part += 1
            if isSplitRead:
                s_maf_split_hit_one_part += 1

            # Writting qnames to files
            if split_qnames:
                file_hitone.write(samline_list[0].qname + '\n')

            if not allHits:
                if '--debug' in paramdict:
                    import pdb
                    pdb.set_trace()

            # Misses are calculated only for alignments that have at least one hit
            if has_miss_alignments:
                s_maf_miss_alignment += 1

        else:
            # Writting qnames to files
            if split_qnames:
                file_bad.write(samline_list[0].qname + '\n')

            # if '--debug' in paramdict:
            #     import pdb
            #     pdb.set_trace()

        if allHits:
            s_maf_hit_all_parts += 1
            if isSplitRead:
                s_maf_split_hit_all_parts += 1

            # Writting qnames to files
            if split_qnames:
                file_hitall.write(samline_list[0].qname + '\n')

        # Sanity check
        if '--debug' in paramdict and good_alignment and not allHits:
            import pdb
            pdb.set_trace()
            pass

        if oneEq:
            s_maf_eq_one_part += 1
            if isSplitRead:
                s_maf_split_eq_one_part += 1
        if multiHit:
            s_maf_multihit_parts += 1

        num_start_hits = 0
        num_end_hits = 0
        num_hits = 0

        num_partial_alignements = len(samline_list)
        whole_alignment_hit = False
        for samline in samline_list:
            startpos = samline.pos - 1
            reflength = samline.CalcReferenceLengthFromCigar()
            endpos = startpos + reflength

            if samline.flag & 16 == 0:
                readstrand = Annotation_formats.GFF_STRANDFW
                s_num_fw_strand += 1
            else:
                readstrand = Annotation_formats.GFF_STRANDRV
                s_num_rv_strand += 1

            chromname = RNAseqEval.getChromName(samline.rname)

            if chromname == RNAseqEval.getChromName(annotation.seqname) and readstrand != annotation.strand and annotation.overlapsGene(startpos, endpos):
                s_num_potential_bad_strand += 1

            if chromname == RNAseqEval.getChromName(annotation.seqname) and annotation.overlapsGene(startpos, endpos) and (not P_CHECK_STRAND or readstrand == annotation.strand):
                whole_alignment_hit = True
                s_partial_alignment_hits += 1
            else:
                s_partial_alignment_misses +=1

            # Checking how well partial alignments match exons
            startsItem = False
            endsItem = False
            for item in annotation.items:
                if item.overlapsItem(startpos, endpos):
                    num_hits += 1
                if item.startsItem(startpos, endpos):
                    num_start_hits += 1
                    startsItem = True
                if item.endsItem(startpos, endpos):
                    num_end_hits += 1
                    endsItem = True
                if startsItem and endsItem:
                    s_num_start_end_hits += 1

        s_num_start_hits += num_start_hits
        s_num_end_hits += num_end_hits

        # I'm allowing one start and one end not to match starts and ends of exons
        if (num_hits == num_partial_alignements) and (num_start_hits + num_end_hits >= 2*num_partial_alignements - 2) :
            s_num_good_alignments += 1
        # else:
        #     if num_hits > 0:
        #         import pdb
        #         pdb.set_trace()

        if whole_alignment_hit:
            s_whole_alignment_hits += 1
        else:
            s_whole_alignment_misses += 1

    if printMap:
        mapfile.close()

    # Writting unmapped query names to a file, if so specified
    if split_qnames:
        with open(filename_unmapped, 'w+') as file_unmapped:
            file_unmapped.write(report.get_unmapped_names())
            file_unmapped.close()

    # Printing out results : NEW
    # Variables names matching RNA benchmark paper
    sys.stdout.write('\n\nAnalysis results:')
    sys.stdout.write('\nOriginal Samlines: %d' % report.num_alignments)
    sys.stdout.write('\nUsable whole alignments (with valid CIGAR string): %d' % len(all_sam_lines))
    sys.stdout.write('\nAnnotations: %d' % len(annotation_dict))
    sys.stdout.write('\nMultiexon genes: %d' % s_num_multiexon_genes)

    sys.stdout.write('\nNumber of exon start hits: %d' % s_num_start_hits)
    sys.stdout.write('\nNumber of exon end hits: %d' % s_num_end_hits)
    sys.stdout.write('\nNumber of exon start and end hits: %d' % s_num_start_end_hits)
    sys.stdout.write('\nNumber of good whole alignments: %d' % s_num_good_alignments)
    sys.stdout.write('\nNumber of alignments mapped to an incorrect chromosome: %d' % s_num_badchrom_alignments)

    sys.stdout.write('\nMAF: Correct alignment: %d' % s_maf_good_alignments)
    sys.stdout.write('\nMAF: Hit all parts: %d' % s_maf_hit_all_parts)
    sys.stdout.write('\nMAF: Hit at least one part: %d' % s_maf_hit_one_part)
    sys.stdout.write('\nMAF: Equals at least one part: %d' % s_maf_eq_one_part)

    sys.stdout.write('\nMAF: Number of split reads: %d' % s_maf_split_reads)
    sys.stdout.write('\nMAF: Correct alignment, SPLIT read: %d' % s_maf_good_split_alignments)
    sys.stdout.write('\nMAF: Hit all parts, SPLIT read: %d' % s_maf_split_hit_all_parts)
    sys.stdout.write('\nMAF: Hit at least one part, SPLIT read: %d' % s_maf_split_hit_one_part)
    sys.stdout.write('\nMAF: Equals at least one part, SPLIT read: %d' % s_maf_split_eq_one_part)

    sys.stdout.write('\nMAF: Partial alignment that misses: %d' % s_maf_miss_alignment)
    sys.stdout.write('\nMAF: More alignments than expected: %d' % s_maf_too_many_alignments)
    sys.stdout.write('\nMAF: Multihit parts (fragmented) alignments: %d' % s_maf_multihit_parts)

    sys.stdout.write('\nDone!\n')

    # Closing file with names
    if split_qnames:
        file_correct.close()
        file_hitall.close()
        file_hitone.close()
        file_bad.close()
Exemple #47
0
def extract_alternate_contigs(single_contig_file, reads_file, out_alt_ctg_file, ref_file=''):
	### Generate file paths for some temporary files.
	path_aligns_basename = '%s/tmp.allreads' % (os.path.dirname(out_alt_ctg_file));
	path_aligns = '%s.sam' % (path_aligns_basename);
	path_aligns_sorted_basename = '%s.sorted' % (path_aligns_basename);
	path_aligns_sorted_sam = '%s.sam' % (path_aligns_sorted_basename);
	path_alt_contig_sams = '%s.altctgs.sam' % (path_aligns_basename);

	if (not os.path.exists(os.path.dirname(out_alt_ctg_file))):
		os.path.makedirs(os.path.dirname(out_alt_ctg_file));

	### Generate alignments.
	# execute_command('%s/graphmap/bin/Linux-x64/graphmap -a anchor -b 3 -r %s -d %s -o %s' % (TOOLS_PATH, single_contig_file, reads_file, path_aligns));
	# execute_command('samtools view -Sb %s | samtools sort - %s && samtools view -h %s.bam > %s' % (path_aligns, path_aligns_sorted_basename, path_aligns_sorted_basename, path_aligns_sorted_sam));

	[ctg_headers, ctg_seqs, ctg_quals] = fastqparser.read_fastq(single_contig_file);
	[headers, all_sam_lines] = utility_sam.LoadSAM(path_aligns_sorted_sam);

	sys.stderr.write('Number of lines in the original SAM file: %d\n' % (len(all_sam_lines)));

	sam_lines = [];
	for sam_line in all_sam_lines:
		if (sam_line.IsMapped() == False):
			continue;
		seq_len = len(sam_line.seq) - sam_line.clip_count_front - sam_line.clip_count_back;
		cigop_counts = sam_line.CountAlignmentOps();
		### Check if the CIGAR string is actually in the extended format.
		if ('M' in cigop_counts):
			sys.stderr.write('Warning: alignment does not contain the *extended* CIGAR format! Skipping alignment.\n');
			exit(1);
		else:
			matches = cigop_counts['='];
			errors = cigop_counts['X'] + cigop_counts['D'] + cigop_counts['I'];

		if (float(matches) / float(seq_len) >= 0.70 and float(errors) / float(seq_len) < 0.40):
			sam_lines.append(sam_line);

	sys.stderr.write('Number of filtered SAM lines (only mapped and with errors below threshold): %d\n' % (len(sam_lines)));

	fp_out_alt_ctg = open(out_alt_ctg_file, 'w');
	fp_out_alt_ctg_sams = open(path_alt_contig_sams, 'w');
	fp_out_alt_ctg_sams.write('\n'.join(headers) + '\n');

	### Find alternate contigs from alignments.
	sams_to_process = sam_lines;
	coverage = 0;
	while (coverage < 100 and len(sams_to_process) > 0):
		coverage += 1;
		print '---------------------------------------';
		print 'Coverage = %d' % (coverage);
		sys.stderr.write('Number of alignments in pool: %d\n' % (len(sams_to_process)));
		contig_sams = [];
		unused_sams = [];
		i = 0;
		candidate_read = i;
		contig_sams.append(sams_to_process[candidate_read]);
		# for candidate_read in xrange((i+1), len(sams_to_process)):
		start1 = sams_to_process[candidate_read].pos - 1;
		end1 = start1 + sams_to_process[candidate_read].CalcReferenceLengthFromCigar();
		print 'candidate: start = %d, end = %d' % (start1, end1);

		while ((candidate_read + 1) < len(sams_to_process)):
			max_overlap_len = 0;
			max_overlap_id = -1;
			# j = candidate_read + 1;
			# while (j < len(sams_to_process)):
			for j in xrange(candidate_read + 1, len(sams_to_process)):
				overlap_len = check_overlap(sams_to_process[candidate_read], sams_to_process[j], 0);
				if (overlap_len == 0):
					print 'break 1';
					print '  j = %d (in the range of %d to %d)' % (j, candidate_read + 1, len(sams_to_process));
					break;
				elif (overlap_len == -1 or overlap_len == -2):	### -1 is for contained sequences, and -2 is for overlaps which are below the threshold.
					# j += 1;
					continue;

				if (max_overlap_id == -1 or overlap_len >= max_overlap_len):
					max_overlap_len = overlap_len;
					max_overlap_id = j;
				# j += 1;

			if (max_overlap_id > 0):
				print '  starting read = %d' % (candidate_read);
				print '  candidate_read = %d' % (max_overlap_id);
				print '  max_overlap_len = %d' % (max_overlap_len);
				print '  unused overlapping reads: %d - %d' % ((candidate_read + 1), max_overlap_id);

				start1 = sams_to_process[max_overlap_id].pos - 1;
				end1 = start1 + sams_to_process[max_overlap_id].CalcReferenceLengthFromCigar();
				print '  candidate: start = %d, end = %d' % (start1, end1);
				unused_sams += sams_to_process[(candidate_read + 1):max_overlap_id];
				candidate_read = max_overlap_id;
				contig_sams.append(sams_to_process[candidate_read]);
			else:
				print 'break 2';
				break;

		print '  unused reads: %d - %d' % ((candidate_read + 1), len(sams_to_process));
		unused_sams += sams_to_process[(candidate_read + 1):len(sams_to_process)];
		sams_to_process = unused_sams + [];

		# if ((candidate_read + 1) == len(sam_lines)):
		# 	break;

		# i += 1;
		# max_overlap_len = 0;
		# max_overlap_id = -1;
		# while (i < len(sam_lines)):
		# 	overlap_len = check_overlap(sam_lines[candidate_read], sam_lines[i + 1]);
		# 	if ((i + 1) >= len(sam_lines) or overlap_len <= 0):
		# 		break;
		# 	else:
		# 		unused_sams.append(sam_lines[i]);
		# 		overlap_len = check_overlap(sam_lines[candidate_read], sam_lines[i]);
		# 		if (overlap_len >= max_overlap_len):
		# 			max_overlap_len = overlap_len;
		# 			max_overlap_id = i;
		# 	i += 1;
		# contig_sams.append(sam_lines[candidate_read]);
		# # candidate_read = i;
		# if (max_overlap_id > 0):
		# 	candidate_read = max_overlap_id;
		# else:
		# 	break;

		# i += 1;

		print '  after coverage %d:' % (coverage);
		print '    len(sams_to_process) = %d' % (len(sams_to_process));
		print '    len(contig_sams) = %d' % len(contig_sams);
		print '    len(unused_sams) = %d' % len(unused_sams);


		[new_contig, non_clipped_len, new_contig_cigar] = construct_contig_from_overlapping_sams(ctg_seqs, contig_sams);

		test_sam_line = utility_sam.SAMLine();
		test_sam_line.seq = new_contig;
		test_sam_line.cigar = new_contig_cigar;

		print 'test_sam_line.CalcReadLengthFromCigar() = %d' % (test_sam_line.CalcReadLengthFromCigar());
		print 'test_sam_line.CalcReferenceLengthFromCigar() = %d' % (test_sam_line.CalcReferenceLengthFromCigar());
		print 'len(test_sam_line.seq) = %d' % (len(test_sam_line.seq));

		print '*********************    len(new_contig) = %d, non_clipped_len = %d' % (len(new_contig), non_clipped_len);
		exit(1);

		if (float(non_clipped_len) < 0.85*float(len(ctg_seqs[0]))):
			# print 'Tu sam!';
			# exit(1);
			continue;
		else:
			print '++++++++++++++++++++++++++++++++++++++++';


		fp_out_alt_ctg.write('>%s %d\n' % (ctg_headers[0], coverage));
		fp_out_alt_ctg.write('%s\n' % (new_contig));

		for sam_line in contig_sams:
			fp_out_alt_ctg_sams.write(sam_line.original_line + '\n');
		fp_out_alt_ctg_sams.write('\n');

	fp_out_alt_ctg_sams.close();
	fp_out_alt_ctg.close();