def main(fasta_filename, csv_filename):
    d = LazyFastaReader(fasta_filename)
    pCS, orphans = read_seq_csv(csv_filename)

    # detect PCR chimeras from orphans
    chimeras = detect_PCR_chimeras(orphans, d)
    orphans = orphans.difference(chimeras)

    FileIO.write_seqids_to_fasta(orphans, "preCluster_out.orphans.fasta", d)
    FileIO.write_seqids_to_fasta(chimeras, "preCluster_out.chimeras.fasta", d)


    infof = open('preCluster.cluster_info.csv', 'w')
    infof.write("cluster,size\n")
    # write out a directory per preCluster cid in preCluster_out/<cid>
    # Liz note: right now, write out even directories with just 1 sequence
    # (we know they have "tucked" support, so can run Partial/Arrow on it)
    #singlef = open("preCluster_out.singles.fasta", 'w')
    for cid in pCS.S:
    #    if pCS.S[cid].size == 1:
    #        r = d[pCS.S[cid].members[0]]
    #        singlef.write(">{0}\n{1}\n".format(r.id, r.seq))
    #    else:
        #print >> sys.stderr, "cid", cid
        if True:
            dirname = os.path.join("preCluster_out", str(cid))
            os.makedirs(dirname)
            file = os.path.join(dirname, 'isoseq_flnc.fasta')
            FileIO.write_seqids_to_fasta(pCS.S[cid].members, file, d)
        infof.write("{0},{1}\n".format(cid, len(pCS.S[cid].members)))
        #print cid, len(pCS.S[cid].members)
    #singlef.close()
    infof.close()
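A hedged usage sketch for the function above. The file names are assumptions inferred from the other examples in this listing (the CSV is taken to be the seqid,stat file that the pipeline in Example #5 below writes out); the original snippet does not specify them.

# Hypothetical invocation: rebuild the preCluster output directories
# from a previously written seqid,stat CSV.
main('isoseq_flnc.fasta', 'preCluster.output.csv')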
Example #2
def chunk_collected_fasta_pickle(combined_fasta, combined_uc, combined_refs,
                                 num_chunks, chunk_prefix):
    d = LazyFastaReader(combined_fasta,
                        seqid_extraction=lambda x: x.split('/')[0])
    # the seqids in combined_fasta are b0_c0/2/1776, need to make b0_c0 also key

    uc = combined_uc
    refs = combined_refs

    keys = sorted(uc.keys())
    n = len(keys) // num_chunks + 1
    for i in range(num_chunks):
        _from = i * n
        _to = min(len(keys), (i + 1) * n)
        with open("{0}.chunk{1}.consensus.fasta".format(chunk_prefix, i),
                  'w') as f:
            for seqid in keys[_from:_to]:
                r = d[seqid]
                f.write(">{0}\n{1}\n".format(r.id, r.seq))
        with open("{0}.chunk{1}.pickle".format(chunk_prefix, i), 'w') as f:
            dump(
                {
                    'uc': dict((k, uc[k]) for k in keys[_from:_to]),
                    'refs': dict((k, refs[k]) for k in keys[_from:_to])
                }, f)
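The seqid_extraction lambda above keys the lazy FASTA index by the cluster-ID prefix, so a record whose full FASTA ID is b0_c0/2/1776 can also be looked up as b0_c0, matching the keys used in uc and refs. Below is a minimal usage sketch; the file names and the assumption that combined_uc/combined_refs are plain dicts keyed by those cluster IDs are mine, not part of the original example.

from pickle import load

# hypothetical pickle holding {'uc': ..., 'refs': ...} from an earlier collection step
with open('combined.pickle', 'rb') as f:
    obj = load(f)
chunk_collected_fasta_pickle('combined.consensus.fasta', obj['uc'], obj['refs'],
                             num_chunks=4, chunk_prefix='combined')
# writes combined.chunk0.consensus.fasta / combined.chunk0.pickle ... combined.chunk3.*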
Example #3
def create_seed_n_batch_files(input='isoseq_flnc.fasta',
                              fasta_d=None,
                              seed_filename='seed0.fasta',
                              batch_pre='batch'):
    if fasta_d is None:
        fasta_d = LazyFastaReader(input)

    batch_files = []

    lens = [(r.id, len(r.seq)) for r in SeqIO.parse(open(input), 'fasta')]
    lens.sort(key=lambda x: x[1], reverse=True)

    n = len(lens)

    # start at 1% of the data
    starting_seed_index = n // 100
    good = [x[0] for x in lens[starting_seed_index:starting_seed_index + NUM_SEQS_PER_BATCH]]
    write_seqids_to_fasta(good, seed_filename, fasta_d)

    batch_index = 1
    starting_index = starting_seed_index + NUM_SEQS_PER_BATCH
    while starting_index < n:
        batch_file = "{0}{1}.fasta".format(batch_pre, batch_index)
        write_seqids_to_fasta([x[0] for x in lens[starting_index:starting_index+NUM_SEQS_PER_BATCH]],
                              batch_file, fasta_d)
        batch_files.append(batch_file)
        starting_index += NUM_SEQS_PER_BATCH
        batch_index += 1

    # the longest 1% (skipped when picking the seed) goes into the final batch
    batch_file = "{0}{1}.fasta".format(batch_pre, batch_index)
    write_seqids_to_fasta([x[0] for x in lens[:starting_seed_index]], batch_file, fasta_d)
    batch_files.append(batch_file)
    return batch_index + 1
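A hedged usage sketch for the splitter above, mirroring how Example #5 below calls it; NUM_SEQS_PER_BATCH is assumed to be a module-level constant (Example #10 below sets it to 200000).

d = LazyFastaReader('isoseq_flnc.fasta')
num_batches = create_seed_n_batch_files(input='isoseq_flnc.fasta', fasta_d=d,
                                        seed_filename='seed0.fasta', batch_pre='batch')
# writes seed0.fasta plus batch1.fasta ... batch{num_batches-1}.fasta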
Example #4
def write_select_seqs_to_fasta(fasta_filename,
                               seqids,
                               output_filename,
                               mode='w'):
    d = LazyFastaReader(fasta_filename)
    with open(output_filename, mode) as f:
        for x in seqids:
            r = d[x]
            f.write(">{0}\n{1}\n".format(r.id, r.seq))
Example #5
def main(cpus, dun_make_bins=False, dun_use_partial=False, num_seqs_per_batch=100000, dun_cleanup_files=False):
    print "Indexing isoseq_flnc.fasta using LazyFastaReader..."
    d = LazyFastaReader('isoseq_flnc.fasta')

    print "Splitting input isoseq_flnc.fasta into seed/batches..."
    num_batchs = create_seed_n_batch_files(input='isoseq_flnc.fasta', fasta_d=d, seed_filename='seed0.fasta', batch_pre='batch', num_seqs_per_batch=num_seqs_per_batch)


    # step1. run minimap of seed0 against itself and process
    o = ar.run_minimap('seed0.fasta', 'seed0.fasta', cpus=cpus)
    seqids = set([r.id for r in SeqIO.parse(open('seed0.fasta'),'fasta')])
    pCS, orphans = sp.process_self_align_into_seed(o, seqids, MiniReader, dun_use_partial=dun_use_partial)
    # keep stats
    size_S, size_tucked, size_orphans = len(pCS.S), sum(v == 'T' for v in pCS.seq_stat.values()), len(orphans)
    print("seed 0 initial: S {0}, tucked {1}, orphans {2}".format(size_S, size_tucked, size_orphans))

    # write out seed1.S.fasta and seed1.orphans.fasta
    FileIO.write_preClusterSet_to_fasta(pCS, 'seed1.S.fasta', d)
    FileIO.write_seqids_to_fasta(orphans, 'seed1.orphans.fasta', d)
    # step 2a. minimap batch1 against seed1.S and process

    for i in range(1, num_batchs):
        pCS, orphans = add_batch(i, pCS, orphans, d, cpus=cpus, dun_use_partial=dun_use_partial)
        cleanup_precluster_intermediate_files(i)

    # detect PCR chimeras from orphans
    chimeras = detect_PCR_chimeras(orphans, d)
    orphans = orphans.difference(chimeras)

    FileIO.write_seqids_to_fasta(orphans, "preCluster_out.orphans.fasta", d)
    FileIO.write_seqids_to_fasta(chimeras, "preCluster_out.chimeras.fasta", d)


    tucked_seqids = []
    # dump pCS, orphans, chimeras to a pickle
    # can't dump yet --- since pCS is an object
    #with open('preCluster.output.pickle', 'w') as f:
    #    dump({'pCS': pCS, 'chimeras': chimeras, 'orphans': orphans}, f)
    # write CSV file
    with open('preCluster.output.csv', 'w') as f:
        f.write("seqid,stat\n")
        for x, stat in pCS.seq_stat.items():
            if stat == 'T':
                f.write("{0},tucked\n".format(x))
                tucked_seqids.append(x)
            elif stat == 'M': f.write("{0},{1}\n".format(x, pCS.seq_map[x]))
        for x in orphans: f.write("{0},orphan\n".format(x))
        for x in chimeras: f.write("{0},chimera\n".format(x))

    # Liz: currently not using tucked...
    #FileIO.write_seqids_to_fasta(tucked_seqids, "preCluster_out.tucked.fasta", d)

    infof = open('preCluster.cluster_info.csv', 'w')
    infof.write("cluster,size\n")
    # write out a directory per preCluster cid in preCluster_out/<cid>
    # Liz note: right now, write out even directories with just 1 sequence
    # (we know they have "tucked" support, so can run Partial/Arrow on it)
    #singlef = open("preCluster_out.singles.fasta", 'w')
    for cid in pCS.S:
    #    if pCS.S[cid].size == 1:
    #        r = d[pCS.S[cid].members[0]]
    #        singlef.write(">{0}\n{1}\n".format(r.id, r.seq))
    #    else:
        if True:
            if not dun_make_bins:
                dirname = os.path.join("preCluster_out", str(cid))
                os.makedirs(dirname)
                file = os.path.join(dirname, 'isoseq_flnc.fasta')
                FileIO.write_seqids_to_fasta(pCS.S[cid].members, file, d)
            infof.write("{0},{1}\n".format(cid, len(pCS.S[cid].members)))
    #singlef.close()
    infof.close()

    if not dun_cleanup_files: # clean up all seed* and batch* files
        for file in glob.glob('batch*fasta*'):
            os.remove(file)
        for file in glob.glob('seed*fasta*'):
            os.remove(file)
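A hedged invocation sketch for the driver above; the argument values are purely illustrative.

# precluster with 12 CPUs and keep the intermediate seed*/batch* files around
main(cpus=12, dun_make_bins=False, dun_use_partial=False,
     num_seqs_per_batch=100000, dun_cleanup_files=True)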
Example #6
    # sanity checking of input files
    snps_files = []
    for line in open(snps_filename):
        filename = line.strip()
        if not filename.endswith('.snps'):
            print("Input files listed in {0} must end with .snps!".format(snps_filename),
                  file=sys.stderr)
            sys.exit(-1)
        if not os.path.exists(filename):
            print("{0} does not exist! Abort.".format(filename),
                  file=sys.stderr)
            sys.exit(-1)
        snps_files.append(filename)

    if not os.path.exists(genome_filename):
        print("Genome file {0} does not exist!".format(genome_filename),
              file=sys.stderr)
        sys.exit(-1)

    print("Reading genome file {0}....".format(genome_filename),
          file=sys.stderr)
    genome_d = LazyFastaReader(genome_filename)

    # quick check: if the genome chromosome IDs carry a |arrow|arrow-style suffix, also key them by the stripped name
    keys = list(genome_d.keys())
    for k in keys:
        k2 = k.split('|')[0]
        if k2 != k and k2 not in keys:
            genome_d.d[k2] = genome_d.d[k]
            print(
                "Detected | string in chromosome ID, stripping {0} to {1}....".
                format(k, k2),
                file=sys.stderr)
    print("Finished reading genome.", file=sys.stderr)

    for snp_file in snps_files:
        assert snp_file.endswith('.snps')
        vcf_file = snp_file[:-5] + '.vcf'
        print("Processing {0} --> {1}".format(snp_file, vcf_file), file=sys.stderr)
        write_snp_to_vcf(snp_file, vcf_file, genome_filename, genome_d)
Example #7
def pick_rep(fa_fq_filename,
             sam_filename,
             gff_filename,
             group_filename,
             output_filename,
             fusion_candidate_ranges,
             is_fq=False):
    """
    For each group, select the representative record
    Always pick the longest one!
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
        fout = open(output_filename, 'w')
    else:
        fd = LazyFastaReader(fa_fq_filename)
        fout = open(output_filename, 'w')

    rep_info = {}
    id_to_rep = {}
    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print("Picking representative sequence for", pb_id, file=sys.stdout)
        best_id = None
        best_seq = None
        max_len = 0
        for x in members.split(','):
            if len(fd[x].seq) >= max_len:
                best_id = x
                best_seq = fd[x].seq
                best_qual = fd[x].letter_annotations[
                    'phred_quality'] if is_fq else None
                max_len = len(fd[x].seq)
        rep_info[pb_id] = (best_id, best_seq, best_qual)
        id_to_rep[best_id] = pb_id

    f_gff = open(gff_filename, 'w')
    coords = {}
    record_storage = {}  # temporary storage for the .1 record to write in conjunction with the second record
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID in id_to_rep:
            pb_id = id_to_rep[r.qID]
            # make coordinates & write the SAM file
            isoform_index = get_isoform_index(fusion_candidate_ranges[r.qID],
                                              r.sID, r.sStart, r.sEnd)
            if r.qID not in coords:
                coords[r.qID] = [None] * len(fusion_candidate_ranges[r.qID])
                record_storage[pb_id] = [None] * len(
                    fusion_candidate_ranges[r.qID])
            coords[r.qID][isoform_index] = "{0}:{1}-{2}({3})".format(
                r.sID, r.sStart, r.sEnd, r.flag.strand)
            record_storage[pb_id][isoform_index] = r

    for pb_id, records in record_storage.items():
        for i, r in enumerate(records):
            isoform_index = i + 1
            f_gff.write("{chr}\tPacBio\ttranscript\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(\
                chr=r.sID, s=r.segments[0].start+1, e=r.segments[-1].end, pi=pb_id, j=isoform_index, strand=r.flag.strand))
            for s in r.segments:
                f_gff.write("{chr}\tPacBio\texon\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(\
                    chr=r.sID, s=s.start+1, e=s.end, pi=pb_id, j=isoform_index, strand=r.flag.strand))
    f_gff.close()

    for pb_id in rep_info:
        best_id, best_seq, best_qual = rep_info[pb_id]
        _id_ = "{0}|{1}|{2}".format(pb_id, "+".join(coords[best_id]), best_id)
        _seq_ = best_seq
        if is_fq:
            SeqIO.write(
                SeqRecord(_seq_,
                          id=_id_,
                          letter_annotations={'phred_quality': best_qual}),
                fout, 'fastq')
        else:
            SeqIO.write(SeqRecord(_seq_, id=_id_), fout, 'fasta')
Example #8
def pick_rep(fa_fq_filename,
             sam_filename,
             gff_filename,
             group_filename,
             output_filename,
             is_fq=False,
             pick_least_err_instead=False):
    """
    For each group, select the representative record

    If this is a FASTA file (is_fq False) -- then always pick the longest one
    If this is a FASTQ file (is_fq True) -- then
          If pick_least_err_instead is True, pick the one with the fewest expected base errors
          Else, pick the longest one
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
        fout = open(output_filename, 'w')
    else:
        fd = LazyFastaReader(fa_fq_filename)
        fout = open(output_filename, 'w')


#    for line in open(gff_filename):
#        # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PBfusion.1"; transcript_id "PBfusion.1.1";
#        raw = line.strip().split('\t')
#        if raw[2] == 'transcript':
#            # check if this is first or 2+ part of fusion
#            tid = raw[-1].split('; ')[1].split()[1][1:-2] # ex: tid = PBfusion.1.1
#            gid = tid[:tid.rfind('.')] # ex: gid = PBfusion.1
#            if tid.endswith('.1'):
#                coords[gid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])
#            else:
#                assert gid in coords
#                coords[gid] += "+{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

    rep_info = {}
    id_to_rep = {}
    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print >> sys.stderr, "Picking representative sequence for", pb_id
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members.split(','):
            if is_fq and pick_least_err_instead:
                # expected number of base errors: a Phred quality Q implies an
                # error probability of 10**(-Q/10); sum over all bases
                err = sum(10 ** -(i / 10.)
                          for i in fd[x].letter_annotations['phred_quality'])
            if (is_fq and pick_least_err_instead and err < best_err) or (
                (not is_fq or not pick_least_err_instead)
                    and len(fd[x].seq) >= max_len):
                best_id = x
                best_seq = fd[x].seq
                if is_fq:
                    best_qual = fd[x].letter_annotations['phred_quality']
                    best_err = err
                max_len = len(fd[x].seq)
        rep_info[pb_id] = (best_id, best_seq, best_qual)
        id_to_rep[best_id] = pb_id

    f_gff = open(gff_filename, 'w')
    coords = {}
    record_storage = {}  # temporary storage for the .1 record to write in conjunction with the second record
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID in id_to_rep:
            pb_id = id_to_rep[r.qID]
            best_id, best_seq, best_qual = rep_info[pb_id]

            # make coordinates & write the SAM file
            if r.qID not in coords:
                # this is the .1 portion
                coords[r.qID] = "{0}:{1}-{2}({3})".format(
                    r.sID, r.sStart, r.sEnd, r.flag.strand)
                isoform_index = 1
                record_storage[pb_id] = [r]
            else:
                # this is the .2 portion, or even .3, .4....! handle fusions with > 2 loci correctly
                coords[r.qID] += "+{0}:{1}-{2}({3})".format(
                    r.sID, r.sStart, r.sEnd, r.flag.strand)
                record_storage[pb_id].append(r)

    for pb_id, records in record_storage.items():
        for i, r in enumerate(records):
            isoform_index = i + 1
            f_gff.write("{chr}\tPacBio\ttranscript\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(\
                chr=r.sID, s=r.segments[0].start+1, e=r.segments[-1].end, pi=pb_id, j=isoform_index, strand=r.flag.strand))
            for s in r.segments:
                f_gff.write("{chr}\tPacBio\texon\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(\
                    chr=r.sID, s=s.start+1, e=s.end, pi=pb_id, j=isoform_index, strand=r.flag.strand))
    f_gff.close()

    for pb_id in rep_info:
        best_id, best_seq, best_qual = rep_info[pb_id]
        _id_ = "{0}|{1}|{2}".format(pb_id, coords[best_id], best_id)
        _seq_ = best_seq
        if is_fq:
            SeqIO.write(
                SeqRecord(_seq_,
                          id=_id_,
                          letter_annotations={'phred_quality': best_qual}),
                fout, 'fastq')
        else:
            SeqIO.write(SeqRecord(_seq_, id=_id_), fout, 'fasta')
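The pick_least_err_instead criterion above uses the standard Phred relationship: a quality Q corresponds to an error probability of 10**(-Q/10), so the expected number of base errors in a read is the sum of those probabilities. A minimal standalone sketch of that computation (the function name is illustrative, not part of the original code):

def expected_errors(phred_qualities):
    # each Phred quality Q implies an error probability of 10**(-Q/10);
    # summing over all bases gives the expected number of base errors
    return sum(10 ** (-q / 10.0) for q in phred_qualities)

# e.g. 30 bases at Q30 -> ~0.03 expected errors
print(expected_errors([30] * 30))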
Example #9
def write_snp_to_vcf(snp_filename, vcf_filename, genome_filename, genome_d=None):
    # read the genome if genome_d is not given
    if genome_d is None:
        genome_d = LazyFastaReader(genome_filename)

    # read the first SNP record so we know the query name
    snp_reader = SNPReader(snp_filename)
    snp_rec = next(snp_reader)
    sample_name = snp_rec.query_name
    cur_recs = [snp_rec]
    genome_rec = genome_d[snp_rec.ref_name]

    with open('template.vcf', 'w') as f:
        f.write(__VCF_EXAMPLE__ + '\n')
    reader = vcf.VCFReader(open('template.vcf'))
    reader.samples = [sample_name]
    f_vcf = vcf.Writer(open(vcf_filename, 'w'), reader)

    for r1 in snp_reader:
        if r1.ref_pos == cur_recs[-1].ref_pos:  # multi-nt insertion, keep recording
            cur_recs.append(r1)
        elif r1.query_base == '.' and cur_recs[-1].query_base == '.': # multi-nt deletion, keep recording
            cur_recs.append(r1)
        else: # time to write out the current set of records
            # multiple records mean it could be:
            # 1. multi-nucleotide insertions
            # 2. multi-nucleotide deletions

            if len(cur_recs) == 1 and cur_recs[0].ref_base!='.' and cur_recs[0].query_base!='.': # just a SNP record
                pos = cur_recs[0].ref_pos
                ref_base = cur_recs[0].ref_base
                alt_base = cur_recs[0].query_base
            elif cur_recs[0].ref_base == '.':
                # a single- or multi-nt insertion; must retrieve the ref base from the genome
                # ex: in the .snps file it is . --> ATG
                # in VCF it should be T --> TATG (meaning insertion of ATG)
                pos = cur_recs[0].ref_pos
                ref_base = genome_rec[cur_recs[0].ref_pos]
                alt_base = ref_base + "".join(r.query_base for r in cur_recs)
            else:
                # a single- or multi-nt deletion; need one more ref base before the first deleted base
                # ex: in the .snps file it is GGG --> deletion
                # in VCF it should be TGGG --> T (meaning deletion of GGG)
                pos = cur_recs[0].ref_pos-1
                ref_base_prev = genome_rec[pos]
                ref_base = ref_base_prev + "".join(r.ref_base for r in cur_recs)
                alt_base = ref_base_prev

            rec = vcf.model._Record(CHROM=cur_recs[0].ref_name,
                                POS=pos+1,
                                ID='.',
                                REF=ref_base,
                                ALT=[vcf.model._Substitution(alt_base)],
                                QUAL='.', FILTER='PASS',
                                INFO={'AF':0.5},
                                FORMAT="GT",
                                sample_indexes=None)
            samp_ft = vcf.model.make_calldata_tuple(['GT'])
            rec.samples.append(vcf.model._Call(rec, sample_name, samp_ft(*["0|1"])))
            f_vcf.write_record(rec)
            if r1.ref_name != cur_recs[0].ref_name:
                genome_rec = genome_d[r1.ref_name]
            cur_recs = [r1]
    f_vcf.close()
Example #10
__author__ = 'lachesis'

import os, sys
from Bio import SeqIO
from cupcake.io.SeqReaders import LazyFastaReader
from cupcake2.io.FileIO import write_seqids_to_fasta

input = 'isoseq_flnc.fasta'

NUM_SEQS_PER_BATCH = 200000

d = LazyFastaReader(input)

lens = [(r.id, len(r.seq)) for r in SeqIO.parse(open(input), 'fasta')]
lens.sort(key=lambda x: x[1], reverse=True)

n = len(lens)

# start at 1% of the data
starting_seed_index = n // 100
good = [x[0] for x in lens[starting_seed_index:starting_seed_index+NUM_SEQS_PER_BATCH]]
write_seqids_to_fasta(good, 'seed0.fasta', d)

batch_index = 1
starting_index = starting_seed_index+NUM_SEQS_PER_BATCH
while starting_index < n:
    write_seqids_to_fasta([x[0] for x in lens[starting_index:starting_index+NUM_SEQS_PER_BATCH]], \
                          "batch{0}.fasta".format(batch_index), d)
    starting_index += NUM_SEQS_PER_BATCH
    batch_index += 1