Example No. 1
def test_filter_by_size() -> None:
    """Tests the function filter_by_size. Run with pytest"""
    valid_seq = SeqRecord(seq="ACTGCTG", id="valid")
    invalid_seq = SeqRecord(seq="ACTG", id="invalid")
    min_size = 5

    assert filter_by_size(valid_seq, min_size) is True
    assert filter_by_size(invalid_seq, min_size) is False
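The function under test is not shown in this excerpt; a minimal sketch inferred from the assertions (hypothetical implementation, not the original):

from Bio.SeqRecord import SeqRecord

def filter_by_size(record: SeqRecord, min_size: int) -> bool:
    """Return True if the record's sequence has at least min_size bases."""
    return len(record.seq) >= min_size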
Example No. 2
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def write_fasta(filename, data):
    """Write a mapping of id -> sequence string to `filename` in FASTA format."""
    seq_list = []
    for seq_id in data.keys():
        seq_list.append(SeqRecord(Seq(data.get(seq_id)), id=seq_id, description=""))
    with open(filename, "w") as fd:
        SeqIO.write(seq_list, fd, "fasta")
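Since `data` only needs to support keys() and get(), a plain dict works; a hypothetical call:

write_fasta("out.fasta", {"seq1": "ACTGACTG", "seq2": "GGGTTTAA"})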
Example No. 3
    def unite_aligns(dirname,
                     outfile,
                     full_length,
                     ignore_gaps=False,
                     ignore_level=0.9):
        """
        Uniting consensuses of aligning from directory into one fasta file.

        :param dirname: directory name with aligns
        :param outfile: out filename
        :param full_length: boolean, True or False. If True the confidence level will be done using all length of
        sequences. For False parameter calculating will be done without 'full gap' ends of sequences
        :param ignore_gaps: boolean, True or False. Ignoring gaps with high level of confidence
        (with confidence >= ignore_level)
        :param ignore_level: float, level of ignoring gaps
        """
        filenames = Extractor.extract_filenames(dirname)
        records = []
        for file in filenames:
            align = AlignController.__get_alignment_from(file)
            consensus = align.get_consensus(full_length=full_length)
            str_cons = consensus.get_str_consensus(ignore_gaps=ignore_gaps,
                                                   ignore_level=ignore_level)
            name = f"{basename(file).split('.')[0]}:"
            description = f"consensus sequence with parameters: full_length={full_length}, ignore_gaps={ignore_gaps}, " \
                          f"gnore_level={ignore_level}"
            records.append(
                SeqRecord(Seq(str_cons),
                          id=name,
                          name=name,
                          description=description))
        RecordsWriter(records).write_to(outfile)
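The missing self argument suggests unite_aligns is a @staticmethod of the class it is defined in (presumably AlignController, whose private __get_alignment_from it calls). A hypothetical invocation under that assumption:

AlignController.unite_aligns(dirname="aligns/",
                             outfile="consensuses.fasta",
                             full_length=True,
                             ignore_gaps=True,
                             ignore_level=0.9)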
Example No. 4
def write_fastq_handle(handle, data):
    """Write sequences and their solexa qualities from `data` to an open handle in FASTQ format."""
    seq_list = []
    for seq_id in data.keys():
        seq_list.append(
            SeqRecord(Seq(data.get(seq_id)),
                      id=seq_id,
                      description="",
                      letter_annotations={'solexa_quality': data.getqual(seq_id)}))
    SeqIO.write(seq_list, handle, "fastq")
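The `data` argument comes from the surrounding project; a hypothetical stand-in showing the interface these writer functions rely on (keys(), get() for sequences, getqual() for per-base qualities):

class SeqData:
    def __init__(self):
        self.seqs = {}   # id -> sequence string
        self.quals = {}  # id -> list of per-base quality scores

    def keys(self):
        return self.seqs.keys()

    def get(self, seq_id):
        return self.seqs[seq_id]

    def getqual(self, seq_id):
        return self.quals[seq_id]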
Example No. 5
    def __getitem__(self, k):
        if k not in self.d:
            raise Exception("key {0} not in dictionary!".format(k))
        self.f.seek(self.d[k])
        content = ''
        for line in self.f:
            if line.startswith('>'):
                break
            content += line.strip()
        return SeqRecord(seq=Seq(content), id=k)
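The offset dictionary self.d is built elsewhere in the class; a minimal sketch of how such a FASTA index might be constructed (hypothetical helper, assuming one id per '>' header):

def build_fasta_index(f):
    """Map record id -> file offset of the first sequence line after its header."""
    index = {}
    f.seek(0)
    line = f.readline()
    while line:
        if line.startswith('>'):
            rec_id = line[1:].split()[0]
            index[rec_id] = f.tell()  # offset of the line following the header
        line = f.readline()
    return index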
Example No. 6
    def __getitem__(self, k):
        if k not in self.d:
            raise Exception("key {0} not in dictionary!".format(k))
        self.f.seek(self.d[k])

        sequence = self.f.readline().strip()
        assert self.f.readline().startswith('+')
        qualstr = self.f.readline().strip()
        # decode the ASCII-encoded phred qualities (offset 33)
        return SeqRecord(seq=Seq(sequence), id=k,
                         letter_annotations={'phred_quality': [ord(x) - 33 for x in qualstr]})
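For reference, FASTQ phred qualities are stored as printable ASCII characters offset by 33, which is what ord(x) - 33 undoes:

qualstr = "II5"
print([ord(x) - 33 for x in qualstr])  # [40, 40, 20]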
Example No. 7
def write_fastq(filename, data):
    """Write sequences and their solexa qualities from `data` to `filename` in FASTQ format."""
    seq_list = []
    for seq_id in data.keys():
        seq_list.append(
            SeqRecord(Seq(data.get(seq_id)),
                      id=seq_id,
                      description="",
                      letter_annotations={'solexa_quality': data.getqual(seq_id)}))
    with open(filename, "w") as fd:
        SeqIO.write(seq_list, fd, "fastq")
Example No. 8
def select_positive_and_negative_reads_with_bowtie(reads, vntr_finder, label):
    working_dir = bowtie_working_dir + '/%s/' % vntr_finder.reference_vntr.id
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
    fq_file = working_dir + label + '.fa'  # written as FASTA, despite the name

    records = []
    for i, read in enumerate(reads):
        records.append(SeqRecord(Seq.Seq(read), id='read_%s/1' % i))
    with open(fq_file, 'w') as output_handle:
        SeqIO.write(records, output_handle, 'fasta')

    passed_time = time()
    bowtie_bamfile = align_with_bowtie(fq_file)
    bowtie_selected = len(
        get_id_of_reads_mapped_to_vntr_in_bamfile(bowtie_bamfile,
                                                  vntr_finder.reference_vntr))
    return float(bowtie_selected), float(len(reads) -
                                         bowtie_selected), time() - passed_time
Example No. 9
def test_seq_with_xgroups() -> None:
    """Test if the module masked_seqs_stats can successfully find the groups of masked
    vector subsequences (xgroups) in a sequence that contains one of them
    """

    seq = SeqRecord(
        seq=
        ("AAGCXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXCGA"
         "GACGGCCGCCCGGGCAGGTACACCCAAGGATTTAATCGTCAAACCATGACGGGTCTCGAAAATCGAAA"
         "CGGACAACATGACAAGGAAATGGGCCCGATGATGAACGAAGTCACCAGACCGAGATACATCAGGGACG"
         "ATAAGAATGCCAAAATTATCGACACATCGGTGGAAAC"),
        id="DT319107.1",
        name="DT319107.1",
    )
    seq_class, seq_xgroups = masked_seqs_stats.find_x_regions_and_calculate_stats(
        seq)

    # Check the number of xgroups
    assert len(seq_xgroups) == 1

    # Check the length of the xgroup
    pattern = re.compile(r"X+")
    substring = pattern.search(str(seq.seq))

    assert len(substring.group(0)) == seq_xgroups[0].xgroup_len

    # Check the xgroup distance to 5'
    first = seq.seq.find("X")

    assert first == seq_xgroups[0].dist_from_5

    # Check the xgroup distance to 3'
    last = seq.seq.rfind("X")
    dist = len(seq.seq) - last - 1

    assert dist == seq_xgroups[0].dist_from_3

    # Check the seq class
    assert seq_class == 3
Example No. 10
def test_seq_without_xgroups() -> None:
    """Test if the module masked_seqs_stats can successfully take a sequence that has
    no vector masked subsequences and produce the expected empty output
    """

    seq = SeqRecord(
        seq=("ACCTATAGGTTGTCGTCGACAAAGAAATGAATCAACTTCCTCTGGTGGTT"
             "CATGGCAAATGATATCTGGAACTGGTAGTTTACGTGGTTCAACAACAGCC"
             "CACACATCTATTACAGAGGGATCTAATTCTTCTGGCTCGACTAGCAAAGG"
             "TTTATTTGAAAATTTTTTACATCAAGCTCATGGATCTAGTAAAGCAATAT"
             "TGGAAGATGACGAATCCGTATCACAAGTACCTGCCCGGGCGGCCGCTCGA"
             "AAGCCG"),
        id="DT319104.1",
        name="DT319104.1",
    )
    seq_class, seq_xgroups = masked_seqs_stats.find_x_regions_and_calculate_stats(
        seq)

    # Check if it has no xgroups
    assert len(seq_xgroups) == 0

    # Check if the seq class is invalid (i.e., 0)
    assert seq_class == 0
Example No. 11
def write_fasta_handle(handle, data):
    """Write a mapping of id -> sequence string to an open handle in FASTA format."""
    seq_list = []
    for seq_id in data.keys():
        seq_list.append(SeqRecord(Seq(data.get(seq_id)), id=seq_id, description=""))
    SeqIO.write(seq_list, handle, "fasta")
Example No. 12
def impute_ancestors_dnapars(seqs,
                             gl_seq,
                             scratch_dir,
                             gl_name='germline',
                             verbose=True):
    """
    Compute ancestral states via maximum parsimony

    @param seqs: list of sequences
    @param gl_seq: germline sequence
    @param scratch_dir: where to write intermediate dnapars files
    @param gl_name: name of germline (must be less than 10 characters long)

    @return genes_line: information needed to output imputed germline data
    @return seqs_line: information needed to output imputed sequence data
    """
    from gctree.bin.phylip_parse import parse_outfile

    assert (len(gl_name) < 10)

    infile, config, outfile = [
        os.path.join(scratch_dir, fname) for fname in [
            'infile',
            'dnapars.cfg',
            'outfile',
        ]
    ]

    aln = MultipleSeqAlignment([SeqRecord(Seq(gl_seq), id=gl_name)])

    # sequence ID must be less than ten characters, but also dnapars sets internal node
    # names to 1, 2, 3, ..., so name them numbers descending from 100 million, hoping
    # we won't ever have a clone that big...
    for idx, seq in enumerate(seqs):
        aln.append(SeqRecord(Seq(seq), id=str(99999999 - idx)))

    # dnapars uses the name "infile" as default input phylip file
    with open(infile, 'w') as phylip_file:
        phylip_file.write(aln.format('phylip'))

    # and we need to tell it the line where the root sequence occurs
    with open(infile, 'r') as phylip_file:
        for lineno, line in enumerate(phylip_file):
            if line.startswith(gl_name):
                naive_idx = str(lineno)

    # arcane user options for dnapars
    # 'O', naive_idx: the location of the outgroup root
    # 'S', 'Y': less thorough search; runs much faster but output is less exhaustive
    # 'J', 13, 10: randomize input ("jumble") using seed 13 and jumbling 10 times
    # 4: print out steps in each site (to get all nucleotide info)
    # 5: print sequences in at all nodes (to get ancestors)
    # '.': use dot-differencing for display
    # 'Y': accept these options
    with open(config, 'w') as cfg_file:
        cfg_file.write('\n'.join(
            ['O', naive_idx, 'S', 'Y', 'J', '13', '10', '4', '5', '.', 'Y']))

    # defer to command line to construct parsimony trees and ancestral states
    # dnapars has weird behavior if outfile and outtree already exist o_O
    cmd = [
        'cd', scratch_dir, '&& rm -f outfile outtree && dnapars <',
        os.path.basename(config), '> dnapars.log'
    ]
    if verbose:
        print "Calling:", " ".join(cmd)
    res = subprocess.call([" ".join(cmd)], shell=True)

    # phew, finally got some trees
    trees = parse_outfile(outfile, countfile=None, naive=gl_name)

    # take first parsimony tree
    genes_line = []
    seq_line = []
    for idx, descendant in enumerate(trees[0].traverse('preorder')):
        if descendant.is_root():
            descendant.name = gl_name
        else:
            # use dummy name for internal node sequences
            descendant.name = '-'.join([descendant.up.name, descendant.name])
            if [descendant.up.name,
                    descendant.up.sequence.lower()] not in genes_line:
                genes_line.append(
                    [descendant.up.name,
                     descendant.up.sequence.lower()])
            seq_line.append([
                descendant.up.name, descendant.name,
                descendant.sequence.lower()
            ])

    return genes_line, seq_line
Example No. 13
# Get protein sequence (`structure` is assumed to be a Bio.PDB Structure parsed
# earlier; pairwise2 and Bio.SubsMat are legacy Biopython APIs)
import Bio.PDB
from Bio import pairwise2
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SubsMat import MatrixInfo as matlist

ppb = Bio.PDB.PPBuilder()
polypeptides = ppb.build_peptides(structure)
seq1 = polypeptides[0].get_sequence()
seq2 = polypeptides[1].get_sequence()

matrix = matlist.blosum62
gap_open = -10
gap_extend = -0.5

alns = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend)
top_aln = alns[0]

alignment = MultipleSeqAlignment(
    [SeqRecord(Seq(top_aln[0])),
     SeqRecord(Seq(top_aln[1]))])
structure_alignment = Bio.PDB.StructureAlignment(alignment, structure[0]['A'],
                                                 structure[0]['B'])

sup = Bio.PDB.Superimposer()
ref_atoms = []
mov_atoms = []
for duo in structure_alignment.duos:
    res1 = duo[0]
    res2 = duo[1]
    if res1 and res2:
        ref_atoms.append(res1['CA'])
        mov_atoms.append(res2['CA'])

sup.set_atoms(ref_atoms, mov_atoms)
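A typical follow-up (not part of the original excerpt) reads the RMSD of the fit and applies the rotation/translation to the moving chain:

print("RMSD: %.3f" % sup.rms)
sup.apply(structure[0]['B'].get_atoms())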
Example No. 14
def get_hmm_accuracy(vntr_finder, simulated_true_reads,
                     simulated_false_filtered_reads):
    output_dir = result_dir + '/%s/' % vntr_finder.reference_vntr.id

    print('running BLAST')
    from blast_wrapper import get_blast_matched_ids, make_blast_database
    blast_dir = output_dir + 'blast_dir/'
    if not os.path.exists(blast_dir):
        os.makedirs(blast_dir)
    vntr_id = vntr_finder.reference_vntr.id
    fasta_file = blast_dir + 'reads.fasta'
    records = []
    for i, read in enumerate(simulated_false_filtered_reads):
        # ids must start with 'false' so the startswith('false') check below matches
        records.append(SeqRecord(seq=Seq.Seq(read), id='false_%s' % i))
    for i, read in enumerate(simulated_true_reads):
        records.append(SeqRecord(seq=Seq.Seq(read), id='true_%s' % i))
    with open(fasta_file, 'w') as output_handle:
        SeqIO.write(records, output_handle, 'fasta')

    make_blast_database(fasta_file, blast_dir + 'blast_db_%s' % vntr_id)
    query = '@'.join(get_blast_keywords(vntr_finder.reference_vntr))
    search_id = 'search_id'
    search_results = get_blast_matched_ids(query,
                                           blast_dir + 'blast_db_%s' % vntr_id,
                                           max_seq='100000',
                                           word_size='7',
                                           evalue=sys.maxsize,
                                           search_id=search_id,
                                           identity_cutoff='100',
                                           blast_tmp_dir=blast_dir)
    from collections import Counter
    res = Counter(search_results)
    filtered = [item for item, occur in res.items() if occur >= 2]
    print('BLAST results computed')

    print(len(filtered))
    print(len(simulated_true_reads))
    print(len(simulated_false_filtered_reads))
    tp = float(len([e for e in filtered if e.startswith('true')]))
    fp = float(len([e for e in filtered if e.startswith('false')]))
    fn = float(len(simulated_true_reads) - tp)
    tn = float(len(simulated_false_filtered_reads) - fp)
    train_time = 0
    passed_time = 0

    precision = tp / (tp + fp) if tp > 0 else 0
    recall = tp / (tp + fn)
    accuracy = (100 * (tp + tn) / (fp + fn + tp + tn))
    print('BLAST:')
    print(tp, fp, fn, tn)
    print('Precision:', precision)
    print('Recall:', recall)
    print('acc: %s' % accuracy)

    with open(output_dir + '/blast.txt', 'w') as outfile:
        outfile.write('%s\n' % train_time)
        outfile.write('%s\n' % passed_time)
        outfile.write('%s\n' % precision)
        outfile.write('%s\n' % recall)
        outfile.write('%s\n' % accuracy)
        outfile.write('%s,%s,%s,%s\n' % (tp, fn, fp, tn))
    return passed_time
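    # NOTE: the `return passed_time` above makes everything from here on
    # unreachable; it appears to be leftover HMM-evaluation code.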

    output_dir = result_dir + '/%s/' % vntr_finder.reference_vntr.id
    if os.path.exists(output_dir +
                      '/hmm.txt') and os.path.getsize(output_dir +
                                                      '/hmm.txt') > 0:
        if sum(1 for _ in open(output_dir + 'hmm.txt')) > 5:
            print('HMM info is already calculated')
            with open(output_dir + 'hmm.txt') as infile:
                lines = infile.readlines()
                return float(lines[1])

    train_true_reads = [
        read for i, read in enumerate(simulated_true_reads) if i % 2 == 0
    ]
    train_false_reads = [
        read for i, read in enumerate(simulated_false_filtered_reads)
        if i % 2 == 0
    ]
    test_true_reads = [
        read for i, read in enumerate(simulated_true_reads) if i % 2 == 1
    ]
    test_false_reads = [
        read for i, read in enumerate(simulated_false_filtered_reads)
        if i % 2 == 1
    ]

    start_time = time()
    hmm = vntr_finder.get_vntr_matcher_hmm(read_length=read_length)

    processed_true_reads = vntr_finder.find_hmm_score_of_simulated_reads(
        hmm, train_true_reads)
    processed_false_reads = vntr_finder.find_hmm_score_of_simulated_reads(
        hmm, train_false_reads)

    recruitment_score = vntr_finder.find_recruitment_score_threshold(
        processed_true_reads, processed_false_reads)
    train_time = time() - start_time
    print('HMM train time: %s' % train_time)

    tp = 0.0
    fn = 0.0
    tn = 0.0
    fp = 0.0
    start_time = time()
    true_reads = vntr_finder.find_hmm_score_of_simulated_reads(
        hmm, test_true_reads)
    false_reads = vntr_finder.find_hmm_score_of_simulated_reads(
        hmm, test_false_reads)
    passed_time = time() - start_time
    for read in true_reads:
        if read.logp > recruitment_score:
            tp += 1
        else:
            fn += 1
    for read in false_reads:
        if read.logp > recruitment_score:
            fp += 1
        else:
            tn += 1
    precision = tp / (tp + fp) if tp > 0 else 0
    recall = tp / (tp + fn)
    accuracy = (100 * (tp + tn) / (fp + fn + tp + tn))
    print('HMM: %s' % passed_time)
    print(tp, fp, fn, tn)
    print('Precision:', precision)
    print('Recall:', recall)
    print('acc: %s' % accuracy)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_dir + '/hmm.txt', 'w') as outfile:
        outfile.write('%s\n' % train_time)
        outfile.write('%s\n' % passed_time)
        outfile.write('%s\n' % precision)
        outfile.write('%s\n' % recall)
        outfile.write('%s\n' % accuracy)
        outfile.write('%s,%s,%s,%s\n' % (tp, fn, fp, tn))
    return passed_time
Example No. 15
def pick_rep(
    fa_fq_filename,
    sam_filename,
    gff_filename,
    group_filename,
    output_filename,
    fusion_candidate_ranges,
    is_fq=False,
):
    """
    For each group, select the representative record
    Always pick the longest one!
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
    fout = open(output_filename, "w")

    rep_info = {}
    id_to_rep = {}
    for line in open(group_filename):
        pb_id, members = line.strip().split("\t")
        logger.info(f"Picking representative sequence for {pb_id}")
        best_id = None
        best_seq = None
        best_qual = None
        max_len = 0
        for x in members.split(","):
            if len(fd[x].seq) >= max_len:
                best_id = x
                best_seq = fd[x].seq
                best_qual = fd[x].letter_annotations[
                    "phred_quality"] if is_fq else None
                max_len = len(fd[x].seq)
        rep_info[pb_id] = (best_id, best_seq, best_qual)
        id_to_rep[best_id] = pb_id

    f_gff = open(gff_filename, "w")
    coords = {}
    # temporary storage for the .1 record, written in conjunction with the second record
    record_storage = {}
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID in id_to_rep:
            pb_id = id_to_rep[r.qID]
            # make coordinates & write the SAM file
            isoform_index = get_isoform_index(fusion_candidate_ranges[r.qID],
                                              r.sID, r.sStart, r.sEnd)
            if r.qID not in coords:
                coords[r.qID] = [None] * len(fusion_candidate_ranges[r.qID])
                record_storage[pb_id] = [None] * len(
                    fusion_candidate_ranges[r.qID])
            coords[r.qID][
                isoform_index] = f"{r.sID}:{r.sStart}-{r.sEnd}({r.flag.strand})"
            record_storage[pb_id][isoform_index] = r

    for pb_id, records in record_storage.items():
        for i, r in enumerate(records):
            isoform_index = i + 1
            f_gff.write(
                f"{r.sID}\tPacBio\ttranscript\t{r.segments[0].start + 1}\t"
                f"{r.segments[-1].end}\t.\t{r.flag.strand}\t.\t"
                f'gene_id "{pb_id}"; '
                f'transcript_id "{pb_id}.{isoform_index}";\n')
            for s in r.segments:
                f_gff.write(f"{r.sID}\tPacBio\texon\t"
                            f"{s.start+1}\t{s.end}\t.\t{r.flag.strand}\t.\t"
                            f'gene_id "{pb_id}"; '
                            f'transcript_id "{pb_id}.{isoform_index}";\n')
    f_gff.close()

    for pb_id in rep_info:
        best_id, best_seq, best_qual = rep_info[pb_id]
        _id_ = f"{pb_id}|{'+'.join(coords[best_id])}|{best_id}"
        _seq_ = best_seq
        if is_fq:
            SeqIO.write(
                SeqRecord(_seq_,
                          id=_id_,
                          letter_annotations={"phred_quality": best_qual}),
                fout,
                "fastq",
            )
        else:
            SeqIO.write(SeqRecord(_seq_, id=_id_), fout, "fasta")

    fout.close()
Example No. 16
def pick_rep(fa_fq_filename,
             sam_filename,
             gff_filename,
             group_filename,
             output_filename,
             fusion_candidate_ranges,
             is_fq=False):
    """
    For each group, select the representative record
    Always pick the longest one!
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
    fout = open(output_filename, 'w')

    rep_info = {}
    id_to_rep = {}
    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print("Picking representative sequence for", pb_id, file=sys.stdout)
        best_id = None
        best_seq = None
        best_qual = None
        max_len = 0
        for x in members.split(','):
            if len(fd[x].seq) >= max_len:
                best_id = x
                best_seq = fd[x].seq
                best_qual = fd[x].letter_annotations[
                    'phred_quality'] if is_fq else None
                max_len = len(fd[x].seq)
        rep_info[pb_id] = (best_id, best_seq, best_qual)
        id_to_rep[best_id] = pb_id

    f_gff = open(gff_filename, 'w')
    coords = {}
    # temporary storage for the .1 record, written in conjunction with the second record
    record_storage = {}
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID in id_to_rep:
            pb_id = id_to_rep[r.qID]
            # make coordinates & write the SAM file
            isoform_index = get_isoform_index(fusion_candidate_ranges[r.qID],
                                              r.sID, r.sStart, r.sEnd)
            if r.qID not in coords:
                coords[r.qID] = [None] * len(fusion_candidate_ranges[r.qID])
                record_storage[pb_id] = [None] * len(
                    fusion_candidate_ranges[r.qID])
            coords[r.qID][isoform_index] = "{0}:{1}-{2}({3})".format(
                r.sID, r.sStart, r.sEnd, r.flag.strand)
            record_storage[pb_id][isoform_index] = r

    for pb_id, records in record_storage.items():
        for i, r in enumerate(records):
            isoform_index = i + 1
            f_gff.write("{chr}\tPacBio\ttranscript\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(\
                chr=r.sID, s=r.segments[0].start+1, e=r.segments[-1].end, pi=pb_id, j=isoform_index, strand=r.flag.strand))
            for s in r.segments:
                f_gff.write("{chr}\tPacBio\texon\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(\
                    chr=r.sID, s=s.start+1, e=s.end, pi=pb_id, j=isoform_index, strand=r.flag.strand))
    f_gff.close()

    for pb_id in rep_info:
        best_id, best_seq, best_qual = rep_info[pb_id]
        _id_ = "{0}|{1}|{2}".format(pb_id, "+".join(coords[best_id]), best_id)
        _seq_ = best_seq
        if is_fq:
            SeqIO.write(
                SeqRecord(_seq_,
                          id=_id_,
                          letter_annotations={'phred_quality': best_qual}),
                fout, 'fastq')
        else:
            SeqIO.write(SeqRecord(_seq_, id=_id_), fout, 'fasta')

    fout.close()
Example No. 17
def pick_rep(fa_fq_filename,
             sam_filename,
             gff_filename,
             group_filename,
             output_filename,
             is_fq=False,
             pick_least_err_instead=False):
    """
    For each group, select the representative record

    If FASTA input (is_fq False) -- always pick the longest one
    If FASTQ input (is_fq True) -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
    fout = open(output_filename, 'w')


#    for line in open(gff_filename):
#        # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PBfusion.1"; transcript_id "PBfusion.1.1";
#        raw = line.strip().split('\t')
#        if raw[2] == 'transcript':
#            # check if this is first or 2+ part of fusion
#            tid = raw[-1].split('; ')[1].split()[1][1:-2] # ex: tid = PBfusion.1.1
#            gid = tid[:tid.rfind('.')] # ex: gid = PBfusion.1
#            if tid.endswith('.1'):
#                coords[gid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])
#            else:
#                assert gid in coords
#                coords[gid] += "+{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

    rep_info = {}
    id_to_rep = {}
    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print("Picking representative sequence for", pb_id, file=sys.stderr)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members.split(','):
            if is_fq and pick_least_err_instead:
                # expected number of base errors: the error probability for
                # phred quality q is 10 ** -(q / 10)
                err = sum(10 ** -(q / 10.)
                          for q in fd[x].letter_annotations['phred_quality'])
            if (is_fq and pick_least_err_instead and err < best_err) or (
                (not is_fq or not pick_least_err_instead)
                    and len(fd[x].seq) >= max_len):
                best_id = x
                best_seq = fd[x].seq
                if is_fq:
                    best_qual = fd[x].letter_annotations['phred_quality']
                    best_err = err
                max_len = len(fd[x].seq)
        rep_info[pb_id] = (best_id, best_seq, best_qual)
        id_to_rep[best_id] = pb_id

    f_gff = open(gff_filename, 'w')
    coords = {}
    # temporary storage for the .1 record, written in conjunction with the second record
    record_storage = {}
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID in id_to_rep:
            pb_id = id_to_rep[r.qID]
            best_id, best_seq, best_qual = rep_info[pb_id]

            # make coordinates & write the SAM file
            if r.qID not in coords:
                # this is the .1 portion
                coords[r.qID] = "{0}:{1}-{2}({3})".format(
                    r.sID, r.sStart, r.sEnd, r.flag.strand)
                isoform_index = 1
                record_storage[pb_id] = [r]
            else:
                # this is the .2 portion, or even .3, .4....! handle fusions with > 2 loci correctly
                coords[r.qID] += "+{0}:{1}-{2}({3})".format(
                    r.sID, r.sStart, r.sEnd, r.flag.strand)
                record_storage[pb_id].append(r)

    for pb_id, records in record_storage.items():
        for i, r in enumerate(records):
            isoform_index = i + 1
            f_gff.write(
                "{chr}\tPacBio\ttranscript\t{s}\t{e}\t.\t{strand}\t.\t"
                "gene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(
                    chr=r.sID, s=r.segments[0].start + 1,
                    e=r.segments[-1].end, pi=pb_id, j=isoform_index,
                    strand=r.flag.strand))
            for s in r.segments:
                f_gff.write(
                    "{chr}\tPacBio\texon\t{s}\t{e}\t.\t{strand}\t.\t"
                    "gene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(
                        chr=r.sID, s=s.start + 1, e=s.end, pi=pb_id,
                        j=isoform_index, strand=r.flag.strand))
    f_gff.close()

    for pb_id in rep_info:
        best_id, best_seq, best_qual = rep_info[pb_id]
        _id_ = "{0}|{1}|{2}".format(pb_id, coords[best_id], best_id)
        _seq_ = best_seq
        if is_fq:
            SeqIO.write(
                SeqRecord(_seq_,
                          id=_id_,
                          letter_annotations={'phred_quality': best_qual}),
                fout, 'fastq')
        else:
            SeqIO.write(SeqRecord(_seq_, id=_id_), fout, 'fasta')

    fout.close()
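For reference, the error probability implied by phred quality q is 10 ** -(q / 10), so the expected number of base errors in a read is the sum of that quantity over its quality string:

quals = [20, 30, 40]
print(sum(10 ** -(q / 10.) for q in quals))  # 0.0111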