Beispiel #1
0
def pick_longest_rep(fasta_filename, gff_filename, group_filename,
                     output_filename):
    """
    For each group, select the longest member sequence as the representative.

    Every representative is written to the output fasta with the id
    ``<pb_id>|<coords>|<best_id>``, where ``<coords>`` is
    ``chrom:start-end(strand)`` taken from the ``transcript`` line of the
    GFF file whose transcript_id equals the group's pb_id. When several
    members tie for the longest length, the last one listed wins.

    :param fasta_filename: fasta file holding the member sequences, looked
        up by member id
    :param gff_filename: tab-delimited annotation file; only lines whose
        third column is ``transcript`` are used
    :param group_filename: tab-delimited file, one group per line:
        ``<pb_id><TAB><comma-separated member ids>``
    :param output_filename: fasta file the representatives are written to

    :raises KeyError: if a group's pb_id has no transcript line in the GFF
    """
    fastad = LazyFastaReader(fasta_filename)
    fout = FastaWriter(output_filename)

    try:
        coords = {}
        with open(gff_filename) as gff:
            for line in gff:
                line = line.strip()
                # blank lines and '#' comment/header lines have no columns
                if not line or line.startswith('#'):
                    continue
                # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PB.1"; transcript_id "PB.1.1";
                raw = line.split('\t')
                if raw[2] == 'transcript':
                    # second attribute is `transcript_id "PB.1.1";` --
                    # [1:-2] drops the opening quote and the trailing `";`
                    tid = raw[-1].split('; ')[1].split()[1][1:-2]
                    coords[tid] = "{0}:{1}-{2}({3})".format(
                        raw[0], raw[3], raw[4], raw[6])

        with open(group_filename) as groups:
            for line in groups:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing empty lines
                pb_id, members = line.split('\t')
                best_id = None
                best_seq = None
                max_len = 0
                for x in members.split(','):
                    seq = fastad[x].sequence  # fetch the record only once
                    # `>=` keeps the original tie-break: last longest wins
                    if len(seq) >= max_len:
                        best_id = x
                        best_seq = seq
                        max_len = len(seq)
                fout.writeRecord(
                    "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id),
                    best_seq)
    finally:
        # always release the output handle, even if parsing fails midway
        fout.close()
def pick_longest_rep(fasta_filename, gff_filename, group_filename, output_filename):
    """
    For each group, select the longest member sequence as the representative.

    Every representative is written to the output fasta with the id
    ``<pb_id>|<coords>|<best_id>``, where ``<coords>`` is
    ``chrom:start-end(strand)`` taken from the ``transcript`` line of the
    GFF file whose transcript_id equals the group's pb_id. When several
    members tie for the longest length, the last one listed wins.

    :param fasta_filename: fasta file holding the member sequences, looked
        up by member id
    :param gff_filename: tab-delimited annotation file; only lines whose
        third column is ``transcript`` are used
    :param group_filename: tab-delimited file, one group per line:
        ``<pb_id><TAB><comma-separated member ids>``
    :param output_filename: fasta file the representatives are written to

    :raises KeyError: if a group's pb_id has no transcript line in the GFF
    """
    fastad = LazyFastaReader(fasta_filename)
    fout = FastaWriter(output_filename)

    try:
        coords = {}
        with open(gff_filename) as gff:
            for line in gff:
                line = line.strip()
                # blank lines and "#" comment/header lines have no columns
                if not line or line.startswith("#"):
                    continue
                # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PB.1"; transcript_id "PB.1.1";
                raw = line.split("\t")
                if raw[2] == "transcript":
                    # second attribute is `transcript_id "PB.1.1";` --
                    # [1:-2] drops the opening quote and the trailing `";`
                    tid = raw[-1].split("; ")[1].split()[1][1:-2]
                    coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

        with open(group_filename) as groups:
            for line in groups:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing empty lines
                pb_id, members = line.split("\t")
                best_id = None
                best_seq = None
                max_len = 0
                for x in members.split(","):
                    seq = fastad[x].sequence  # fetch the record only once
                    # `>=` keeps the original tie-break: last longest wins
                    if len(seq) >= max_len:
                        best_id = x
                        best_seq = seq
                        max_len = len(seq)
                fout.writeRecord("{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id), best_seq)
    finally:
        # always release the output handle, even if parsing fails midway
        fout.close()
Beispiel #3
0
    def convert_to_dazz_fasta(self):
        """
        Convert input fasta/fastq file to daligner-compatible fasta with ids:
        <prefix>/<index>/0_<seqlen>

        <prefix> is self.dazz_movie_name and <index> is the 1-based position
        of the record in the input file. The index -> original record id
        mapping is stored in self.dazz_mapping and also dumped to
        <self.dazz_filename>.pickle
        """
        if self.filetype == "fasta":
            reader = FastaReader(self.input_filename)
        else:
            reader = FastqReader(self.input_filename)

        writer = FastaWriter(self.dazz_filename)
        try:
            for i, r in enumerate(reader, start=1):
                writer.writeRecord("{p}/{i}/0_{len}".format(p=self.dazz_movie_name, i=i, len=len(r.sequence)), r.sequence)
                self.dazz_mapping[i] = r.id
        finally:
            # close the fasta even if a record fails to convert
            writer.close()

        # NOTE(review): assumes `dump` is pickle.dump -- pickle emits bytes,
        # so the file must be opened in binary mode ("w" fails on Python 3).
        with open(self.dazz_filename + ".pickle", "wb") as handle:
            dump(self.dazz_mapping, handle)
Beispiel #4
0
    def convert_to_dazz_fasta(self):
        """
        Convert input fasta/fastq file to daligner-compatible fasta with ids:
        <prefix>/<index>/0_<seqlen>

        <prefix> is self.dazz_movie_name and <index> is the 1-based position
        of the record in the input file. The index -> original record id
        mapping is stored in self.dazz_mapping and also dumped to
        <self.dazz_filename>.pickle
        """
        if self.filetype == 'fasta':
            reader = FastaReader(self.input_filename)
        else:
            reader = FastqReader(self.input_filename)

        writer = FastaWriter(self.dazz_filename)
        try:
            for i, r in enumerate(reader, start=1):
                writer.writeRecord("{p}/{i}/0_{len}".format(p=self.dazz_movie_name, i=i, len=len(r.sequence)), r.sequence)
                self.dazz_mapping[i] = r.id
        finally:
            # close the fasta even if a record fails to convert
            writer.close()

        # NOTE(review): assumes `dump` is pickle.dump -- pickle emits bytes,
        # so the file must be opened in binary mode ('w' fails on Python 3).
        with open(self.dazz_filename + '.pickle', 'wb') as handle:
            dump(self.dazz_mapping, handle)