Beispiel #1
0
    def get_num_reads(self):
        """
        Count the entries in this sample's FASTA/FASTQ reads file(s).

        Single-end samples: returns the entry count as an integer.

        Paired-end samples: returns a comma-separated string holding
        one count per mate, in the order the mates appear in
        self.sample.rawdata: 'num_left_mate,num_right_mate'
        """
        self.logger.info("Getting number of reads.")
        if not self.sample.paired:
            # Single-end: a single reads file, a single integer count
            self.logger.info("Getting number of single-end reads.")
            single_entries = \
                fastx_utils.get_fastx_entries(self.sample.rawdata.reads_filename)
            return sum(1 for _ in single_entries)
        # Paired-end: tally each mate's reads file separately
        self.logger.info("Getting number of paired-end reads.")
        per_mate_counts = [
            sum(1 for _ in fastx_utils.get_fastx_entries(mate.reads_filename))
            for mate in self.sample.rawdata
        ]
        return ",".join(str(count) for count in per_mate_counts)
Beispiel #2
0
    def get_num_reads(self):
        """
        Return how many reads the sample's FASTA/FASTQ input contains.

        For a single-end sample the result is one integer.

        For a paired-end sample the result is a string with the
        per-mate counts joined by a comma:
        'num_left_mate,num_right_mate'
        """
        def count_entries(reads_fname):
            # Walk the file's entries once; only the tally is needed
            tally = 0
            for _ in fastx_utils.get_fastx_entries(reads_fname):
                tally += 1
            return tally

        self.logger.info("Getting number of reads.")
        if self.sample.paired:
            self.logger.info("Getting number of paired-end reads.")
            # One count per mate, kept in rawdata order
            counts_by_mate = []
            for mate in self.sample.rawdata:
                counts_by_mate.append(count_entries(mate.reads_filename))
            return ",".join([str(count) for count in counts_by_mate])
        self.logger.info("Getting number of single-end reads.")
        return count_entries(self.sample.rawdata.reads_filename)
Beispiel #3
0
def output_bed_coords_from_fasta(fasta_fname, bed_fname):
    """
    Write the coordinates found in FASTA entry names to a BED file.

    Each FASTA entry name is split on ';' and its second field is
    taken as the coordinates, i.e. names are expected to look like:

      >part_id;coords;entry_type

    The coordinates are parsed with parse_gff_coords and the start is
    shifted down by one for BED. The BED name column holds the
    original coordinate string and the score column is "1".

    Returns the summed length of all FASTA sequences read.

    Raises Exception if an entry name contains no ';'.
    """
    print("Converting FASTA %s to BED %s" % (fasta_fname, bed_fname))
    seq_len_sum = 0
    with open(bed_fname, "w") as bed_file:
        for entry_name, entry_seq in fastx_utils.get_fastx_entries(fasta_fname):
            if ";" not in entry_name:
                raise Exception("Malformed FASTA entry name: %s" % entry_name)
            # Second ';'-separated field carries the GFF-style coordinates
            coords_field = entry_name.split(";")[1]
            chrom, gff_start, end, strand = parse_gff_coords(coords_field)
            # BED start is the GFF start minus one
            bed_start = gff_start - 1
            bed_fields = [str(field) for field in (chrom, bed_start, end,
                                                   coords_field, "1",
                                                   strand)]
            interval = pybedtools.create_interval_from_list(bed_fields)
            bed_file.write(str(interval))
            # Keep a running total of sequence lengths
            seq_len_sum += len(entry_seq)
    return seq_len_sum
Beispiel #4
0
def load_fastx_into_ktable(fastx_fname, kmer_len):
    """
    Build a khmer ktable from the sequences of a FASTX file.

    Entries are read with fastx_utils.get_fastx_entries. Sequences
    shorter than kmer_len are skipped; every other sequence is
    consumed by the table. The elapsed time is printed before the
    table is returned.
    """
    started = time.time()
    table = khmer.new_ktable(kmer_len)
    for _, seq in fastx_utils.get_fastx_entries(fastx_fname):
        # Only sequences at least kmer_len long are consumed
        if len(seq) >= kmer_len:
            table.consume(seq)
    elapsed = time.time() - started
    print("Loading up of seqs into ktable took %.2f seconds." % elapsed)
    return table
Beispiel #5
0
def load_fastx_into_ktable(fastx_fname, kmer_len):
    """
    Load the sequences of a FASTX file into a new khmer ktable.

    A ktable for k-mers of length kmer_len is created and fed each
    sequence yielded by fastx_utils.get_fastx_entries, except those
    shorter than kmer_len. Prints how long loading took and returns
    the ktable.
    """
    time_before = time.time()
    kmer_table = khmer.new_ktable(kmer_len)
    long_enough_seqs = (entry_seq
                        for entry_name, entry_seq
                        in fastx_utils.get_fastx_entries(fastx_fname)
                        if not len(entry_seq) < kmer_len)
    for entry_seq in long_enough_seqs:
        kmer_table.consume(entry_seq)
    time_after = time.time()
    print("Loading up of seqs into ktable took %.2f seconds."
          % (time_after - time_before))
    return kmer_table
def jf_counts_to_dict(jf_counts_fname):
    """
    Read a jellyfish counts file (a FASTA file) into a
    defaultdict(int) mapping each kmer to its count.

    In the counts file the entry name holds the count and the
    entry sequence holds the kmer. If the file does not exist,
    an error is printed and the process exits with status 1.
    """
    if not os.path.isfile(jf_counts_fname):
        print("Error: Cannot find jf counts file %s" % jf_counts_fname)
        sys.exit(1)
    records = fastx_utils.get_fastx_entries(jf_counts_fname, fasta=True)
    # header[1:] drops the leading '>' so the rest parses as an integer
    return defaultdict(int,
                       ((kmer, int(header[1:])) for header, kmer in records))
Beispiel #7
0
def output_dinuc_shuffled_fasta(fasta_fname, shuffled_fasta_fname,
                                num_shuffles=1):
    """
    Given a FASTA file, output a dinucleotide shuffled version of it.

      - fasta_fname: input FASTA file to read entries from.
      - shuffled_fasta_fname: output file, opened with
        fastx_utils.write_open_fastx.
      - num_shuffles: number of shuffled records to write per input
        entry. Each record keeps the original entry name.

    The output handle is closed even if reading, shuffling or writing
    raises.
    """
    fasta_out = fastx_utils.write_open_fastx(shuffled_fasta_fname)
    try:
        for fastx_name, fastx_seq in fastx_utils.get_fastx_entries(fasta_fname):
            # One independent shuffle per requested copy; only the first
            # result of each get_dinuc_shuffles call is used.
            shuffled_recs = [(fastx_name, get_dinuc_shuffles(fastx_seq)[0])
                             for _ in range(num_shuffles)]
            fasta_utils.write_fasta(fasta_out, shuffled_recs)
    finally:
        # Always release the output handle, including on error
        fasta_out.close()
Beispiel #8
0
def jf_counts_to_dict(jf_counts_fname):
    """
    Build a kmer -> count mapping from a jellyfish counts file
    (a FASTA file whose entry names are counts and whose
    sequences are kmers).

    Returns a defaultdict(int). Prints an error and exits with
    status 1 when the counts file cannot be found.
    """
    counts_file_missing = not os.path.isfile(jf_counts_fname)
    if counts_file_missing:
        print("Error: Cannot find jf counts file %s" % jf_counts_fname)
        sys.exit(1)
    counts_by_kmer = defaultdict(int)
    entries = fastx_utils.get_fastx_entries(jf_counts_fname,
                                            fasta=True)
    for count_header, kmer_seq in entries:
        # Skip the '>' that prefixes the FASTA entry name
        count_text = count_header[1:]
        counts_by_kmer[kmer_seq] = int(count_text)
    return counts_by_kmer
Beispiel #9
0
def output_dinuc_shuffled_fasta(fasta_fname,
                                shuffled_fasta_fname,
                                num_shuffles=1):
    """
    Given a FASTA file, output a dinucleotide shuffled version of it.

    For every entry of fasta_fname, num_shuffles shuffled records are
    written to shuffled_fasta_fname, each under the original entry
    name. The output file is opened with fastx_utils.write_open_fastx
    and is closed on both success and failure.
    """
    fasta_out = fastx_utils.write_open_fastx(shuffled_fasta_fname)
    try:
        for fastx_entry in fastx_utils.get_fastx_entries(fasta_fname):
            fastx_name, fastx_seq = fastx_entry
            shuffled_recs = []
            for _ in range(num_shuffles):
                # Take the first shuffle returned for this sequence
                shuffled_seq = get_dinuc_shuffles(fastx_seq)[0]
                shuffled_recs.append((fastx_name, shuffled_seq))
            fasta_utils.write_fasta(fasta_out, shuffled_recs)
    finally:
        # Close the handle even when an entry fails part-way through,
        # so the output file is not left open.
        fasta_out.close()
def output_gff_event_seqs(event_ids, input_fasta_fname, output_fasta_fname,
                          entry_types=None,
                          suffixes=None,
                          remove_repeats=False):
    """
    Given a set of event ids, pull out their sequences from an
    input fasta filename and output these to a separate FASTA file.

    Return the list of FASTA entry names that were outputted.

    An entry is a candidate when its name, minus its first character
    (the '>' of the FASTA header), starts with one of the event ids.
    FASTA names are expected to have the form:

      >event_id;part_id;entry_type

      - entry_types: optional list of entry types that should be outputted,
        e.g. 'exon', 'intron'. Skip all entry types not within list.
      - suffixes: optional list of suffixes that the first field of the
        FASTA name should end in; entries whose first field ends in
        none of them are skipped.
      - remove_repeats: if True, lowercase ASCII letters are deleted
        from each sequence before it is written. Entries whose
        sequence is entirely lowercase are skipped.

    Raises IndexError if any entry name in the input has fewer than
    three ';'-separated fields.
    """
    num_events = len(event_ids)
    print("Retrieving sequences for %d events" % num_events)
    print("  - Input FASTA: %s" % input_fasta_fname)
    print("  - Output FASTA: %s" % output_fasta_fname)
    # Build the prefix/suffix tuples once: str.startswith/endswith accept
    # a tuple and stop at the first match, instead of building a filtered
    # list of every event id for every FASTA record.
    event_prefixes = tuple(event_ids)
    suffix_options = None if suffixes is None else tuple(suffixes)
    kept_fasta_entries = []
    with open(output_fasta_fname, "w") as fasta_out:
        for fasta_name, fasta_seq in \
                fastx_utils.get_fastx_entries(input_fasta_fname):
            fasta_name_fields = fasta_name.split(";")
            entry_type = fasta_name_fields[2]
            # Compare without the first character of the name
            if not fasta_name[1:].startswith(event_prefixes):
                continue
            # If given entry types, check that this sequence
            # is of one of the right entry types; otherwise
            # skip it
            if (entry_types is not None) and \
               (entry_type not in entry_types):
                continue
            # If given suffixes, check that the first field
            # of the FASTA name ends in one of the suffixes
            if (suffix_options is not None) and \
               (not fasta_name_fields[0].endswith(suffix_options)):
                continue
            # If asked, remove repeats from sequence
            if remove_repeats:
                # Python 2 str.translate: no table, delete lowercase chars
                repeatless_seq = \
                    fasta_seq.translate(None, string.ascii_lowercase)
                if len(repeatless_seq) == 0:
                    # Nothing is left of the sequence, so the entry
                    # is not written at all.
                    print("%s is all repeat! Not removing" % fasta_name)
                    continue
                fasta_seq = repeatless_seq
            fasta_out.write("%s\n" % fasta_name)
            fasta_out.write("%s\n" % fasta_seq)
            kept_fasta_entries.append(fasta_name)
    print("Outputted %d entries." % len(kept_fasta_entries))
    return kept_fasta_entries