Example #1
0
 def get_reads():
     """Yield one fastq.Read per entry of the 'non_long_polyA' counts.

     Entries are taken from the 'common_unmapped' file (read through the
     enclosing scope's ``self``) in the order given by ``most_common()``.
     Each read is named ``'<index>_<count>'`` and gets a quality of 40 at
     every position, encoded with ``fastq.encode_sanger``.
     """
     ranked = self.read_file('common_unmapped')['non_long_polyA'].most_common()
     for rank, (sequence, occurrences) in enumerate(ranked):
         name = '{0}_{1}'.format(rank, occurrences)
         # Uniform quality: one value of 40 per base of the sequence.
         qualities = fastq.encode_sanger([40] * len(sequence))
         yield fastq.Read(name, sequence, qualities)
Example #2
0
def make_artificial_reads(
    transcript,
    fragment_length,
    read_length,
    adapter_sequence,
    region_fetcher,
    common_buffer,
):
    """Yield artificial reads tiled across a transcript, one per position.

    For every position in
    ``range(-common_buffer, transcript.CDS_length + common_buffer)`` a
    fragment of ``fragment_length`` characters is sliced from the retrieved
    transcript sequence, ``adapter_sequence`` is appended, and the result is
    truncated to ``read_length``.  Fragments containing ``'-'`` are skipped.

    Args:
        transcript: object providing ``retrieve_sequence``, ``CDS_length``
            and ``name``.
        fragment_length: number of transcript characters in each fragment.
        read_length: maximum length of each emitted read.
        adapter_sequence: sequence appended after each fragment.
        region_fetcher: passed through to ``transcript.retrieve_sequence``.
        common_buffer: number of positions to extend on both sides of the
            tiled range.

    Yields:
        ``fastq.Read`` objects named by the annotation's ``identifier``.
        The quality string always has the same length as the sequence.
    """
    transcript_sequence = transcript.retrieve_sequence(
        region_fetcher,
        left_buffer=common_buffer,
        right_buffer=common_buffer + fragment_length,
    )
    # Needs to include one non-Solexa value for automatic encoding recognition.
    high_quals = fastq.encode_sanger([25] + [30] * (read_length - 1))
    positions = range(-common_buffer, transcript.CDS_length + common_buffer)
    for i, transcript_position in enumerate(positions):
        annotation = artifical_annotation(
            transcript_name=transcript.name,
            position=transcript_position,
        )
        fragment_sequence = transcript_sequence[i:i + fragment_length]
        if '-' in fragment_sequence:
            # skip fragments that run off the edge of a reference sequence
            continue

        read_sequence = (fragment_sequence + adapter_sequence)[:read_length]
        # The fragment plus adapter can be shorter than read_length; trim the
        # qualities so sequence and quality lengths always agree.
        # NOTE(review): assumes encode_sanger returns a sliceable string.
        read_quals = high_quals[:len(read_sequence)]
        yield fastq.Read(annotation.identifier, read_sequence, read_quals)
 def get_reads():
     """Yield one fastq.Read per entry of the counts read from unmapped_fn.

     Entries are visited in the order given by ``most_common()``.  Each read
     is named ``'<index>_<count>'`` and gets a quality of 40 at every
     position, encoded with ``fastq.encode_sanger``.
     """
     ranked = counts.read_file(unmapped_fn).most_common()
     for rank, (sequence, occurrences) in enumerate(ranked):
         name = '{0}_{1}'.format(rank, occurrences)
         # Uniform quality: one value of 40 per base of the sequence.
         qualities = fastq.encode_sanger([40] * len(sequence))
         yield fastq.Read(name, sequence, qualities)
Example #4
0
def make_artificial_reads(transcript,
                          fragment_length,
                          read_length,
                          adapter_sequence,
                          region_fetcher,
                          common_buffer,
                         ):
    """Yield artificial reads tiled across a transcript, one per position.

    For every position in
    ``range(-common_buffer, transcript.CDS_length + common_buffer)`` a
    fragment of ``fragment_length`` characters is sliced from the retrieved
    transcript sequence, ``adapter_sequence`` is appended, and the result is
    truncated to ``read_length``.  Fragments containing ``'-'`` are skipped.

    Args:
        transcript: object providing ``retrieve_sequence``, ``CDS_length``
            and ``name``.
        fragment_length: number of transcript characters in each fragment.
        read_length: maximum length of each emitted read.
        adapter_sequence: sequence appended after each fragment.
        region_fetcher: passed through to ``transcript.retrieve_sequence``.
        common_buffer: number of positions to extend on both sides of the
            tiled range.

    Yields:
        ``fastq.Read`` objects named by the annotation's ``identifier``.
        The quality string always has the same length as the sequence.
    """
    transcript_sequence = transcript.retrieve_sequence(region_fetcher,
                                                       left_buffer=common_buffer,
                                                       right_buffer=common_buffer + fragment_length,
                                                      )
    # Needs to include one non-Solexa value for automatic encoding recognition.
    high_quals = fastq.encode_sanger([25] + [30]*(read_length - 1))
    for i, transcript_position in enumerate(range(-common_buffer, transcript.CDS_length + common_buffer)):
        annotation = artifical_annotation(transcript_name=transcript.name,
                                          position=transcript_position,
                                         )
        fragment_sequence = transcript_sequence[i:i + fragment_length]
        if '-' in fragment_sequence:
            # skip fragments that run off the edge of a reference sequence
            continue

        read_sequence = (fragment_sequence + adapter_sequence)[:read_length]
        # The fragment plus adapter can be shorter than read_length; trim the
        # qualities so sequence and quality lengths always agree.
        # NOTE(review): assumes encode_sanger returns a sliceable string.
        read_quals = high_quals[:len(read_sequence)]
        read = fastq.Read(annotation.identifier, read_sequence, read_quals)
        yield read