Ejemplo n.º 1
0
def write_flanks_to_unpaired_fasta(pairs, fasta_prefix):
    """Write the 5' and 3' flanks of every pair to one unpaired FASTA file.

    All 5' records are written first, followed by all 3' records, into
    ``fasta_prefix + '.fasta'``.

    Args:
        pairs: Iterable of mappings providing the keys ``'pair_id'``,
            ``'seq_5p'`` and ``'seq_3p'``. ``'pair_id'`` is concatenated with
            a string suffix, so it must be a ``str``.
        fasta_prefix: Output path without the ``.fasta`` extension.
    """
    # NOTE(review): Bio.Alphabet (the source of IUPAC) was removed in
    # Biopython 1.78 -- confirm the Biopython version this file is pinned to.
    fiveprime_outseqs = []
    threeprime_outseqs = []
    for pair in pairs:
        name5p = str(pair['pair_id'] + '_1')
        name3p = str(pair['pair_id'] + '_2')

        fiveprime_outseqs.append(
            SeqRecord(Seq(pair['seq_5p'], IUPAC.IUPACAmbiguousDNA),
                      id=name5p,
                      description=name5p + '_5p'))
        # The 3' flank is stored reverse-complemented.
        threeprime_outseqs.append(
            SeqRecord(Seq(revcomp(pair['seq_3p']), IUPAC.IUPACAmbiguousDNA),
                      id=name3p,
                      description=name3p + '_3p'))

    # One handle is enough: the original reopened the same file in append
    # mode just to add the second batch of records.
    with open(fasta_prefix + '.fasta', 'w') as output_handle:
        SeqIO.write(fiveprime_outseqs, output_handle, 'fasta')
        SeqIO.write(threeprime_outseqs, output_handle, 'fasta')
Ejemplo n.º 2
0
    def get_inferred_sequence(self, forward_read, reverse_read, is_reverse):
        """Reconstruct the sequence spanned by a read pair and flag contig edges.

        The reference slice from ``forward_read.reference_start`` to
        ``reverse_read.reference_end`` is extended with the left soft-clipped
        bases of ``forward_read`` and the right soft-clipped bases of
        ``reverse_read``; ``self.context_width`` characters are then trimmed
        from both ends.

        Args:
            forward_read: Read providing the contig name, the start
                coordinate and the left soft clip.
            reverse_read: Read providing the end coordinate and the right
                soft clip.
            is_reverse: When truthy, the result is reverse-complemented.

        Returns:
            Tuple ``(inferred_sequence, contig_edge)``. ``contig_edge`` is
            ``True`` when a soft clip reaches before position 0 or to/after
            the end of the contig.
        """
        contig = forward_read.reference_name
        reference = self.genome_dict[contig]
        start = forward_read.reference_start
        end = reverse_read.reference_end

        aligned_span = ''.join(reference[start:end])
        inferred_sequence = sctools.left_softclipped_sequence_strict(forward_read) + \
                            aligned_span + \
                            sctools.right_softclipped_sequence_strict(reverse_read)

        # seq[0:-0] is seq[0:0] (empty), so only trim for a non-zero width;
        # a width of 0 must leave the sequence untouched.
        width = self.context_width
        if width:
            inferred_sequence = inferred_sequence[width:-width]

        if is_reverse:
            inferred_sequence = misc.revcomp(inferred_sequence)

        clipped_past_start = sctools.is_left_softclipped_strict(forward_read) and \
            sctools.left_softclipped_position(forward_read) < 0
        # Short-circuit keeps the right-hand helpers uncalled when the left
        # side already marks an edge, as in the original if/elif.
        contig_edge = bool(
            clipped_past_start or
            (sctools.is_right_softclipped_strict(reverse_read) and
             sctools.right_softclipped_position(reverse_read) >= len(reference))
        )

        return inferred_sequence, contig_edge
Ejemplo n.º 3
0
def get_seq_lengths(clusters, seqs, header=None):
    """Summarise sequence lengths per cluster, ignoring strand.

    Sequences are grouped by cluster; within a cluster a sequence and its
    reverse complement count as the same unique sequence.

    Args:
        clusters: Iterable of cluster labels, parallel to ``seqs``.
        seqs: Iterable of sequences, parallel to ``clusters``.
        header: Column names for the result. The first entry names the
            cluster column; each remaining entry must be one of
            ``'unique_seqs'``, ``'num_unique_seqs'``, ``'mean_length'``,
            ``'min_length'`` or ``'max_length'``. Defaults to
            ``['cluster', 'num_unique_seqs', 'mean_length', 'min_length',
            'max_length']``.

    Returns:
        A ``pandas.DataFrame`` with one row per cluster, in order of first
        appearance of each cluster, and the columns given by ``header``.

    Raises:
        KeyError: If ``header[1:]`` contains an unknown statistic name.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if header is None:
        header = ['cluster', 'num_unique_seqs', 'mean_length', 'min_length', 'max_length']

    # Drop exact duplicates first so revcomp runs once per distinct sequence.
    seqs_by_cluster = defaultdict(set)
    for cluster, seq in zip(clusters, seqs):
        seqs_by_cluster[cluster].add(seq)

    rows = []
    for cluster, cluster_seqs in seqs_by_cluster.items():
        # Strand-neutral key: the sorted (sequence, reverse complement) pair.
        unique_seqs = {tuple(sorted([seq, misc.revcomp(seq)])) for seq in cluster_seqs}
        lengths = [len(strand_pair[0]) for strand_pair in unique_seqs]
        stats = {
            'unique_seqs': unique_seqs,
            'num_unique_seqs': len(lengths),
            'mean_length': np.mean(lengths),
            'min_length': np.min(lengths),
            'max_length': np.max(lengths),
        }
        rows.append(tuple([cluster] + [stats[label] for label in header[1:]]))

    return pd.DataFrame(rows, columns=header)
Ejemplo n.º 4
0
def get_inferred_sequences(pairs, genome_dict, add_softclipped_bases=False):
    """Reconstruct the reference sequence spanned by each read pair.

    A query name containing exactly two underscores carries a context width
    in its second-to-last ``_``-separated field; that many characters are
    trimmed from both ends of the sequence and the reported coordinates are
    shifted inwards by the same amount. Any other query name is treated as
    having no context to trim.

    Args:
        pairs: Iterable of ``(read1, read2)`` tuples. ``read1`` supplies the
            contig, start coordinate and query name; ``read2`` supplies the
            end coordinate.
        genome_dict: Mapping from contig name to a sliceable sequence.
        add_softclipped_bases: When true, the left soft clip of ``read1``
            and the right soft clip of ``read2`` are added around the
            reference slice before trimming.

    Returns:
        List of ``(name, length, contig_edge, sequence)`` tuples, one per
        pair, where ``name`` is ``"<contig>:<start>-<end>"``.

    Raises:
        ValueError: If the context-width field of a two-underscore query
            name is not an integer.
    """
    inferred_sequences = []
    for read1, read2 in pairs:
        name_fields = read1.query_name.split('_')
        # Exactly two underscores <=> exactly three fields.
        context_width = int(name_fields[-2]) if len(name_fields) == 3 else 0

        name = read1.reference_name + ':' + str(read1.reference_start + context_width) + \
            '-' + str(read2.reference_end - context_width)

        inferred_sequence = genome_dict[read1.reference_name][read1.reference_start:read2.reference_end]

        if add_softclipped_bases:
            inferred_sequence = sctools.left_softclipped_sequence_strict(read1) + \
                inferred_sequence + \
                sctools.right_softclipped_sequence_strict(read2)

        # seq[0:-0] is empty, so a zero width must skip the trim instead of
        # wiping out the whole sequence.
        if context_width:
            inferred_sequence = inferred_sequence[context_width:-context_width]

        # A trailing "_2" in the query name selects the reverse complement.
        if name_fields[-1] == '2':
            inferred_sequence = misc.revcomp(inferred_sequence)

        clipped_past_start = sctools.is_left_softclipped_strict(read1) and \
            sctools.left_softclipped_position(read1) < 0
        contig_edge = bool(
            clipped_past_start or
            (sctools.is_right_softclipped_strict(read2) and
             sctools.right_softclipped_position(read2) >= len(genome_dict[read2.reference_name]))
        )

        inferred_sequences.append((name, len(inferred_sequence), contig_edge, inferred_sequence))

    return inferred_sequences
Ejemplo n.º 5
0
    def get_inferred_sequence(self, forward_read, reverse_read, is_reverse):
        """Return the sequence spanned by a read pair, soft clips included.

        The reference slice between ``forward_read.reference_start`` and
        ``reverse_read.reference_end`` on the forward read's contig is
        flanked by the forward read's left soft clip and the reverse read's
        right soft clip. The result is reverse-complemented when
        ``is_reverse`` is truthy.
        """
        contig = forward_read.reference_name
        span_start = forward_read.reference_start
        span_end = reverse_read.reference_end

        aligned_span = ''.join(self.genome_dict[contig][span_start:span_end])
        left_clip = sctools.left_softclipped_sequence_strict(forward_read)
        right_clip = sctools.right_softclipped_sequence_strict(reverse_read)
        full_sequence = left_clip + aligned_span + right_clip

        return misc.revcomp(full_sequence) if is_reverse else full_sequence
Ejemplo n.º 6
0
def parse_cdhit_output_100p(unclustered_fasta, cdhit_output):
    """Map every unclustered record id to the id of its cluster record.

    Records are matched by a strand-neutral key: the sorted pair of the
    sequence and its reverse complement.

    Args:
        unclustered_fasta: FASTA source of the records to be mapped.
        cdhit_output: FASTA source of the cluster records.

    Returns:
        Dict from each unclustered record id to the id of the matching
        record in ``cdhit_output``.

    Raises:
        KeyError: If an unclustered sequence has no match in ``cdhit_output``.
    """
    def strand_neutral_key(record):
        forward = str(record.seq)
        return tuple(sorted([forward, misc.revcomp(forward)]))

    # When several cluster records share a key, the last one parsed wins.
    representatives = {
        strand_neutral_key(record): record
        for record in SeqIO.parse(cdhit_output, 'fasta')
    }

    return {
        record.id: representatives[strand_neutral_key(record)].id
        for record in SeqIO.parse(unclustered_fasta, 'fasta')
    }
Ejemplo n.º 7
0
    def retrieve_extended_sequence(self, orient):
        """Build the extended sequence implied by the alignments in the SAM file.

        Every read in ``self.align_sam_path`` is examined in order. For each
        one the matching assembled contig is looked up and, depending on
        ``orient`` and the read's strand, either the contig tail starting at
        the read (prefixed with the read's left soft clip) or the contig
        head ending at the read (suffixed with the read's right soft clip)
        is taken; reverse-strand reads yield the reverse complement.

        Args:
            orient: ``'R'`` or ``'L'``. Any other value leaves the extension
                unchanged for that read.

        Returns:
            The extension computed for the last read, or ``None`` when the
            file has no reads, when any read is unmapped, or when the contig
            does not reach beyond the query length on the relevant side.
        """
        extension = None
        # The context manager closes the SAM handle on every exit path,
        # including the early `return None` statements below.
        with pysam.AlignmentFile(self.align_sam_path, 'r') as outsam:
            for read in outsam:
                if read.is_unmapped:
                    return None

                query_length = len(read.query_sequence)

                # NOTE(review): current_contig stays None when no assembled
                # contig matches the read's reference, and the len() calls
                # below would then raise TypeError -- confirm that cannot
                # happen.
                current_contig = None
                for contig in self.get_assembled_sequences():
                    if contig.id == read.reference_name:
                        current_contig = str(contig.seq)

                if orient == 'R' and (not read.is_reverse):
                    if len(current_contig) - read.reference_start <= query_length:
                        return None
                    softclip = left_softclipped_sequence_strict(read)
                    extension = softclip + current_contig[read.reference_start:]
                elif orient == 'R' and read.is_reverse:
                    if read.reference_end <= query_length:
                        return None
                    softclip = right_softclipped_sequence_strict(read)
                    extension = revcomp(current_contig[:read.reference_end] + softclip)
                elif orient == 'L' and (not read.is_reverse):
                    if read.reference_end <= query_length:
                        return None
                    softclip = right_softclipped_sequence_strict(read)
                    extension = current_contig[:read.reference_end] + softclip
                elif orient == 'L' and read.is_reverse:
                    if len(current_contig) - read.reference_start <= query_length:
                        return None
                    softclip = left_softclipped_sequence_strict(read)
                    extension = revcomp(softclip + current_contig[read.reference_start:])

        return extension
Ejemplo n.º 8
0
def get_inferred_sequences(pairs, genome_dict, add_softclipped_bases=False):
    """Reconstruct the reference sequence spanned by each read pair.

    Args:
        pairs: Iterable of ``(read1, read2)`` tuples. ``read1`` supplies the
            contig and start coordinate; ``read2`` supplies the end.
        genome_dict: Mapping from contig name to a sliceable sequence.
        add_softclipped_bases: When true, the left soft clip of ``read1``
            and the right soft clip of ``read2`` are added around the
            reference slice.

    Returns:
        List of ``(name, length, sequence)`` tuples, one per pair, where
        ``name`` is ``"<contig>:<start>-<end>"``. The sequence is
        reverse-complemented when ``read1.is_read2`` is truthy.
    """
    def infer(first, second):
        label = first.reference_name + ':' + str(first.reference_start) + '-' + str(second.reference_end)
        sequence = genome_dict[first.reference_name][first.reference_start:second.reference_end]

        if add_softclipped_bases:
            left_clip = sctools.left_softclipped_sequence_strict(first)
            sequence = left_clip + sequence + sctools.right_softclipped_sequence_strict(second)

        if first.is_read2:
            sequence = misc.revcomp(sequence)

        return label, len(sequence), sequence

    return [infer(read1, read2) for read1, read2 in pairs]
Ejemplo n.º 9
0
def add_sequence_to_secondary_alignment(sam_file_in, sam_file_out):
    """Copy a SAM file, filling in the sequence of records that lack one.

    Records are processed in file order. A record that has a query sequence
    is remembered together with its strand. A record without a query
    sequence receives the most recently remembered sequence, which is
    reverse-complemented when the record's strand differs from the
    remembered one.

    Args:
        sam_file_in: Path of the SAM file to read.
        sam_file_out: Path of the SAM file to write; it reuses the input
            file as its template.
    """
    # Both handles are managed by `with`, so they are closed even when an
    # error occurs. The original also opened sam_file_out with the builtin
    # open() and immediately rebound the name, leaking that handle.
    with pysam.AlignmentFile(sam_file_in, 'r') as infile, \
            pysam.AlignmentFile(sam_file_out, "w", template=infile) as outfile:
        current_seq = None
        current_seq_reverse = None
        for read in infile:
            if read.query_sequence is not None:
                current_seq = read.query_sequence
                current_seq_reverse = read.is_reverse
            elif current_seq_reverse == read.is_reverse:
                read.query_sequence = current_seq
            else:
                read.query_sequence = misc.revcomp(current_seq)

            outfile.write(read)
Ejemplo n.º 10
0
def cluster_100p(infile, outfile):
    """Write one FASTA record per distinct sequence, ignoring strand.

    Two records belong together when their sequences are identical or are
    reverse complements of each other; the first record encountered in
    ``infile`` represents each group.

    Args:
        infile: FASTA source to read.
        outfile: Path of the FASTA file to write.
    """
    # Pass 1: keep the first record seen for every exact sequence.
    first_by_sequence = {}
    for record in SeqIO.parse(infile, 'fasta'):
        first_by_sequence.setdefault(str(record.seq), record)

    # Pass 2: merge sequences that share a strand-neutral key, again keeping
    # the earliest record.
    first_by_strand_pair = {}
    for sequence, record in first_by_sequence.items():
        strand_pair = tuple(sorted([sequence, misc.revcomp(sequence)]))
        first_by_strand_pair.setdefault(strand_pair, record)

    with open(outfile, "w") as handle:
        SeqIO.write(first_by_strand_pair.values(), handle, "fasta")