Example #1
0
def patch_middle(orig, cov, error, n_error, patches, k, origA, origB, origM,
                 start, stop, buffer):
    """
    use re-assembled fragments to patch mis-assembly in middle of scaffold

    Each patch is a [header, sequence] pair; only the sequence (upper-cased)
    is used. `start` and `stop` are the anchor strings searched for in every
    patch, and `k + buffer` bases are skipped after the position where `start`
    is found. The remaining parameters (orig, cov, error, n_error, origA,
    origB, origM) are accepted for interface compatibility and are not read.

    Returns one of:
        [0, seq]            - a single patch holds start and stop in the same
                              orientation; seq is the sequence between them
        [12, [tail, head]]  - no bridging patch; tail is the longest sequence
                              following start, head the longest sequence
                              preceding stop (both longer than 20 bases)
        [1, tail]           - only a usable start-side extension was found
        [2, head]           - only a usable stop-side extension was found
        False               - nothing usable was found
    """
    attemptsA = []  # sequences that follow the start anchor
    for patch in patches:
        patch = patch[1].upper()
        p_start = patch.find(start)
        if p_start == -1:
            # try the other strand (rc is defined elsewhere in this file)
            patch = rc(['', patch])[1]
            p_start = patch.find(start)
        if p_start == -1:
            continue
        tail_from = p_start + k + buffer
        attemptsA.append(patch[tail_from:])
        # The orientation is now fixed by the start anchor, so the stop anchor
        # has to be located on the same strand. Flipping the patch again here
        # would pair p_start from one strand with p_stop from the other and
        # produce a meaningless slice.
        p_stop = patch.find(stop)
        if p_stop == -1:
            continue
        p = patch[tail_from:p_stop]
        if len(p) == 0:
            # stop lies before (or directly at) the end of the start anchor
            continue
        return [0, p]
    attemptsB = []  # sequences that precede the stop anchor
    for patch in patches:  # extend origB if possible
        patch = patch[1].upper()
        p_start = patch.find(start)
        if p_start == -1:
            patch = rc(['', patch])[1]
            p_start = patch.find(start)
        if p_start != -1:
            # patches containing start were already handled in the first pass
            continue
        p_stop = patch.find(stop)
        if p_stop == -1:
            patch = rc(['', patch])[1]
            p_stop = patch.find(stop)
        if p_stop == -1:
            continue
        attemptsB.append(patch[:p_stop])
    # longest candidate on each side; max() returns the first of equally long
    # candidates, so earlier patches win ties
    bestA = max(attemptsA, key=len) if attemptsA else None
    bestB = max(attemptsB, key=len) if attemptsB else None
    # what if only the start kmer could be found?
    if bestA is not None and len(bestA) > 20:
        if bestB is not None and len(bestB) > 20:
            return [12, [bestA, bestB]]
        return [1, bestA]
    # what if only the stop kmer could be found?
    if bestB is not None and len(bestB) > 20:
        return [2, bestB]
    return False  # could not find start and stop in any fragment
Example #2
0
def check_rc(seq, coords):
    """
    return seq unchanged when coords[0] <= coords[1]; otherwise return the
    result of passing it through rc (defined elsewhere; presumably the
    reverse complement)
    """
    first, second = coords[0], coords[1]
    is_reversed = first > second
    # rc works on [header, sequence] pairs, so wrap and unwrap the sequence
    return rc(['', seq])[1] if is_reversed else seq
Example #3
0
def find_16S(fastas,
             hmms,
             bit_thresh=float(20),
             length_thresh=500,
             masking=True,
             buffer=0):
    """
    generator: yield [header, sequence] for each 16S region found on the
    sequences in fastas

    1) find_coordinates(hmms, bit_thresh) supplies, per sequence id and group,
       [model, strand, coordinates, matches, gaps]
    2) a region is kept only if its length minus the inserted (gap) bases is
       at least length_thresh
    3) the region is extended by buffer on both sides, clipped to the
       sequence ends (coordinates are treated as 1-based, inclusive)
    4) when masking is True the sequence is passed through mask_sequence with
       the gaps before the region is cut out
    5) regions on the '-' strand are passed through rc before being yielded
    """
    # hits[seq_id][group] = [model, strand, coordinates, matches, gaps]
    hits = find_coordinates(hmms, bit_thresh)
    for fasta in fastas:
        for seq in parse_fasta(fasta):
            seq_id = seq[0].split('>')[1].split()[0]
            if seq_id not in hits:
                continue
            seq[1] = seq[1].upper()
            # running number of 16S genes reported for this contig
            n_found = 0
            for model, strand, coords, matches, gaps in hits[seq_id].values():
                # length of every insertion, and their total
                insert_lengths = [gap[1] - gap[0] + 1 for gap in gaps]
                ins_bases = sum(insert_lengths)
                # span of the hit (buffer excluded) and its non-inserted part
                total_len = coords[1] - coords[0] + 1
                aligned_len = total_len - ins_bases
                if aligned_len < length_thresh:
                    continue
                n_found += 1
                # retrieval window: hit +/- buffer, clipped to the sequence
                contig_len = len(seq[1])
                win_start = max([coords[0] - buffer, 1])
                win_end = min([coords[1] + buffer, contig_len])
                buffer_ends = check_buffer(coords, contig_len, buffer)
                if masking is True:
                    seq[1] = mask_sequence(seq[1], gaps)
                region = seq[1][(win_start - 1):win_end]
                # pair each match with the insert that follows it; the last
                # match is followed by 'end'
                model_pos = ';'.join(
                    '%s-%s(%s)' % (match[2], match[3], insert)
                    for match, insert in zip(matches,
                                             insert_lengths + ['end']))
                header = '%s 16SfromHMM::model=%s seq=%s pos=%s-%s strand=%s total-len=%s 16S-len=%s model-pos(ins-len)=%s buffer-len=%s/%s ins-bases=%s' % \
                        (seq[0], model, n_found, win_start, win_end, strand,
                         total_len, aligned_len, model_pos, buffer_ends[0],
                         buffer_ends[1], ins_bases)
                if strand == '-':
                    region = rc(['', region])[1]
                yield [header, region]
Example #4
0
def sam2fastq(sam, singles = False, force = False):
    """
    generator: convert paired reads in sam lines to fastq records

    Yields 4-element lists [@name, sequence, +name, quality], always the
    left read followed by the right read of a pair. Header lines (starting
    with '@') are skipped. Unpaired reads are skipped, or handed to
    print_single when singles is True. Reads flagged as reverse are passed
    through rc and their quality string is reversed.

    If a second left (or right) read shows up before its mate, the input is
    considered unsorted: the program exits unless force is True, in which
    case the pending read and the current line are both dropped.
    """
    pending_left, pending_right = None, None
    for record in sam:
        if record.startswith('@'):
            continue
        fields = record.strip().split()
        # decode the lowest 8 flag bits, least significant first
        digits = bin(int(fields[1])).split('b')[1][::-1]
        flags = [digit == '1' for digit in digits[:8]]
        flags.extend([False] * (8 - len(flags)))
        paired, rev, left, right = flags[0], flags[4], flags[6], flags[7]
        if not paired:
            if singles is True:
                print_single(fields, rev)
            continue
        # restore the original read orientation
        if rev:
            seq, qual = rc(['', fields[9]])[1], fields[10][::-1]
        else:
            seq, qual = fields[9], fields[10]
        name = fields[0]
        if left:
            if pending_left is not None:
                if force is False:
                    print('sam file is not sorted', file = sys.stderr)
                    print('\te.g.: %s' % (name), file = sys.stderr)
                    exit()
                # forced: discard the unmatched read together with this one
                pending_left = None
                continue
            pending_left = ['@%s' % name, seq, '+%s' % name, qual]
            if pending_right is not None:
                yield pending_left
                yield pending_right
                pending_left, pending_right = None, None
        if right:
            if pending_right is not None:
                if force is False:
                    print('sam file is not sorted', file = sys.stderr)
                    print('\te.g.: %s' % (name), file = sys.stderr)
                    exit()
                pending_right = None
                continue
            pending_right = ['@%s' % name, seq, '+%s' % name, qual]
            if pending_left is not None:
                yield pending_left
                yield pending_right
                pending_left, pending_right = None, None
Example #5
0
def print_single(line, rev):
    """
    write one read as a four-line fastq record to stderr

    line is a list of sam fields (name at index 0, sequence at 9, quality
    at 10). When rev is True the sequence is passed through rc and the
    quality string is reversed.
    """
    if rev is True:
        seq, qual = rc(['', line[9]])[1], line[10][::-1]
    else:
        seq, qual = line[9], line[10]
    name = line[0]
    record = '\n'.join(['@%s' % name, seq, '+%s' % name, qual])
    print(record, file=sys.stderr)
Example #6
0
def patch_end(orig, cov, error, n_error, patches, merged, k, origA, origB, start, stop, buffer):
    """
    use re-assembled fragments to extend scaffold from end

    Searches every patch ([header, sequence]; sequence is upper-cased) for
    start, retrying on the output of rc when it is not found. For each hit
    the sequence after the hit position plus k + buffer is a candidate.
    Returns the longest candidate (the earliest one on ties), or False when
    start was not found in any patch. Only patches, k, start and buffer are
    read.
    """
    extensions = []
    for patch in patches:
        fragment = patch[1].upper()
        hit = fragment.find(start)
        if hit == -1:
            fragment = rc(['', fragment])[1]
            hit = fragment.find(start)
        if hit == -1:
            continue
        extensions.append(fragment[(hit + k + buffer):])
    if not extensions:
        # what if you could not extend the scaffold?
        return False
    return max(extensions, key = len)
Example #7
0
def patch_start(orig, cov, error, n_error, patches, k, cov_thresh, buffer):
    """
    use re-assembled fragments to extend scaffold from start

    Scans cov for the first run of k + buffer consecutive positions accepted
    by check_cov; the matching bases of orig form the anchor. orig is trimmed
    to begin at that anchor (and to end at n_error unless n_error is False).
    Each patch ([header, sequence]) is searched for the anchor, retrying on
    the output of rc; the sequence in front of the anchor is a candidate
    extension.

    Returns False when no anchor could be built, [[error, 'o', trimmed]] when
    no patch extends the scaffold, and otherwise
    [[error, 'e', extension], [error, 'o', trimmed]] with the longest
    extension (the earliest one on ties).
    """
    anchor_len = k + buffer
    window = []
    anchor = None
    trimmed = False
    for pos, depth in enumerate(cov):
        if len(window) < anchor_len:
            # keep growing the window; any rejected position restarts it
            if check_cov(depth, cov_thresh) is False:
                window = []
            else:
                window.append(orig[pos])
            continue
        # window is complete: pos is the first position after it
        anchor = ''.join(window).upper()
        begin = pos - k - buffer
        trimmed = orig[begin:] if n_error is False else orig[begin:n_error]
        break
    if trimmed is False:  # no region of original sequence passed cov. threshold
        return False
    extensions = []
    for patch in patches:
        fragment = patch[1].upper()
        hit = fragment.find(anchor)
        if hit == -1:
            fragment = rc(['', fragment])[1]
            hit = fragment.find(anchor)
        # hit == 0 means nothing precedes the anchor, so there is no extension
        if hit > 0:
            extensions.append(fragment[:hit])
    kept = [error, 'o', trimmed]
    if not extensions:
        # what if you could not extend the scaffold?
        return [kept]
    return [[error, 'e', max(extensions, key=len)], kept]