Example #1
0
def combine_with_hits(clean, s_db, search_out, hits):
    """
    combine sequences with best hits from search

    Writes every sequence from `clean`, followed by the database sequence
    of each best hit listed in `search_out`, to
    '<clean without extension>.best<hits>refs.fa' and returns that path.
    If the file already exists it is returned untouched, before the search
    output is parsed.

    clean      -- path to fasta file of query sequences
    s_db       -- path to fasta file of the search database; a '.tch'
                  index next to it is built with fasta2tch() when missing
    search_out -- path to the search output parsed by numblast()
    hits       -- number of hits passed to numblast(); also part of the
                  output file name
    """
    combo = '%s.best%srefs.fa' % (clean.rsplit('.', 1)[0], hits)
    # reuse an existing result before doing any parsing
    if os.path.exists(combo):
        return combo
    # collect the ids of the best hits; close the search output when done
    with open(search_out) as search:
        best = set(hit[1].split()[0]
                   for hit in numblast(search, hits, False, False))
    # create/open tch for search database
    s_tch = '%s.tch' % (s_db)
    if not os.path.exists(s_tch):
        fasta2tch(s_db)
    id2seq = hash.Hash()
    id2seq.open(s_tch)
    try:
        with open(combo, 'w') as out:
            for seq in parse_fasta(clean):
                out.write('\n'.join(seq) + '\n')
            # get sequences for best hits from tch
            for hit in best:
                seq = id2seq[hit].split('\n')
                header = remove_char(seq[0].split()[0]).replace('>', '>best-hit_')
                out.write('\n'.join([header, seq[1].upper()]) + '\n')
    finally:
        # release the tch handle even if writing fails
        id2seq.close()
    return combo
Example #2
0
def find_16S(fastas,
             hmms,
             bit_thresh=float(20),
             length_thresh=500,
             masking=True,
             buffer=0):
    """
    Yield [header, sequence] for every 16S region located by the HMM search.

    1) find_coordinates(hmms, bit_thresh) supplies, per sequence id and
       group: [model, strand, coordinates, matches, gaps]
    2) the insertion bases (sum of the gap lengths) are subtracted from the
       span of the coordinates; regions whose remaining length is below
       length_thresh are skipped
    3) the retrieval window is the span widened by `buffer` on each side and
       clipped to the sequence
    4) when masking is True, the gaps are passed to mask_sequence() before
       the window is cut out (the masked sequence is kept for later regions
       on the same contig)
    5) regions on the '-' strand are reverse complemented with rc()
    """
    # hits_by_contig[seq][group] = [model, strand, coordinates, matches, gaps]
    hits_by_contig = find_coordinates(hmms, bit_thresh)
    for fasta in fastas:
        for seq in parse_fasta(fasta):
            contig = seq[0].split('>')[1].split()[0]
            if contig not in hits_by_contig:
                continue
            seq[1] = seq[1].upper()
            # running number of the 16S genes reported for this contig
            gene_number = 0
            for hit in list(hits_by_contig[contig].values()):
                model, strand, coords, matches, gaps = hit
                # bases that fall in insertions
                insertion_bases = sum(gap[1] - gap[0] + 1 for gap in gaps)
                # span of the hit and its length without insertions
                # (the buffer is not included)
                total_len = coords[1] - coords[0] + 1
                aligned_len = total_len - insertion_bases
                if aligned_len < length_thresh:
                    continue
                gene_number += 1
                # retrieval window: widen by the buffer, clip to the contig
                window = [
                    max([coords[0] - buffer, 1]),
                    min([coords[1] + buffer, len(seq[1])]),
                    coords[2],
                ]
                buffer_ends = check_buffer(coords, len(seq[1]), buffer)
                # mask insertion sequences
                if masking is True:
                    seq[1] = mask_sequence(seq[1], gaps)
                region = seq[1][(window[0] - 1):window[1]]
                # pair each match with the length of the insertion after it;
                # the final match is labelled 'end'
                insert_labels = [gap[1] - gap[0] + 1 for gap in gaps] + ['end']
                model_pos = ';'.join(
                    '%s-%s(%s)' % (match[2], match[3], label)
                    for match, label in zip(matches, insert_labels)
                )
                header = '%s 16SfromHMM::model=%s seq=%s pos=%s-%s strand=%s total-len=%s 16S-len=%s model-pos(ins-len)=%s buffer-len=%s/%s ins-bases=%s' % \
                        (seq[0], model, gene_number, window[0], window[1], strand, total_len, aligned_len, model_pos, buffer_ends[0], buffer_ends[1], insertion_bases)
                # reverse complement if strand is reverse
                if strand == '-':
                    region = rc(['', region])[1]
                yield [header, region]
Example #3
0
def headerid2desc(fasta, subset=False):
    """
    map sequence ids to '<full header> len:<sequence length>'

    fasta  -- fasta input handed to parse_fasta()
    subset -- False (default) to include every sequence, otherwise a
              container of ids; only sequences whose id is in it are kept

    returns a dict keyed by the first whitespace-delimited word of each
    header (without the leading '>')
    """
    db = {}
    for seq in parse_fasta(fasta):
        header = seq[0].split('>')[1]
        seq_id = header.split()[0]
        # honour the subset filter: skip ids that were not requested
        if subset is not False and seq_id not in subset:
            continue
        db[seq_id] = '%s len:%s' % (header, len(seq[1]))
    return db
Example #4
0
def strip_masked(fasta, min_len, print_masked):
    """
    strip masked regions (as split out by parse_masked with min_len)
    from every sequence in a fasta file

    yields [0, [header, sequence]] for each stripped sequence and, when
    print_masked is True, [1, [header, sequence]] for each non-empty
    masked region, numbered from 1 within its sequence
    """
    for record in parse_fasta(fasta):
        unmasked, masked = parse_masked(record, min_len)
        stripped_header = '%s removed_masked >=%s' % (record[0], min_len)
        yield [0, [stripped_header, ''.join(unmasked)]]
        if print_masked is not True:
            continue
        # empty regions are dropped before numbering
        insertions = [region for region in masked if region != []]
        for number, region in enumerate(insertions, 1):
            insertion_header = '%s insertion:%s' % (record[0], number)
            yield [1, [insertion_header, ''.join(region)]]
Example #5
0
def check_type(fasta):
    """
    guess whether a fasta file holds nucleotide or protein sequences

    Only the first sequence is inspected, and only its first residue that
    is not one of N, U, '.', '-' or space: 'nucl' is returned when that
    residue is A, T, G or C, otherwise 'prot'. Input with no sequences,
    or whose first sequence has no such residue, is reported as 'nucl'.
    """
    ignored = ('N', 'U', '.', '-', ' ')
    nucleotides = ('A', 'T', 'G', 'C')
    for record in parse_fasta(fasta):
        for residue in record[1].upper():
            if residue in ignored:
                continue
            # the first informative residue decides
            if residue in nucleotides:
                return 'nucl'
            return 'prot'
        # first sequence had nothing informative
        return 'nucl'
    return 'nucl'
Example #6
0
def _kegg_rows(kegg, seq_id, tokens):
    """Yield one tab-delimited '<id> <K> <function>' row per function of each K number found in tokens."""
    for k in set(find_ko(tokens)):
        if k in kegg:
            for function in kegg[k].split('|'):
                # - id - k - function
                yield '\t%s\t%s\t%s' % (seq_id, k, function)
        else:
            yield '\t%s\t%s\tn/a' % (seq_id, k)


def ko2kegg(file, option, file_type):
    """
    look up the K numbers found by find_ko() in the database selected by
    option2kegg(option)

    file      -- input whose form depends on file_type
    option    -- passed to option2kegg() to select the database
    file_type -- 'fasta': file is parsed with parse_fasta(); each header is
                     yielded, followed by one tab-delimited row
                     (id, K number, function) per function
                 'list': file is iterated line by line; each non-blank
                     line is yielded (stripped), followed by the same rows
                 anything else: file is passed to find_ko() directly and
                     [K number, function] lists are yielded

    K numbers missing from the database are reported with 'n/a'.
    The database is closed when the generator finishes, is closed early,
    or fails.
    """
    tch = option2kegg(option)
    kegg = hash.Hash()
    kegg.open(tch)
    try:
        if file_type == 'fasta':
            for sequence in parse_fasta(file):
                header = sequence[0].split('>')[1]
                seq_id = header.split()[0]
                yield header
                for row in _kegg_rows(kegg, seq_id, header.split()):
                    yield row
        elif file_type == 'list':
            for line in file:
                line = line.strip()
                fields = line.split()
                if len(fields) == 0:
                    continue
                yield line
                for row in _kegg_rows(kegg, fields[0], fields):
                    yield row
        else:
            for k in set(find_ko(file)):
                if k in kegg:
                    for function in kegg[k].split('|'):
                        yield [k, function]
                else:
                    yield [k, 'n/a']
    finally:
        # runs on exhaustion, on generator.close() and on errors
        kegg.close()
Example #7
0
def de_rep(fastas, append_index, return_original = False):
    """
    de-replicate fastas based on sequence names

    The first sequence seen with a given id is yielded unchanged. A later
    sequence with the same id is dropped, unless append_index equals True,
    in which case it is yielded under the new id supplied by
    append_index_id().

    When return_original is True each item is
    [original header split on whitespace, sequence]; otherwise only the
    sequence is yielded.
    """
    seen = []
    for fasta in fastas:
        for record in parse_fasta(fasta):
            fields = record[0].split('>')[1].split()
            name = fields[0]
            if name not in seen:
                seen.append(name)
                output = record
            elif append_index == True:
                # duplicate id: rebuild the header around the new id
                new, seen = append_index_id(name, seen)
                output = ['>%s %s' % (new, ' '.join(fields[1:])), record[1]]
            else:
                continue
            if return_original is True:
                yield [fields, output]
            else:
                yield output