Python parse_fasta Examples, ctbBio27.fasta.parse_fasta Python Examples

Example #1

0

Show file

def combine_with_hits(clean, s_db, search_out, hits):
    """
    combine sequences with best hits from search

    clean: path to the fasta file of query sequences
    s_db: path to the fasta file used as search database; a '<s_db>.tch'
        lookup is created with fasta2tch() if it does not exist yet
    search_out: path to the search output parsed by numblast()
    hits: number of hits handed to numblast(); also used in the output name

    returns the path of the combined fasta file
    ('<clean minus extension>.best<hits>refs.fa'); if that file already
    exists it is returned as-is without being rewritten
    """
    # ids of the best hits; the search output is closed as soon as it is parsed
    with open(search_out) as search:
        best = {hit[1].split()[0] for hit in numblast(search, hits, False, False)}
    combo = '%s.best%srefs.fa' % (clean.rsplit('.', 1)[0], hits)
    if os.path.exists(combo):
        return combo
    with open(combo, 'w') as out:
        for seq in parse_fasta(clean):
            out.write('\n'.join(seq) + '\n')
        # create/open tch for search database
        s_tch = '%s.tch' % (s_db)
        if not os.path.exists(s_tch):
            fasta2tch(s_db)
        id2seq = hash.Hash()
        id2seq.open(s_tch)
        try:
            # get sequences for best hits from tch
            for hit in best:
                seq = id2seq[hit].split('\n')
                header = remove_char(seq[0].split()[0]).replace('>', '>best-hit_')
                out.write('\n'.join([header, seq[1].upper()]) + '\n')
        finally:
            # release the database even if a hit is missing or malformed
            id2seq.close()
    return combo

Example #2

0

Show file

File: ssufromHMM.py Project: liupfskygre/bioscripts27

def find_16S(fastas,
             hmms,
             bit_thresh=float(20),
             length_thresh=500,
             masking=True,
             buffer=0):
    """
    extract 16S sequences from fasta files based on hmm search results

    1) parse hmm output into dictionary (sequence must pass bit_thresh and inc == '!')
        seq2hmm[seq] = {model: [sstart, ssend, length, strand, score]}
    2) determine which model (archaea, bacteria, eukarya) the sequence most closely matches
        seq2hmm[seq] = [model, sstart, send, length, strand, score], [model2, sstart2, send2, length2, strand2, score2], ...]
    3) identify regions that match to 16S (for best model)
    4) mask internal regions that do not align to model
    5) length threshold applies to aligned regions of 16S sequence
    6) export 16S sequence based on complete gene (including masked insertions)

    steps 1-3 are delegated to find_coordinates(); this generator yields one
    [header, sequence] pair for every region that passes length_thresh
    """
    # start/stop positions per sequence id:
    # group2hmm[seq][group] = [model, strand, coordinates, matches, gaps]
    group2hmm = find_coordinates(hmms, bit_thresh)
    for fasta in fastas:
        for seq in parse_fasta(fasta):
            seq_id = seq[0].split('>')[1].split()[0]
            if seq_id not in group2hmm:
                continue
            seq[1] = seq[1].upper()
            # running number of 16S genes reported for this contig
            n_genes = 0
            for info in list(group2hmm[seq_id].values()):
                model, strand, coords, matches, gaps = info
                # bases that fall inside insertions (gaps)
                inserted = sum(gap[1] - gap[0] + 1 for gap in gaps)
                # length of the hit with and without insertions (buffer excluded)
                total_len = coords[1] - coords[0] + 1
                aligned_len = total_len - inserted
                if aligned_len < length_thresh:
                    continue
                n_genes += 1
                # widen the retrieval window by buffer, clipped to the contig ends
                contig_len = len(seq[1])
                ret_coords = [
                    max(coords[0] - buffer, 1),
                    min(coords[1] + buffer, contig_len),
                    coords[2],
                ]
                buffer_ends = check_buffer(coords, contig_len, buffer)
                # mask insertion sequences (the masked contig is kept for later groups)
                if masking is True:
                    seq[1] = mask_sequence(seq[1], gaps)
                region = seq[1][ret_coords[0] - 1:ret_coords[1]]
                # pair every model match with the insertion that follows it
                inserts = [gap[1] - gap[0] + 1 for gap in gaps] + ['end']
                model_pos = ';'.join(
                    '%s-%s(%s)' % (match[2], match[3], insert)
                    for match, insert in zip(matches, inserts)
                )
                header = '%s 16SfromHMM::model=%s seq=%s pos=%s-%s strand=%s total-len=%s 16S-len=%s model-pos(ins-len)=%s buffer-len=%s/%s ins-bases=%s' % (
                    seq[0], model, n_genes, ret_coords[0], ret_coords[1],
                    strand, total_len, aligned_len, model_pos,
                    buffer_ends[0], buffer_ends[1], inserted)
                # reverse complement if strand is reverse
                if strand == '-':
                    region = rc(['', region])[1]
                yield [header, region]

Example #3

0

Show file

File: blast2desc.py Project: liupfskygre/bioscripts27

def headerid2desc(fasta, subset=False):
    """
    map sequence ids to descriptions for the sequences in a fasta file

    fasta: fasta input handed to parse_fasta()
    subset: False to include every sequence, otherwise a container of
        ids; only sequences whose id is in subset are included

    returns {id: '<full header> len:<sequence length>'}, where id is the
    first whitespace-delimited word of the header (without '>')
    """
    db = {}
    for seq in parse_fasta(fasta):
        header = seq[0].split('>')[1]
        seq_id = header.split()[0]
        # honor the subset filter (previously every sequence was stored
        # unconditionally, so subset had no effect)
        if subset is not False and seq_id not in subset:
            continue
        db[seq_id] = '%s len:%s' % (header, len(seq[1]))
    return db

Example #4

0

Show file

File: strip_masked.py Project: liupfskygre/bioscripts27

def strip_masked(fasta, min_len, print_masked):
    """
    remove masked regions from fasta file as long as
    they are longer than min_len

    yields [0, [header, sequence]] for each sequence with the masked
    regions removed and, when print_masked is True, [1, [header, sequence]]
    for each non-empty removed region (numbered from 1 per sequence)
    """
    for record in parse_fasta(fasta):
        kept, masked = parse_masked(record, min_len)
        stripped_header = '%s removed_masked >=%s' % (record[0], min_len)
        yield [0, [stripped_header, ''.join(kept)]]
        if print_masked is not True:
            continue
        # empty regions are dropped before numbering the insertions
        regions = [region for region in masked if region != []]
        for number, region in enumerate(regions, 1):
            insertion_header = '%s insertion:%s' % (record[0], number)
            yield [1, [insertion_header, ''.join(region)]]

Example #5

0

Show file

File: search.py Project: liupfskygre/bioscripts27

def check_type(fasta):
    """
    guess whether a fasta file holds nucleotide or protein sequences

    only the first sequence is inspected, and only its first residue
    that is not a junk character: 'prot' is returned if that residue is
    not A, T, G or C, otherwise 'nucl' (also when there is no sequence
    or the first sequence contains nothing but junk characters)
    """
    nucleotides = ('A', 'T', 'G', 'C')
    ignored = ('N', 'U', '.', '-', ' ')
    for record in parse_fasta(fasta):
        for residue in record[1].upper():
            if residue in ignored:
                continue
            # the first informative residue decides the type
            if residue in nucleotides:
                return 'nucl'
            return 'prot'
        # nothing informative in the first sequence; later ones are not checked
        return 'nucl'
    return 'nucl'

Example #6

0

Show file

def _kegg_rows(kegg, id, kos):
	"""
	yield one tab-delimited line (leading tab, id, ko, function) for every
	function stored under each ko in kegg, or 'n/a' as the function when
	the ko is not in kegg
	"""
	for k in kos:
		if k in kegg:
			for function in kegg[k].split('|'):
				# - id - k - function
				yield '\t%s\t%s\t%s' % (id, k, function)
		else:
			yield '\t%s\t%s\tn/a' % (id, k)


def ko2kegg(file, option, file_type):
	"""
	look up kegg functions for the ko identifiers found in file

	file: fasta input (file_type 'fasta'), iterable of lines (file_type
		'list'), or anything find_ko() accepts (any other file_type)
	option: passed to option2kegg() to select the database to open
	file_type: 'fasta', 'list', or anything else

	for 'fasta' and 'list' input, yields the header/line itself followed
	by one tab-delimited line per ko and function; for any other
	file_type, yields [ko, function] lists ('n/a' when the ko is unknown)
	"""
	tch = option2kegg(option)
	kegg = hash.Hash()
	kegg.open(tch)
	try:
		if file_type == 'fasta':
			for sequence in parse_fasta(file):
				header = sequence[0].split('>')[1]
				fields = header.split()
				seq_id = fields[0]
				yield header
				for row in _kegg_rows(kegg, seq_id, set(find_ko(fields))):
					yield row

		elif file_type == 'list':
			for line in file:
				line = line.strip()
				fields = line.split()
				# skip blank lines
				if not fields:
					continue
				line_id = fields[0]
				yield line
				for row in _kegg_rows(kegg, line_id, set(find_ko(fields))):
					yield row

		else:
			for k in set(find_ko(file)):
				if k in kegg:
					for function in kegg[k].split('|'):
						yield [k, function]
				else:
					yield [k, 'n/a']
	finally:
		# runs when the generator is exhausted, closed early, or fails,
		# so the database handle is always released
		kegg.close()

Example #7

0

Show file

def de_rep(fastas, append_index, return_original = False):
    """
    de-replicate fastas based on sequence names

    the first sequence seen with a given id is yielded unchanged; later
    sequences with the same id are skipped, unless append_index is True,
    in which case they are yielded under a new id from append_index_id()

    yields the sequence ([header, sequence]) or, when return_original is
    True, [original header split on whitespace, sequence]
    """
    seen = []
    for fasta in fastas:
        for record in parse_fasta(fasta):
            fields = record[0].split('>')[1].split()
            name = fields[0]
            if name not in seen:
                # first occurrence: pass the record through untouched
                seen.append(name)
                unique = record
            elif append_index == True:
                # repeated id: rename it and keep the rest of the header
                renamed, seen = append_index_id(name, seen)
                unique = ['>%s %s' % (renamed, ' '.join(fields[1:])), record[1]]
            else:
                # repeated id and no renaming requested: drop the record
                continue
            if return_original is True:
                yield [fields, unique]
            else:
                yield unique