def scan_sequences(self, seq_file, threshold=0., only_best=False):
    """
    Scan every sequence of a fasta file with the TFFM and stream the hits.

    :arg seq_file: Fasta file holding the DNA sequences to scan.
    :type seq_file: str
    :arg threshold: Minimal probability value for a position to be
        reported as a TFBS hit (default: 0.0).
    :type threshold: float
    :arg only_best: Set to :class:`True` to report only the best TFBS hit
        of each sequence (default: :class:`False`).
    :type only_best: bool

    :returns: The TFBS hits, produced lazily, sequence after sequence.
    :rtype: :class:`Generator` of :class:`HIT`

    :note: (**0.0<=** *threshold* **<=1.0**)

    """
    # Delegate the per-sequence work to scan_sequence and flatten its output
    # into a single stream of hits.
    for record in utils.parse_fasta(seq_file):
        for tfbs_hit in self.scan_sequence(record, threshold, only_best):
            yield tfbs_hit
def pocc_sequences(self, seq_file, threshold=0.):
    """
    Apply the TFFM on the fasta sequences and return the Pocc value
    (probability of occupancy) for each sequence.

    The Pocc of a sequence is the probability that at least one of its
    predicted sites is bound: ``1 - prod(1 - score_i)`` over the hits of
    that sequence. A sequence without any hit therefore gets a Pocc of 0.

    :arg seq_file: Fasta file giving the DNA sequences to apply the TFFM on.
    :type seq_file: str
    :arg threshold: The threshold used to predict hits that will be used to
        compute the Pocc (default: 0.0).
    :type threshold: float

    :returns: One hit per sequence, spanning positions 1 to the sequence
        length and carrying the Pocc as its score, through a generator.
    :rtype: :class:`Generator` of :class:`HIT`

    :note: (**0.0<=** *threshold* **<=1.0**)

    """
    sequence_list = utils.parse_fasta(seq_file)
    for seq_record in sequence_list:
        # Running probability that *none* of the predicted sites is occupied.
        unoccupied = 1.
        for hit in self.scan_sequence(seq_record, threshold, False):
            # Falsy entries are skipped (presumably positions without a
            # hit -- verify against scan_sequence).
            if hit:
                unoccupied *= (1. - hit.score)
        # Occupancy is the complement of "no site is occupied"; yielding the
        # raw product would report the probability of non-occupancy.
        yield hit_module.HIT(seq_record, 1, len(seq_record), None,
                             1. - unoccupied, self, None)
def _genbank_description(record, species, genus):
    """Format one parsed GenBank XML record as a comma-separated description.

    The result is ``species,genus,definition,country,collection_date``.
    Spaces in the definition, country and collection date are replaced by
    underscores; a missing country or collection date is reported as ``--``.
    """
    country = collectionDate = '--'
    # Only the qualifiers of the first entry of the feature table are read.
    for feature in record['GBSeq_feature-table'][0]['GBFeature_quals']:
        if feature['GBQualifier_name'] == 'country':
            country = feature['GBQualifier_value']
        if feature['GBQualifier_name'] == 'collection_date':
            collectionDate = feature['GBQualifier_value']
    return f"{species},{genus},{record['GBSeq_definition'].replace(' ','_')},{country.replace(' ','_')},{collectionDate.replace(' ','_')}"


def retrieve_taxonomy(prefix):
    """Describe every GenBank accession of the input fasta via NCBI Entrez.

    Headers of the module-level ``sequenceFile`` that start with a GenBank
    accession pattern are linked to the taxonomy database, the linked
    taxonomy records are fetched, and the nuccore record of each accession
    is downloaded to extract its definition, country and collection date.

    :param prefix: not used by this function.
    :returns: dict mapping accession (without version suffix) to the string
        ``species,genus,definition,country,collection_date``.

    NOTE(review): the elink request is retried forever on ``RuntimeError``,
    and the nuccore fetch is repeated until every accession known to
    ``accID2species`` has been described; both loops have no upper bound.
    """
    Entrez.email = "*****@*****.**"
    # Two letters + 8 digits, two letters + 6 digits, one letter + 5 digits,
    # or NC_ + 6 digits. ``match`` only anchors at the start of the header.
    genbankACCRegex = re.compile(r'([A-Z]{2}[0-9]{8}|[A-Z]{2}[0-9]{6}|[A-Z][0-9]{5}|NC_[0-9]{6})')
    accessionIDs = [header for header, sequence in utils.parse_fasta(sequenceFile)]
    validIDs = list(filter(genbankACCRegex.match, accessionIDs))

    while True:
        try:
            handle = Entrez.elink(dbfrom='nuccore', db='taxonomy', id=validIDs, idtype='acc', rettype='xml')
            record = Entrez.read(handle)
            handle.close()
        except RuntimeError:
            # Back off before retrying the request.
            time.sleep(5)
            continue
        break

    # Accession (version stripped) -> taxonomy id; "XXXXX" marks accessions
    # for which no taxonomy link was returned.
    taxIDs = {
        str(x['IdList'][0]).split('.')[0]:
            x['LinkSetDb'][0]['Link'][0]['Id'] if x['LinkSetDb'] else "XXXXX"
        for x in record
    }
    values = list(set(taxIDs.values()))
    target_handle = Entrez.efetch(db='taxonomy', id=','.join(values), retmode='xml')
    target_record = Entrez.read(target_handle)
    target_handle.close()

    accID2species = get_clade_by_rank('species', taxIDs, target_record)
    accID2genus = get_clade_by_rank('genus', taxIDs, target_record)

    accID2GBdescription = {}
    # A single efetch may not return every requested record, so keep asking
    # for the accessions that are still missing. Starting from the full list
    # also avoids sending an empty id list when there is nothing to fetch.
    remainder = list(accID2species.keys())
    while remainder:
        nuccore_handle = Entrez.efetch(db='nuccore', id=','.join(remainder), rettype='gb', retmode='xml')
        try:
            nuccore_records = Entrez.read(nuccore_handle)
        finally:
            # Release the connection even when parsing fails.
            nuccore_handle.close()
        for gb_record in nuccore_records:
            accID = gb_record['GBSeq_primary-accession'].split('.')[0]
            accID2GBdescription[accID] = _genbank_description(
                gb_record, accID2species[accID], accID2genus[accID])
        remainder = [acc for acc in accID2species if acc not in accID2GBdescription]
        if remainder:
            # Pause between consecutive requests.
            time.sleep(1)
    return accID2GBdescription
def split_assembled_genome(gtf_path, fasta_path, od='.', L=98):
    """Write spliced transcripts and splice-junction windows as FASTA.

    For every transcript returned by ``parse_gtf`` whose scaffold is present
    in the FASTA file, two files inside ``od`` are appended to:

    * ``processed_transcripts.fasta`` -- the concatenated exon sequence,
      wrapped at 80 characters per line;
    * ``splice_junctions.fasta`` -- one record ``>{transcript}:{index}`` per
      junction window taken from the exons extended by flanking sequence.

    Sequences are passed through ``collapse_N``, upper-cased, and
    reverse-complemented for transcripts on the ``-`` strand. Transcripts
    whose scaffold is missing are reported and counted, then skipped.

    :param gtf_path: path handed to ``parse_gtf``.
    :param fasta_path: path handed to ``parse_fasta``.
    :param od: output directory (default: current directory).
    :param L: length parameter controlling flank and window sizes.

    NOTE(review): exon ends are stored as ``end - 1`` and then used as an
    exclusive slice bound; if the GTF coordinates are 1-based inclusive this
    drops the last base of every exon -- confirm against ``merge_intervals``.
    """
    trs = parse_gtf(gtf_path)
    print('GTF parsed')
    scaffolds = parse_fasta(fasta_path)
    print('Scaffolds parsed')
    wrong_scaffolds = 0
    for tr, data in trs.items():
        if data['scaffold'] not in scaffolds:
            print(f'{data["scaffold"]} not in FASTA file {fasta_path}')
            wrong_scaffolds += 1
            continue
        sequence = scaffolds[data['scaffold']]

        processed = []
        unprocessed = []
        for exon in data['exons']:
            start = int(exon['start'])
            end = int(exon['end'])
            processed.append((start - 1, end - 1))
            # Clamp the left flank at the scaffold start: a negative slice
            # start would wrap around to the end of the scaffold and yield
            # an empty or wrong junction sequence.
            unprocessed.append((max(0, start - L), end + L - 2))
        processed = list(merge_intervals(processed))
        unprocessed = merge_intervals(unprocessed)

        processed = ''.join(sequence[iv[0]:iv[1]] for iv in processed)

        splice_junctions = []
        for iv in unprocessed:
            if iv[1] - iv[0] < 3 * L - 3:
                # If the length of the exon is < L-1, keep the flanked exon
                # as a single window.
                splice_junctions.append(iv)
            else:
                # Otherwise emit one window of 2L-2 bases at each end.
                splice_junctions.append((iv[0], iv[0] + 2 * L - 2))
                splice_junctions.append((iv[1] - (2 * L) + 2, iv[1]))
        splice_junctions = [
            collapse_N(sequence[iv[0]:iv[1]].upper())
            for iv in splice_junctions
        ]
        processed = collapse_N(processed).upper()

        if data['strand'] == '-':
            processed = reverse_complement(processed)
            splice_junctions = map(reverse_complement, splice_junctions)

        with open(f'{od}/processed_transcripts.fasta', 'a') as fh:
            fh.write(f'>{tr}\n')
            fh.write('\n'.join(
                [processed[i:i + 80] for i in range(0, len(processed), 80)]))
            fh.write('\n')

        with open(f'{od}/splice_junctions.fasta', 'a') as fh:
            for i, sj in enumerate(splice_junctions):
                fh.write(f'>{tr}:{i}\n')
                fh.write(f'{sj}\n')

    print('DONE!')
    print(
        f'{wrong_scaffolds} scaffolds were not found, and the corresponding annotations were ignored.'
    )
def calculate_gc_percentage(data):
    """Report the record with the highest GC percentage.

    ``data`` is handed to ``parse_fasta``, whose result is iterated as an
    id -> sequence mapping; each sequence is scored with ``gc_percentage``.

    Returns the id of the top-scoring record, a newline, and its score
    converted with ``str``.
    """
    records = parse_fasta(data)
    gc_by_id = {name: gc_percentage(seq) for name, seq in records.items()}
    best_id = max(gc_by_id, key=gc_by_id.get)
    return best_id + '\n' + str(gc_by_id[best_id])
def overlap_graph(data):
    """List the directed overlap edges between FASTA records.

    An edge ``a -> b`` exists when the last three characters of record
    ``a`` equal the first three characters of a different record ``b``.

    Returns one ``"<a> <b>"`` line per edge, each terminated by a newline,
    in the iteration order of the mapping returned by ``parse_fasta``.
    """
    records = parse_fasta(data)
    edge_lines = []
    for src_id, src_seq in records.items():
        suffix = src_seq[-3:]
        for dst_id, dst_seq in records.items():
            # A record never overlaps with itself.
            if src_id != dst_id and suffix == dst_seq[:3]:
                edge_lines.append('{0} {1}\n'.format(src_id, dst_id))
    return ''.join(edge_lines)
def reseed_chrom(organism, accession):
    """Download a nuccore FASTA record and rebuild its sequence pages.

    The record for ``accession`` is fetched from NCBI efetch as text, parsed
    with ``parse_fasta``, and its ``'body'`` is handed to a
    ``SequencePageBuilder`` after the pages previously stored for the
    accession have been purged.

    :param organism: scientific name, used to look up the taxon id in
        ``tax_collect``; an empty taxon id is used when no taxon is found.
    :param accession: nuccore accession to fetch and store.
    """
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=" + accession + "&rettype=fasta&retmode=text"
    # Close the HTTP response as soon as the payload has been read.
    with urllib.request.urlopen(url) as resource:
        content = resource.read().decode("utf-8")
    fasta = parse_fasta(content)

    taxon = tax_collect.find_one({'scientific name': organism})
    taxon_id = taxon['id'] if taxon else ""

    builder = SequencePageBuilder(seq_collect, organism, taxon_id, "", 10000)
    builder.purge(accession)
    builder.process(accession, fasta['body'], False)
def get_fasta_seqs(ids):
    """Get fasta sequences from protein IDs on uniprot.

    Args:
        ids: list of protein IDs

    Return:
        a dict of fasta sequences, with key: protein id and value: sequence
        (the first sequence of the downloaded fasta file)

    Raises:
        requests.HTTPError: if uniprot answers with an error status, instead
            of trying to parse the error page as fasta.
    """
    url_template = "http://www.uniprot.org/uniprot/{}.fasta"
    fasta_seqs = {}
    for protein_id in ids:
        response = requests.get(url_template.format(protein_id))
        response.raise_for_status()
        # splitlines() drops the trailing empty line without losing the last
        # sequence line when the body has no final newline.
        data = response.text.splitlines()
        fasta_sequence = parse_fasta(data)
        fasta_seqs[protein_id] = next(iter(fasta_sequence.values()))
    return fasta_seqs
def consensus_and_profile(data):
    """Compute the consensus string and the profile matrix of FASTA records.

    The profile counts, for every position, how often each of ``A``, ``C``,
    ``G`` and ``T`` occurs across the sequences; other symbols are ignored.
    The profile length is taken from the first sequence. The consensus takes
    the most frequent base per position, ties being resolved in the order
    ``A``, ``T``, ``C``, ``G``.

    Returns the consensus on the first line, followed by four lines
    ``A: ...``, ``C: ...``, ``G: ...``, ``T: ...`` holding the
    space-separated counts, each terminated by a newline.

    Raises:
        ValueError: if ``parse_fasta`` returns no sequence.
    """
    gene_dict = parse_fasta(data)
    if not gene_dict:
        raise ValueError('consensus_and_profile requires at least one sequence')

    length = len(next(iter(gene_dict.values())))
    # Key order matters: it is the tie-break order used by max() below.
    profile = {
        'A': [0] * length,
        'T': [0] * length,
        'C': [0] * length,
        'G': [0] * length,
    }
    for seq in gene_dict.values():
        for i, base in enumerate(seq):
            if base in profile:
                profile[base][i] += 1

    profile_str = ''.join(
        '{0}: {1}\n'.format(base, ' '.join(str(n) for n in profile[base]))
        for base in 'ACGT')
    consensus = [
        max(profile, key=lambda base: profile[base][i])
        for i in range(length)
    ]
    return ''.join(consensus) + '\n' + profile_str
outputStream.write(f"{acc},--,--,--,--,--\n") outputStream.write(f"####################\n\n") return (avgClusterPerSpecies, avgClusterPerGenus) ######################################################################### args = docopt(__doc__) treeFile = args['<treeFile>'] seqFile = args['<seqFile>'] clusterFile = args['<clusterFile>'] NCBI = bool(args['--ncbi']) PREFIX = args['--toolName'] allSequences = {header: seq for header, seq in utils.parse_fasta(seqFile)} cluster, centroids, failbob = utils.parse_clusterFile(clusterFile) if not failbob: failbob = [ cluster for idx, cluster in cluster.items() if len(cluster) == 1 ] #realCluster = {idx : cluster for idx,cluster in cluster.items() if len(cluster) != 1 and idx != '-1'} realCluster = { idx: cluster for idx, cluster in cluster.items() if cluster not in failbob } if treeFile: tree = dendropy.Tree.get(path=treeFile, schema='newick') dm = tree.phylogenetic_distance_matrix()
# Author: Kevin Lamkiewicz # Email: [email protected] """ """ import sys from collections import defaultdict import utils inputFile = sys.argv[1] fastaFile = sys.argv[2] if len(sys.argv) == 4: goiFile = sys.argv[3] goiHeader = [header for header, _ in utils.parse_fasta(goiFile)] outputFile = f"{sys.argv[1]}.clstr" clusterInfo = defaultdict(list) fastaContent = {header: seq for header, seq in utils.parse_fasta(fastaFile)} with open(inputFile, 'r') as inputStream: centroids = set() for line in inputStream: currentArray = line.split() centroid = currentArray[0] sequence = currentArray[1] centroids.add(centroid) clusterInfo[centroid].append(sequence)
'TCT': 'S', 'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L', 'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*', 'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W', } for header, sequence in utils.parse_fasta(sequenceFile): positiveStrand = "" longestCDS = 0 strands = [sequence, utils.reverseComplement(sequence)] for strand in strands: for frame in range(3): proteinSequence = "" for fragment in range(frame, len(strand), 3): codon = strand[fragment:fragment + 3] if len(codon) != 3: continue try: proteinSequence += codon2aminoacid[codon] except KeyError: proteinSequence += 'X' matches = regex_orf.findall(proteinSequence)
def main():
    """Print the label and GC percentage of the GC-richest FASTA record.

    The FASTA file path is read from ``sys.argv[1]``. The output is the
    first element returned by ``max_gc_content`` on one line, and the second
    element multiplied by 100 and followed by ``%`` on the next line.
    """
    with open(sys.argv[1]) as f:
        fastas = utils.parse_fasta(f)
        # Evaluate while the file is still open, in case parse_fasta reads
        # lazily.
        m = max_gc_content(fastas)
    # Call form of print: valid on both Python 2 and Python 3.
    print("{}\n{}%".format(m[0], m[1] * 100))
# Author: Kevin Lamkiewicz # Email: [email protected] """ This script mainly exists, because the output of cd-hit-est sucks :) """ import sys import utils inputFile = sys.argv[1] sequenceFile = sys.argv[2] if len(sys.argv) == 4: goiFile = sys.argv[3] goiHeader = [header for header, _ in utils.parse_fasta(goiFile)] originalHeader = sorted( [header for header, _ in utils.parse_fasta(sequenceFile)]) truncatedHeader = [] with open(inputFile, 'r') as inputStream: for line in inputStream: if line.startswith('>'): continue truncatedHeader.append(line.strip().split('>')[1].split(' ')[0]) truncatedHeader = sorted(truncatedHeader) assert (len(truncatedHeader) == len(originalHeader)) headerMapping = {x: y for x, y in zip(truncatedHeader, originalHeader)} assert (len(headerMapping) == len(originalHeader))
type=str, help='File containing the PSSM profile.') group.add_argument('--fasta', type=str, help='File containing the FASTA sequence.') parser.add_argument('filename_model', type=str, help='File containing the GOR model.') args = parser.parse_args() model = np.load(args.filename_model)['model'].item() if args.pssm: profile = parse_pssm(args.pssm) else: profile = seq_to_profile(parse_fasta(args.fasta)) dssp = '' for i in range(0, len(profile)): half_window_size = int((len(model['-']) - 1) / 2) score_H, score_E, score_C = 0, 0, 0 for j in range(max(0, i - half_window_size), min(i + half_window_size + 1, len(profile))): for k in range(0, len(profile[i])): score_H += profile[j][k] * model['H'][j - i + half_window_size][k] score_E += profile[j][k] * model['E'][j - i + half_window_size][k] score_C += profile[j][k] * model['-'][j - i + half_window_size][k]
def test_definition_case(self):
    """The first record of the sample dataset has a GC content of ~0.6092."""
    records = list(utils.parse_fasta(self.SAMPLE_DATASET))
    first_record = records[0]
    observed = gc_content(first_record)
    self.assertAlmostEqual(observed, .60919540)
""" Args: sequences: dict of sequences Returns: a dict with key: sequence name and value: GC content """ res = {} for name in sequences: sequence = sequences[name] gc_content = (sequence.count("G") + sequence.count("C")) / len(sequence) * 100 res[name] = round(gc_content, 4) return res if __name__ == "__main__": path = "../data/GC.txt" with open(path) as f: data = [s.strip() for s in f.readlines()] sequences = parse_fasta(data) gc_contents = compute_GC(sequences) max_key = max(gc_contents, key=gc_contents.get) print(max_key) print(gc_contents[max_key])
help='File containing the FASTA sequence.') parser.add_argument( '--probs', action='store_true', help='Output class probabilities together with the prediction.') parser.add_argument('filename_model', type=str, help='File containing the model.') args = parser.parse_args() model = pickle.load(open(args.filename_model, 'rb')) if args.pssm: profile = np.array(parse_pssm(args.pssm)) else: profile = np.array(seq_to_profile(parse_fasta(args.fasta))) dssp = '' for i in range(0, len(profile)): half_window_size = int((len(model.support_vectors_[0]) / 20 - 1) / 2) part1 = np.zeros(20 * max(0, half_window_size - i)) part2 = np.ndarray.flatten( profile[max(0, i - half_window_size):min(i + half_window_size + 1, len(profile))]) part3 = np.zeros(20 * max(0, half_window_size - (len(profile) - i - 1))) vec = np.concatenate((part1, part2, part3)) if args.probs: probs = model.predict_proba(vec.reshape(1, -1))[0] dssp += ' '.join([str(round(prob, 3)).rjust(5) for prob in probs])
Returns: a list of indexes of where the motif is within sequence """ motif_index = 0 indexes = [] for i, aa in enumerate(sequence): if aa == motif[motif_index]: indexes.append(i + 1) motif_index += 1 if motif_index == len(motif): break return indexes if __name__ == "__main__": path = "../data/SSEQ.txt" with open(path) as f: data = [s.strip() for s in f.readlines()] fasta_seqs = parse_fasta(data) _, sequence = max(fasta_seqs.items(), key=lambda x: len(x[1])) _, motif = min(fasta_seqs.items(), key=lambda x: len(x[1])) indexes = find_spliced_motif(sequence, motif) print(" ".join(map(str, indexes)))
def test_definition_case(self):
    """max_gc_content picks Rosalind_0808 (~0.6092) from the sample dataset."""
    fasta_list = utils.parse_fasta(self.SAMPLE_DATASET)
    label, gcc = max_gc_content(fasta_list)
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(label, 'Rosalind_0808')
    self.assertAlmostEqual(gcc, .60919540)