def rename_mfannot_proteome(fastas, ids_file, column_id=-1, column_name=-3, headermode=1):
    """Rewrite each FASTA file under a name built from its species name and ID.

    Args:
        fastas: Iterable of FASTA file paths; each is passed to ``fasta.parse``.
        ids_file: Path to a comma-separated file that maps IDs to names.
        column_id: Index of the column in ``ids_file`` that holds the ID.
        column_name: Index of the column in ``ids_file`` that holds the name.
        headermode: Forwarded to ``fasta.set_header`` as ``mode``.

    For every input file a ``<spec_name>_<spec_id>.fasta`` file (with any "/"
    removed from the name) is written, one ``>header`` / sequence pair per
    record returned by ``fasta.set_header``.
    """
    names = {}
    with open(ids_file, 'r') as idsinput:
        for line in idsinput:
            fields = line.strip().split(',')
            if fields == ['']:
                # A blank line (typically a trailing newline) carries no
                # record; indexing it with column_name raised IndexError.
                continue
            record_id = fields[column_id].strip()
            # An empty name column is recorded explicitly as UNKNOWN.
            names[record_id] = fields[column_name].strip() or "UNKNOWN"

    for fas in fastas:
        fasta_stream = fasta.parse(fas)
        # update fasta headers with new names
        fasta_stream, spec_name, spec_id = fasta.set_header(
            fasta_stream, names=names, mode=headermode)
        new_filename = (spec_name + "_" + spec_id + ".fasta").replace("/", "")
        with open(new_filename, "w") as out:
            print("Generating", new_filename)
            for head, seq in fasta_stream.items():
                out.write(">" + head + "\n" + seq + "\n")
def split_clusters(fastas, clusters, ids=None, filter=0.1, with_treshold=False):
    """Write one ``OG<number>.fa`` file per cluster with its member sequences.

    Args:
        fastas: List of FASTA file paths; all records are pooled.
        clusters: Passed to ``load_clusters``; the result is used as a mapping
            from cluster name to a collection of member keys.
        ids: Optional argument forwarded to ``add_name_from_nc_id``.
        filter: Fraction of the number of FASTA files that may be missing
            from a cluster before it is dropped (only with ``with_treshold``).
        with_treshold: When true, clusters whose size is not greater than
            ``len(fastas) - len(fastas) * filter`` are skipped.

    A record is written to a cluster file when the member key is a substring
    of the record's header.
    """
    clusts = load_clusters(clusters)
    print("Number of species fasta files =", len(fastas),
          "\nNumber of clusters =", len(clusts))

    if ids:
        # NOTE(review): the returned ID/name mapping is never used here; the
        # call is kept in case callers rely on it validating `ids`.
        add_name_from_nc_id(ids)

    all_fastas = {}
    for fas in fastas:
        all_fastas.update(fasta.parse(fas))

    # The cut-off used to be hard-coded to 0.1, silently ignoring `filter`.
    min_size = len(fastas) - len(fastas) * filter
    total = len(clusts)

    for i, (clust, members) in enumerate(clusts.items()):
        clustname = "OG" + format(int(clust.split("_")[-1]), '06d')
        print("[", round(i / total * 100), "% ]", "Cluster", clustname,
              "size =", len(members))
        if with_treshold and not len(members) > min_size:
            continue
        with open(clustname + '.fa', 'w') as clust_out:
            for key in members:
                # Substring match: a member key may be only part of a header.
                for fasta_key, seq in all_fastas.items():
                    if key in fasta_key:
                        clust_out.write(">" + fasta_key + "\n" + seq + "\n")
def detect_bad_genomes(fastas):
    """Report species that carry the same gene name more than once.

    Headers are split on "_": the first field is the gene name. The species
    ID is the second field, or the third field when the second is a single
    character.

    Args:
        fastas: Iterable of FASTA file paths; each is passed to ``fasta.parse``.

    Returns:
        The list of species IDs that have at least one duplicated gene, in
        order of detection. The same list, its length and the total number of
        species are also printed.
    """
    from collections import Counter

    genes = {}
    for fas in fastas:
        for gene in fasta.parse(fas):
            fields = gene.split('_')
            species_id = fields[2] if len(fields[1]) == 1 else fields[1]
            print(species_id, fields)
            genes.setdefault(fields[0], []).append(species_id)

    duplicates = []
    species_list = []
    flagged = set()
    known = set()
    for species_ids in genes.values():
        # Counter keeps first-occurrence order, so the reporting order is
        # the same as scanning the list itself.
        for species, count in Counter(species_ids).items():
            if species not in known:
                known.add(species)
                species_list.append(species)
            if count > 1 and species not in flagged:
                flagged.add(species)
                duplicates.append(species)

    print(duplicates, len(duplicates), len(species_list))
    return duplicates
def main():
    """Print the consensus string and profile matrix of the FASTA input.

    Input lines come from ``fileinput.input()`` (files named on the command
    line, or stdin) and are handed to ``fasta.parse`` with trailing whitespace
    stripped.
    """
    lines = [line.rstrip() for line in fileinput.input()]
    fastas = fasta.parse(lines)
    profile_matrix = profile.profile_matrix(fastas)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(consensus_string(profile_matrix))
    profile.pretty_print(profile_matrix)
def main():
    """Print the record with the highest GC content, as a percentage.

    Input lines come from ``fileinput.input()``. ``gc_content`` is used as a
    mapping from record name to GC fraction. Nothing is printed when that
    mapping is empty.
    """
    lines = [line.rstrip() for line in fileinput.input()]
    fastas = fasta.parse(lines)

    contents = gc_content(fastas)
    if not contents:
        # Nothing to report; indexing the empty ranking raised IndexError.
        return

    # max() returns the first maximal item, the same record a stable
    # descending sort would place first.
    name, ratio = max(contents.items(), key=lambda item: item[1])
    print(name + ' ' + str(ratio * 100))
def get_input(self):
    '''Open a single .seq, .fasta, .fastq, .scf or .ab1 file and set variables accordingly.

    Reads ``self.filepath``, sets ``self.name`` (the file name without its
    directory) and ``self.input_type`` (upper-cased extension, or None when
    there is none), then forwards the parsed data to ``SetOrientation``,
    ``SetDNA`` and, where available, ``SetQualVal`` / ``SetTrace``.

    Raises:
        TypeError: if the file name (before the extension) does not end in
            "fw" or "rv", or if the extension is not a recognised type.
    '''
    filename = self.filepath.split('/')[-1]
    self.name = filename

    # establish type of input file
    if '.' in filename:
        stem, extension = filename.rsplit('.', 1)
        self.input_type = extension.upper()
    else:
        stem = filename
        self.input_type = None

    # establish orientation of DNA from the last two characters before the
    # extension (the old check looked at the extension itself)
    orientation = stem[-2:].upper()
    if orientation == 'FW':
        self.SetOrientation('fw')
    elif orientation == 'RV':
        self.SetOrientation('rv')
    else:
        raise TypeError('The last two characters of the filename (before the .) must specify whether the sequence is fw or rv. Pleace rename file %s accordingly' % filename)

    # read the input
    if self.input_type in ('TXT', 'SEQ', None):
        with open(self.filepath, 'r') as handle:
            dna = handle.read()
        # TODO: assert that only DNA bases are present
        self.SetDNA(dna.replace('\n', ''))
    elif self.input_type in ('AB1', 'ABI'):
        ab1 = ABIreader.Trace(self.filepath, trimming=False)  # optionally trimming=True
        self.SetDNA(ab1.seq)
        self.SetQualVal(ab1.qual_val)
        # NOTE(review): the original read these channels from an undefined
        # name `AB1Trace`; the trace object just created is assumed to be
        # what was meant - confirm against ABIreader.
        self.SetTrace([ab1.data['raw1'], ab1.data['raw2'],
                       ab1.data['raw3'], ab1.data['raw4']])
    elif self.input_type == 'ABIF':
        print('Support for .abif files has not yet been implemented')
    elif self.input_type == 'ZTR':
        print('Support for .ztr files has not yet been implemented')
    elif self.input_type == 'SCF':
        print('Support for .scf files has not yet been implemented')
    elif self.input_type == 'FASTA':
        # File should contain ONE entry
        _id, dna = fasta.parse(self.filepath)
        self.SetDNA(dna)
    elif self.input_type == 'FASTQ':
        # File should contain ONE entry
        _id, dna, _id2, qual_val = fastq.parse(self.filepath)
        self.SetDNA(dna)
        self.SetQualVal(qual_val)
    else:
        raise TypeError('"%s" is not a .txt, .seq, .scf, .fasta, .fastq, .abif, .abi or .ztr file' % filename)
def main():
    """Print the longest element shared by every record's ``permutation_set``.

    Input lines come from ``fileinput.input()``; each parsed record's
    ``content`` attribute is fed to ``permutation_set`` and the resulting
    sets are intersected. An empty line is printed when there is no input or
    nothing is shared.
    """
    lines = [line.rstrip() for line in fileinput.input()]
    strings = [record.content for record in fasta.parse(lines)]

    common = set()
    if strings:
        common = permutation_set(strings[0])
        for content in strings[1:]:
            common = common & permutation_set(content)

    # max() on an empty set raises ValueError, so fall back to "".
    print(max(common, key=len) if common else "")
def list_common_genes(fastas, gene_pos=0, id_position=-1, delimiter="~"):
    """Keep the species that carry the rarest gene and export their sequences.

    Headers are split on ``delimiter``; field ``gene_pos`` is the gene name
    and field ``id_position`` identifies the species. The species set is
    taken from the gene with the fewest entries (the last such gene if
    several tie).

    Outputs, written to the current directory:
        * ``<gene>.fasta`` for every gene, holding the records of the kept
          species only;
        * ``<id>.concat.fasta`` for every kept species, holding that species'
          kept sequences joined in gene-name order and wrapped at 70 columns.

    NOTE(review): the previous code never reset its accumulator, so each
    concat file also contained the sequences of the species processed before
    it. Concatenating per species is assumed to be the intent - confirm.
    """
    genes = {}
    all_fastas = {}
    for fas in fastas:
        fasta_stream = fasta.parse(fas)
        all_fastas.update(fasta_stream)
        for head in fasta_stream:
            genes.setdefault(head.split(delimiter)[gene_pos], []).append(head)

    smallest_subset = set()
    if genes:
        # Computed once instead of on every loop iteration.
        fewest = min(len(heads) for heads in genes.values())
        for heads in genes.values():
            if len(heads) == fewest:
                smallest_subset = {
                    head.split(delimiter)[id_position] for head in heads
                }

    kept_genes = {}
    kept_species = {}
    for gene, heads in genes.items():
        for head in heads:
            species_id = head.split(delimiter)[id_position]
            if species_id in smallest_subset:
                kept_genes.setdefault(gene, []).append(head)
                kept_species.setdefault(species_id, []).append((gene, head))

    for gene, heads in kept_genes.items():
        with open(gene + ".fasta", "w") as output:
            for header in heads:
                output.write(">" + header + "\n" + all_fastas[header] + "\n")

    # concat
    max_len = 70
    for entries in kept_species.values():
        # Sort by gene name so every species is concatenated in the same order.
        ordered = [head for _gene, head in sorted(entries)]
        concat = "".join(all_fastas[head] for head in ordered)
        fields = ordered[0].split(delimiter)
        with open(fields[-1] + ".concat.fasta", "w") as output:
            output.write(">" + delimiter.join(fields[-2:]) + "\n")
            for start in range(0, len(concat), max_len):
                output.write(concat[start:start + max_len] + "\n")
def remove_spurious_clusters(fastas, clusters, ids=None):
    """Keep only the records that belong to the largest cluster.

    Args:
        fastas: Iterable of FASTA file paths; each is passed to ``fasta.parse``.
        clusters: Passed to ``load_clusters``; the result is used as a mapping
            from cluster name to a collection of record keys.
        ids: Optional argument forwarded to ``add_name_from_nc_id``.

    The largest cluster is the one with the most distinct members. For every
    input file ``<fas>.fasta`` is written with the records whose header is a
    member of that cluster.
    """
    clusts = load_clusters(clusters)
    largest = max(clusts.values(), key=lambda members: len(set(members)))
    # Set membership keeps the per-record test constant-time.
    retained = set(largest)

    if ids:
        # NOTE(review): the returned ID/name mapping is never used here; the
        # call is kept in case callers rely on it validating `ids`.
        add_name_from_nc_id(ids)

    for fas in fastas:
        fasta_stream = fasta.parse(fas)
        with open(fas + '.fasta', 'w') as fas_out:
            for key in fasta_stream:
                print('_'.join(key.split('_')[-3:-1]))
                if key in retained:
                    fas_out.write(">" + key + '\n')
                    fas_out.write(fasta_stream[key] + '\n')
def ortho_intersect(fastas):
    """Write the headers each FASTA file shares with the files before it.

    Headers that do not already consist of three "~"-separated fields are
    rebuilt from their "_"-separated fields as
    ``first~second(with "-" -> "_")~rest``.

    For every file the headers also present in each earlier file are
    collected (once per earlier file that contains them). The 21 files with
    the most collected headers are written to ``GROUP_<rank>``.
    """
    group = {}
    all_fastas = {}
    # parse fastas and build all groups
    for k, fas in enumerate(fastas):
        group[k] = []
        for key, seq in fasta.parse(fas).items():
            if len(key.split("~")) != 3:
                fields = key.split("_")
                key = "~".join([
                    fields[0],
                    "_".join(fields[1].split("-")),
                    "_".join(fields[2:]),
                ])
            group[k].append(key)
            # Index by the normalised header: it is the one looked up when
            # writing, and the raw header raised KeyError there.
            all_fastas[key] = seq

    members = {k: set(heads) for k, heads in group.items()}
    new_group = {}
    for grp_a in group:
        shared = []
        for grp_b in group:
            if grp_b == grp_a:
                # Only files that come before grp_a are compared.
                break
            shared.extend(head for head in group[grp_a] if head in members[grp_b])
        new_group[grp_a] = shared
        print(round(grp_a / len(group) * 100), "%")

    print("-----COMP-----")
    ranked = sorted(new_group, key=lambda k: len(new_group[k]), reverse=True)
    for i, k in enumerate(ranked[:21]):
        print(k, len(new_group[k]))
        with open("GROUP_" + str(i), "w") as output:
            for entry in new_group[k]:
                output.write(">" + entry + "\n" + all_fastas[entry] + "\n")
def subset_seq_from_tax(keyword, taxonomy, fastas):
    """Keep only the sequences whose taxonomy line mentions a keyword.

    Called using the -selectgroup flag, e.g.:
        mtanalysis -selectgroup "sordariales" taxonomy.csv cox1.fasta

    Args:
        keyword: One keyword, several joined with ";", or a sequence of
            keywords. Matching is case-insensitive and by substring.
        taxonomy: Path to a comma-separated file; the last column of each
            line is the ID that links the line to a sequence.
        fastas: List of FASTA files to analyse.

    For every FASTA file ``<fas>.ret`` is written with the records whose
    last "~"-separated header field is one of the retained IDs.
    """
    if isinstance(keyword, str):
        keywds = keyword.split(";")
        if len(keywds) > 1:
            print(keywds)
    else:
        # A ready-made sequence of keywords is accepted as-is.
        keywds = list(keyword)
    needles = [word.lower() for word in keywds]

    taxonomy_retain = []
    with open(taxonomy, 'r') as taxo_f:
        for line in taxo_f:
            haystack = line.lower()
            retained = line.split(",")[-1].strip()
            # One entry per matching keyword, as before.
            taxonomy_retain.extend(
                retained for needle in needles if needle in haystack)
    print(taxonomy_retain)

    # Set lookup instead of scanning the list for every record.
    retained_ids = set(taxonomy_retain)
    for fas in fastas:
        fasta_stream = fasta.parse(fas)
        with open(fas + ".ret", "w") as output:
            for key, seq in fasta_stream.items():
                if key.split("~")[-1] in retained_ids:
                    output.write(">" + key + "\n" + seq + "\n")
def rename_from_id(ids_file, fastas, delimiter="~", id_indice=-1, species_name_pos=-3):
    """Replace the species field of every header using an ID lookup table.

    Args:
        ids_file: Path to a comma-separated file; column ``id_indice`` holds
            the ID and column ``species_name_pos`` the species name.
        fastas: Iterable of FASTA file paths. Each file is rewritten in place.
        delimiter: Separator between header fields.
        id_indice: Index of the ID, both in ``ids_file`` columns and in the
            header fields.
        species_name_pos: Index of the species-name column in ``ids_file``.

    The second header field is replaced by the species name with spaces
    turned into underscores.

    Raises:
        KeyError: if a header carries an ID missing from ``ids_file``. The
            FASTA file concerned is left untouched in that case.
    """
    names = {}
    with open(ids_file, "r") as idsinput:
        for line in idsinput:
            line = line.strip()
            if not line:
                # Blank lines hold no record and used to raise IndexError.
                continue
            columns = line.split(",")
            names[columns[id_indice].strip()] = columns[species_name_pos].strip()

    for fas in fastas:
        fasta_stream = fasta.parse(fas)
        # Build the whole output first: opening the file for writing
        # truncates it, so a lookup failure must happen before that.
        records = []
        for key, seq in fasta_stream.items():
            head_lst = key.split(delimiter)
            head_lst[1] = "_".join(names[head_lst[id_indice]].split(" "))
            records.append(">" + delimiter.join(head_lst) + "\n" + seq + "\n")
        with open(fas, "w") as output:
            output.writelines(records)
# Command-line driver: read sequences, build the graph, print its edges.
parser.add_argument('Reads', type=str, nargs='?', default='-',
                    help='Reference file or - for stdin')
parser.add_argument('-o', dest='output', action='store', default='-',
                    help='Output fasta or - for stdout')
parser.add_argument('--rev', action='store_true',
                    help='Consider reverse strip')
args = parser.parse_args()

# "-" selects the standard stream; only files opened here are closed here.
f = sys.stdin if args.Reads == '-' else open(args.Reads, 'rb')
try:
    g = sys.stdout if args.output == '-' else open(args.output, 'wb')
    try:
        F = [s for _, s in parse(f)]
        if args.rev:
            # TODO: Parse reverse strip fragments
            pass
        # One line per edge, members separated by a single space.
        for edge in multi_graph(F):
            g.write("%s\n" % " ".join([str(i) for i in edge]))
    finally:
        # Release the handles even if parsing or graph building fails.
        if args.output != '-':
            g.close()
finally:
    if args.Reads != '-':
        f.close()
# http://rosalind.info/problems/grph/
import fasta


def isConnected(s, t, k):
    """
    Returns true if string s has a suffix of length k
    equal to the prefix of t of length k.

    Strings shorter than k have no such suffix/prefix and never connect;
    k == 0 compares two empty strings and always connects.
    """
    if len(s) < k or len(t) < k:
        return False
    # s[-k:] would return the whole string for k == 0, so slice from the left.
    return s[len(s) - k:] == t[:k]


# Each entry is indexed as (description, sequence).
entries = list(fasta.parse())
edges = ((a, b)
         for a in entries
         for b in entries
         if a != b and isConnected(a[1], b[1], 3))
for v1, v2 in edges:
    # %-formatting prints the same text on Python 2 and Python 3.
    print("%s %s" % (v1[0], v2[0]))
def build_dna_table():
    """Load the codon table from ``table.txt``.

    The file is read as whitespace-separated tokens that alternate between a
    key and its value.
    """
    with open("table.txt") as handle:
        tokens = handle.read().split()
    return dict(zip(tokens[0::2], tokens[1::2]))


def dna_to_proteins(dna, dna_table):
    """Yield every protein encoded in the first reading frame of ``dna``.

    A protein starts at an ATG codon and ends at the first TAA/TAG/TGA that
    follows it; a start codon with no later stop codon yields nothing.

    Raises:
        KeyError: if a codon inside a candidate protein is not in ``dna_table``.
    """
    codons = [dna[i:i + 3] for i in range(0, len(dna) - 2, 3)]
    for i, codon in enumerate(codons):
        if codon != "ATG":
            continue
        protein = ["M"]
        for codon2 in codons[i + 1:]:
            if codon2 in ("TAA", "TAG", "TGA"):
                yield ''.join(protein)
                break
            protein.append(dna_table[codon2])


# Base pairing used for the reverse complement; other characters pass
# through unchanged, as they did with str.translate.
_COMPLEMENT = {"A": "T", "T": "A", "C": "G", "G": "C"}

dna_table = build_dna_table()
dna = next(fasta.parse())[1]
complement = "".join(_COMPLEMENT.get(base, base) for base in reversed(dna))
# Three frames on the given strand and three on its reverse complement.
reading_frames = [dna[i:] for i in range(3)] + [complement[i:] for i in range(3)]
proteins = set()
for reading_frame in reading_frames:
    proteins.update(dna_to_proteins(reading_frame, dna_table))
print("\n".join(proteins))
# http://rosalind.info/problems/lcsm/
import fasta
import collections

strings = [s for desc, s in fasta.parse()]
subs = strings[0]
# Only the best candidate is kept; it stays "" when nothing is shared.
longest = ""
for i in range(len(subs)):
    # Per string, the position where the current prefix was last found. A
    # longer candidate cannot start before it, so the search resumes there.
    indexes = collections.defaultdict(int)
    for j in range(1, len(subs) - i + 1):
        substr = subs[i:i + j]
        common = True
        for s in strings:
            index = s.find(substr, indexes[s])
            if index == -1:
                common = False
                break
            indexes[s] = index
        if not common:
            # No extension of a non-shared substring can be shared.
            break
        if len(substr) > len(longest):
            longest = substr
print(longest)
# http://rosalind.info/problems/gc/
import fasta


def gcContent(s):
    """Return the fraction of characters in ``s`` that are 'G' or 'C'.

    An empty string has a GC content of 0.0.
    """
    if not s:
        return 0.0
    return float(s.count('G') + s.count('C')) / len(s)


entries = fasta.parse()
# Pairs of (description, GC fraction), evaluated lazily.
contents = ((d, gcContent(s)) for d, s in entries)
desc, maxGcContent = max(contents, key=lambda x: x[1])
print(desc)
print(maxGcContent * 100)
type=int, help='Maximum read length') parser.add_argument('--coverage', type=float, action='store', default=11, help='Desired coverage of given RefSeq') parser.add_argument('--error', type=float, action='store', default=0, help='Error rate, in %') parser.add_argument('--rev', type=float, action='store', default=0, help='Reverse strip rate, in %') args = parser.parse_args() if args.RefSeq == '-': f = sys.stdin else: f = open(args.RefSeq, 'rb') if args.output == '-': g = sys.stdout else: g = open(args.output, 'wb') seqs = [s for _, s in parse(f)] for i, fragment in enumerate(split_reads(seqs, args.min_size, args.max_size, args.coverage, args.error, args.rev)): write(fragment, 'fragment_%d' % i, g) if args.RefSeq != '-': f.close() if args.output != '-': g.close()
# http://rosalind.info/problems/lcsm/
import fasta
import collections

strings = [s for desc, s in fasta.parse()]
subs, longest = strings[0], ""
for i in range(len(subs)):
    for j in range(1, len(subs) - i + 1):
        substr = subs[i:i + j]
        if not all(substr in s for s in strings):
            # No extension of a non-shared substring can be shared.
            break
        # On equal length max() returns its first argument, so the most
        # recent candidate wins ties.
        longest = max(substr, longest, key=len)
print(longest)