def fasta(self):
    """The FASTA file holding the filtered genes of this cluster.

    Each record is written under the gene's `name` attribute, so the names
    correspond to long descriptive names. The file is only (re)generated
    when the FASTA object at `self.p.fasta` evaluates as falsy; otherwise
    it is returned untouched.
    """
    genes_fasta = FASTA(self.p.fasta)
    # Nothing to do when the file object is already truthy #
    if genes_fasta:
        return genes_fasta
    # Otherwise write every filtered gene under its descriptive name #
    genes_fasta.create()
    for gene in self.filtered_genes:
        genes_fasta.add_str(str(gene), name=gene.name)
    genes_fasta.close()
    return genes_fasta
def test(self):
    """Search one sequence, and see if it works.

    A single hard-coded nucleotide sequence is written to `input.fasta`
    inside a fresh temporary directory, then searched against
    `self.blast_db` with `SeqSearch`. The results go to `output.blast` in
    the same directory, and the directory is printed once the run returns.
    """
    # New directory #
    directory = new_temp_dir()
    # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
    seq = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
             CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
             CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
             AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
             CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
             TTTAATTACAGACCTGAA"""
    # The literal is only laid out for readability: drop the layout whitespace #
    seq = seq.replace('\n', '')
    seq = seq.replace(' ', '')
    # Make input #
    input_fasta = FASTA(directory + 'input.fasta')
    input_fasta.create()
    input_fasta.add_str(seq, "My test sequence")
    input_fasta.close()
    # Make output #
    out_path = directory + 'output.blast'
    # Make extras parameters #
    params = {'-outfmt': 0,
              '-evalue': 1e-5,
              '-perc_identity': 99}
    # Make the search #
    search = SeqSearch(input_fasta,
                       self.blast_db,
                       'nucl',
                       'blast',
                       num_threads=1,
                       out_path=out_path,
                       params=params)
    # Run it #
    search.run()
    # Print result #
    # A single pre-formatted argument prints the same text on Python 2 and
    # Python 3; the old `print "Success", directory` statement is a syntax
    # error on Python 3.
    print("Success %s" % (directory,))
def test(self):
    """Smoke test: run one search with a single query sequence.

    The query is written to `input.fasta` in a new temporary directory,
    searched against `self.blast_db`, and the output is directed to
    `output.blast` in that same directory. The directory is printed after
    the search has run.
    """
    # Where everything for this run is stored #
    tmp_dir = new_temp_dir()
    blast_out = tmp_dir + 'output.blast'
    # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
    query = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
               CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
               CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
               AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
               CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
               TTTAATTACAGACCTGAA"""
    # Strip the line breaks and padding used to lay out the literal #
    query = query.replace('\n', '').replace(' ', '')
    # Write the query to its own FASTA file #
    query_fasta = FASTA(tmp_dir + 'input.fasta')
    query_fasta.create()
    query_fasta.add_str(query, "My test sequence")
    query_fasta.close()
    # Extra command line parameters for the search #
    extra_params = {
        '-outfmt': 0,
        '-evalue': 1e-5,
        '-perc_identity': 99,
    }
    # Build the search object and launch it #
    searcher = SeqSearch(
        query_fasta, self.blast_db, 'nucl', 'blast',
        num_threads=1,
        out_path=blast_out,
        params=extra_params,
    )
    searcher.run()
    # Report where the files ended up #
    print("Success", tmp_dir)
faas_genes = [strip(seq) for seq in faa] fnas_genes = [strip(seq) for seq in fna] print faa, len(set(fnas_genes) ^ set(faas_genes)), "discrepancies" #print "- in fna but not in faa:", [x for x in set(fnas_genes) - set(faas_genes)] #print "- in faa but not in fna:", [x for x in set(faas_genes) - set(fnas_genes)] #print "" fnas_genes = [strip(seq) for fna in fnas for seq in fna] print len(fnas_genes), len(set(fnas_genes)) for genome in faas: out_path = genomes_dir + genome.short_prefix + '.fasta' out_fasta = FASTA(out_path) out_fasta.create() for seq in genome: out_fasta.add_str(str(seq.seq), strip(seq)) out_fasta.close() out_fasta.gzip_to() out_fasta.remove() def lines(): for genome in faas: for gene in genome: name = strip(gene) yield name + '\t' + gene.description[len(name):].rstrip( ' |') + '\n' annotations_path = current_dir + '../ld12/data/annotations.tsv' with open(annotations_path, 'w') as handle:
seq = seq.split('[')[0] return seq for faa,fna in zip(faas, fnas): faas_genes = [strip(seq) for seq in faa] fnas_genes = [strip(seq) for seq in fna] print faa, len(set(fnas_genes) ^ set(faas_genes)), "discrepancies" #print "- in fna but not in faa:", [x for x in set(fnas_genes) - set(faas_genes)] #print "- in faa but not in fna:", [x for x in set(faas_genes) - set(fnas_genes)] #print "" fnas_genes = [strip(seq) for fna in fnas for seq in fna] print len(fnas_genes), len(set(fnas_genes)) for genome in faas: out_path = genomes_dir + genome.short_prefix + '.fasta' out_fasta = FASTA(out_path) out_fasta.create() for seq in genome: out_fasta.add_str(str(seq.seq), strip(seq)) out_fasta.close() out_fasta.gzip_to() out_fasta.remove() def lines(): for genome in faas: for gene in genome: name = strip(gene) yield name + '\t' + gene.description[len(name):].rstrip(' |') + '\n' annotations_path = current_dir + '../ld12/data/annotations.tsv' with open(annotations_path, 'w') as handle: handle.writelines(lines())
class Foraminifera(Database):
    """This is a custom database containing exclusively Foraminifera sequences.

    https://genev.unige.ch/research/laboratory/Jan-Pawlowski

    You should place the file "foram_db_cor.fasta" in:  ~/databases/foraminifera/
    Then you can run this:

        from seqsearch.databases.foraminifera import foraminifera
        foraminifera.process()
        print(foraminifera.tax_depth_freq)
    """

    short_name = "foraminifera"
    long_name = 'The custom made Foraminifera database as received by email on 7th April 2017'

    all_paths = """
    /foram_db_cor.fasta
    /foram_mothur.fasta
    /foram_mothur.tax
    """

    @property
    def rank_names(self):
        """The names of the ranks. Total 9 ranks."""
        return ['Domain',   # 0
                'Kingdom',  # 1
                'Phylum',   # 2
                'Class',    # 3
                'Order',    # 4
                'Family',   # 5
                'Tribe',    # 6
                'Genus',    # 7
                'Species']  # 8

    def __init__(self, base_dir=None):
        """Set up the paths of the database.

        `base_dir` is the directory under which `databases/<short_name>/`
        is located; when it is None the module-level `home` is used.
        """
        # Base directory #
        if base_dir is None: base_dir = home
        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # The results #
        self.alignment = FASTA(self.p.mothur_fasta)
        self.taxonomy = FilePath(self.p.mothur_tax)
        # The part that mothur will use for naming files #
        self.nickname = "foram_mothur"

    def process(self):
        """Convert the raw FASTA file into the mothur alignment and taxonomy files.

        For every record of the raw file, the sequence is written to
        `self.alignment` under the name "foram<num>" and one line made of
        that same name, a tab, and the nine ';'-joined ranks is added to
        `self.taxonomy`. The first three ranks are constant; the last six
        are read from the '|'-separated fields of the record id.

        Raises an AssertionError when one of the fields contains a ';' or a
        tab (unless asserts are disabled), and an IndexError when a record
        has fewer than six fields after the number. In both cases the two
        output files are still closed.
        """
        # The file that was received by email without documentation T_T #
        raw = FASTA(self.p.cor)
        # Open files #
        self.alignment.create()
        self.taxonomy.create()
        # Guarantee both outputs are closed even when a record is malformed #
        try:
            for seq in raw:
                # Parse: the first 11 characters of the id are skipped, #
                # presumably a fixed prefix -- TODO confirm against the file #
                name = seq.id[11:].split('|')
                num = name.pop(0)
                # Check that no field would corrupt the taxonomy line format #
                for x in name:
                    assert ';' not in x, "Field contains ';': %r" % (x,)
                    assert '\t' not in x, "Field contains a tab: %r" % (x,)
                # Make ranks #
                ranks = ['Eukaryota',     # 0 Domain
                         'Rhizaria',      # 1 Kingdom
                         'Foraminifera',  # 2 Phylum
                         name[0],         # 3 Class
                         name[1],         # 4 Order
                         name[2],         # 5 Family
                         name[3],         # 6 Tribe
                         name[4],         # 7 Genus
                         name[5]]         # 8 Species
                # The taxonomy string #
                tax_line = ';'.join(ranks)
                # Add sequence to the new fasta file #
                self.alignment.add_str(str(seq.seq), name="foram" + num)
                # Add the taxonomy to the tax file #
                self.taxonomy.add_str("foram" + num + '\t' + tax_line + '\n')
        finally:
            # Close files #
            self.alignment.close()
            self.taxonomy.close()