def fasta_file(self, fasta_path, scaffold_number):
    """Extract one scaffold from a multi-FASTA file into its own folder.

    Creates the folder ``scaffold_<n>_info`` (relative to the current
    working directory) and writes the matching record to
    ``scaffold_<n>.fasta`` inside it. The scaffold number, as a string,
    is stored on ``self.scaffold_number`` and the folder name on
    ``self.dir``.

    Args:
        fasta_path (str): path of the FASTA file to read.
        scaffold_number (int | str): number of the scaffold to select.

    Raises:
        SystemExit: if the file has no record whose id is ``scaffold_<n>``.
    """
    # Normalise once: every concatenation below needs a string, and the
    # caller is documented as passing an int.
    self.scaffold_number = str(scaffold_number)
    scaffold_id = 'scaffold_' + self.scaffold_number

    fasta = ph.read_fasta(fasta_path)
    if scaffold_id not in fasta.id.values:
        print('Scaffold ' + self.scaffold_number + ' nao existente!')
        raise SystemExit

    self.dir = scaffold_id + '_info'
    # A second run for the same scaffold must not fail on the existing folder.
    os.makedirs(self.dir, exist_ok=True)

    fasta.drop(columns=['description', 'name'], inplace=True)
    to_save = fasta[fasta.id == scaffold_id].copy()
    to_save.id = '>' + to_save.id
    # A newline separator puts each remaining column on its own line,
    # i.e. the ">id" header followed by the sequence.
    to_save.to_csv('./' + self.dir + '/scaffold_' + self.scaffold_number + '.fasta',
                   sep='\n', index=False, header=False,
                   quoting=csv.QUOTE_NONE)
def test_to_nexus(self, path_to_dat):
    """Check that a frame read from FASTA can be written as a NEXUS file."""
    source = os.path.join(path_to_dat, 'PF08793_seed.fasta')
    frame = ph.read_fasta(source)

    # Export as NEXUS and confirm the file exists afterwards.
    target = os.path.join(path_to_dat, 'test.nexus')
    frame.phylo.to_nexus(alphabet='protein', filename=target)
    assert os.path.exists(target)
def test_to_fasta(self, path_to_dat):
    """Check that a frame read from FASTA can be written back out as FASTA."""
    frame = ph.read_fasta(os.path.join(path_to_dat, 'PF08793_seed.fasta'))
    print(frame.phylo)

    # Export as FASTA and confirm the file exists afterwards.
    destination = os.path.join(path_to_dat, 'test.fasta')
    frame.phylo.to_fasta(destination)
    assert os.path.exists(destination)
def test_to_embl(self, path_to_dat):
    """Check that a frame read from FASTA can be written as an EMBL file."""
    source = os.path.join(path_to_dat, 'PF08793_seed.fasta')
    frame = ph.read_fasta(source)

    # Export as EMBL and confirm the file exists afterwards.
    target = os.path.join(path_to_dat, 'test.embl')
    frame.phylo.to_embl(alphabet='protein', filename=target)
    assert os.path.exists(target)
def test_to_phylip(self, path_to_dat):
    """Check that a frame read from FASTA can be written as a PHYLIP file."""
    source = os.path.join(path_to_dat, 'PF08793_seed.fasta')
    frame = ph.read_fasta(source)

    # Export as PHYLIP and confirm the file exists afterwards.
    target = os.path.join(path_to_dat, 'test.phylip')
    frame.phylo.to_phylip(target)
    assert os.path.exists(target)
def test_read_fasta(path_to_dat):
    """Check that ``read_fasta`` yields a phylopandas frame with the core columns."""
    frame = ph.read_fasta(os.path.join(path_to_dat, 'PF08793_seed.fasta'))
    columns = frame.keys()

    # The result must be exactly the phylopandas frame type.
    assert type(frame) == ph.DataFrame
    for expected in ('id', 'sequence', 'description'):
        assert expected in columns
def test_to_fasta(path_to_dat):
    """Check that a single row of the frame can be written to a FASTA file."""
    frame = ph.read_fasta(os.path.join(path_to_dat, 'PF08793_seed.fasta'))
    destination = os.path.join(path_to_dat, 'test.fasta')

    # Only the first record is exported, through the row's own accessor.
    first_record = frame.iloc[0]
    first_record.phylo.to_fasta(destination)
    assert os.path.exists(destination)
def run(df,program="muscle",keep_tmp=False,**kwargs):
    """
    Align the sequences of a phylopandas data frame with an external program.

    df: data frame with "uid" and "sequence" columns.
    program: name of the alignment program ("muscle" or "msaprobs").
    keep_tmp: keep the temporary fasta files instead of deleting them.
    kwargs: forwarded unchanged to the program-specific alignment function.

    Returns a new data frame: df without its original "sequence" column,
    left-merged on "uid" with the aligned sequences.

    Raises ValueError for an unknown program and RuntimeError if any
    sequence is missing after the merge (temporary files are kept then).
    """

    dispatch = {"muscle":_align_muscle,
                "msaprobs":_align_msaprobs}

    # Resolve the alignment function, listing the valid names on failure
    if program not in dispatch:
        lines = ["Alignment program '{}' not recognized.\n\n".format(program),
                 "Should be one of:\n"]
        lines.extend(" {}\n".format(name) for name in sorted(dispatch))
        raise ValueError("".join(lines))
    aligner = dispatch[program]

    # Temporary file names share a random 10-letter prefix
    prefix = "".join(random.choice(string.ascii_letters) for _ in range(10))
    fasta_in = "{}_align_in.fasta".format(prefix)
    fasta_out = "{}_align_out.fasta".format(prefix)

    phy.seqio.write.to_fasta(df,id_col="uid",filename=fasta_in)
    aligner(fasta_in,fasta_out,**kwargs)

    # Read the aligned sequences back, keyed by uid
    aligned = phy.read_fasta(fasta_out)
    aligned = pd.DataFrame({"uid":aligned.id,
                            "sequence":aligned.sequence})

    # Swap the unaligned sequence column for the aligned one
    without_seq = df.copy().drop(labels=["sequence"],axis=1)
    merged = pd.merge(without_seq, aligned, on=['uid'], how='left')

    # A null sequence means a uid did not come back from the aligner
    if merged.sequence.isnull().any():
        raise RuntimeError(
            "an unknown error caused sequences to be lost during alignment.\n"
            "temporary files {} and {} saved.\n".format(fasta_in,fasta_out))

    if not keep_tmp:
        os.remove(fasta_out)
        os.remove(fasta_in)

    return merged
def upload():
    """Save the uploaded file(s) and load the last one as the working FASTA.

    Every request part named "file" is written into ``<APP_ROOT>/files/``.
    The path of the last saved file is published in the module-level
    ``fasta_path`` and its parsed content in the module-level ``fasta``;
    the algorithm selection page is then rendered.

    Raises:
        ValueError: if the request carried no usable file.
    """
    global fasta_path
    global fasta

    target = os.path.join(APP_ROOT, "files/")
    print(target)
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(target, exist_ok=True)

    destination = None
    for file in request.files.getlist("file"):
        # The client controls the filename: keep only its last path
        # component so a name like "../../app.py" cannot escape `target`.
        filename = os.path.basename(file.filename or "")
        if filename in ("", ".", ".."):
            continue
        destination = os.path.join(target, filename)
        file.save(destination)

    if destination is None:
        # Previously this surfaced as an unbound-name error further down.
        raise ValueError("no file was uploaded")

    fasta_path = destination
    fasta = pp.read_fasta(fasta_path)
    return render_template("algorithmSelectionPage.html")
def read_in_seqs(in_file):
    """Read the FASTA file at ``in_file`` and return the parsed sequences."""
    return ph.read_fasta(in_file)
def run():
    """Compute per-enzyme digest features for the uploaded FASTA sequences.

    Reads the module-level ``fasta_path``, loads the restriction enzyme
    table from ``<APP_ROOT>/files/RestrictionEnzymes.csv``, and for each
    sequence splits it on every retained recognition sequence to collect
    fragment counts, an RMS of fragment lengths and a running fragment
    mass. Renders ``complete.html`` with ``f1=listAvgMoMa``.

    NOTE(review): the loop nesting below was reconstructed from a
    whitespace-collapsed source -- confirm it against the original file.
    """
    fasta = pp.read_fasta(fasta_path)
    target = os.path.join(APP_ROOT, "files")
    listDescription = []
    enzy_filename = "RestrictionEnzymes.csv"
    destination_enz = "/".join([target, enzy_filename])
    enzymes = pd.read_csv(destination_enz)
    # Only the first 84 rows of the enzyme table are used.
    enzymes = enzymes[0:84]
    enzy_pattern = enzymes['Recognition Sequence']
    enzyme_patterns = []
    # Keep only recognition sequences containing none of the letters
    # S, Y, R, N, V, W, B, D, H, K, M (presumably IUPAC ambiguity codes --
    # TODO confirm), so every pattern can be matched as a literal string.
    for i in enzy_pattern:
        if 'S' not in i and 'Y' not in i and 'R' not in i and 'N' not in i and 'V' not in i and 'W' not in i and 'B' not in i and 'D' not in i and 'H' not in i and 'K' not in i and 'M' not in i:
            enzyme_patterns.append(i)
    names_of_enzymes = []
    df_enzymes = pd.DataFrame()
    # Collect the table rows of the retained patterns.
    # NOTE(review): DataFrame.append was removed in pandas 2.0; this loop
    # needs pd.concat on current pandas.
    for i in enzyme_patterns:
        df_enzymes = df_enzymes.append(
            enzymes.loc[enzymes['Recognition Sequence'] == i],
            ignore_index=True)
    names_of_enzymes = list(df_enzymes['Enzymes'])
    # Build "<enzyme>/<feature>" labels: all numCuts labels first, then
    # all RMS labels, then all avgMW labels.
    # NOTE(review): listDescription is not read again in this function.
    i = 0
    while i < 3:
        k = 0
        if i == 0:
            descriptor = "/numCuts"
        if i == 1:
            descriptor = "/RMS"
        if i == 2:
            descriptor = "/avgMW"
        while k < len(names_of_enzymes):
            enzyme = names_of_enzymes[k]
            stringDescription = enzyme + descriptor
            listDescription.append(stringDescription)
            k += 1
        i += 1
    # NOTE(review): j, c, destination_seq and testdf are not used below;
    # reading "pink.csv" still fails the request if that file is missing.
    j = 0
    seq_filename = fasta_path
    destination_seq = "/".join([target, seq_filename])
    c = 0
    testdf = pd.read_csv("pink.csv")
    for seq in fasta['sequence']:
        seq = seq.upper()
        # All accumulators are reset per sequence, so after the loop they
        # describe the last sequence only.
        num_of_cuts = []
        listOfRMS = []
        temp_cutLength_list = []
        listAvgMoMa = []
        tempFragmentMoMa_list = []
        tempFragmentMoMa = 0
        for j in enzyme_patterns:
            # len(split) is the number of fragments, i.e. matches + 1.
            num_of_cuts.append(len(seq.split(j)))
            split1 = seq.split(j)
            for k in range(len(split1)):
                temp_cutLength = len(split1[k])
                temp_cutLength_list.append(temp_cutLength)
                tempFragment = split1[k]
                # Add a fixed per-base value for A/C/G/T; other characters
                # contribute nothing.
                # NOTE(review): tempFragmentMoMa and temp_cutLength_list
                # are never reset between fragments or enzymes, so the
                # values keep accumulating -- confirm this is intended.
                for u in range(len(tempFragment)):
                    if tempFragment[u] == 'A':
                        tempFragmentMoMa = tempFragmentMoMa + 507
                    if tempFragment[u] == 'C':
                        tempFragmentMoMa = tempFragmentMoMa + 483
                    if tempFragment[u] == 'G':
                        tempFragmentMoMa = tempFragmentMoMa + 523
                    if tempFragment[u] == 'T':
                        tempFragmentMoMa = tempFragmentMoMa + 483
                tempFragmentMoMa_list.append(tempFragmentMoMa)
            # sqrt / mean / square come from outside this block
            # (presumably numpy -- verify against the file's imports).
            rms = (sqrt(mean(square(temp_cutLength_list))))
            listOfRMS.append(rms)
            mean_moma = mean(tempFragmentMoMa_list)
            # NOTE(review): mean_moma is computed but the list of cut
            # counts is appended instead -- looks unintended.
            listAvgMoMa.append(num_of_cuts)
        print(len(num_of_cuts), len(listOfRMS), len(listAvgMoMa))
        # One single-row frame per feature family, joined side by side.
        cuts = pd.DataFrame([num_of_cuts])
        RMSdf = pd.DataFrame([listOfRMS])
        MOMadf = pd.DataFrame([listAvgMoMa])
        inputfeature = pd.concat([cuts, RMSdf, MOMadf], axis=1)
        print(type(inputfeature))
        print(inputfeature.shape)
        # NOTE(review): prints the bound method, not the first rows
        # (no call parentheses).
        print(inputfeature.head)
    return render_template("complete.html", f1=listAvgMoMa)
def run(df,cutoff=0.9,keep_tmp=False):
    """
    Run cd-hit on a phylopandas data frame to remove similar sequences.
    Sequences are sorted from longest to shortest before running cd-hit,
    ensuring that the larger sequences always end up in the final output.

    df: phylopandas data frame to de-duplicate.
    cutoff: percent sequence identity cutoff (between 0 and 1.0) at which
            to place two sequences in the same cluster
    keep_tmp: keep temporary cd-hit files.

    Returns a copy of the rows of df whose uid made it through cd-hit.

    Raises ValueError if cutoff is not within [0, 1.0] (NaN included) and
    RuntimeError if cd-hit is not on the path or exits with a non-zero
    status. Unless keep_tmp is set, temporary files are removed on
    failure as well as on success.
    """

    # Sanity check, convert to string for later. The chained comparison
    # also rejects NaN, which the two one-sided tests let through.
    if not (0 <= cutoff <= 1.0):
        err = "cutoff must be a float between 0 and 1.0 (inclusive)\n"
        raise ValueError(err)
    cutoff = "{:.3f}".format(cutoff)

    # Work on a copy sorted so the longest sequence occurs first
    tmp_df = df.copy()
    tmp_df["seq_length"] = [len(s) for s in tmp_df.sequence]
    tmp_df = tmp_df.sort_values(by=["seq_length"],ascending=False)

    # make a 10-character random string for temporary files
    tmp_file_root = "".join([random.choice(string.ascii_letters)
                             for i in range(10)])
    input_file = "{}_cd-hit_in.fasta".format(tmp_file_root)
    out_root = "{}_cd-hit_out.fasta".format(tmp_file_root)
    tmp_files = [out_root, "{}.clstr".format(out_root), input_file]

    try:
        # Write data frame to fasta file
        phy.seqio.write.to_fasta(tmp_df,id_col="uid",filename=input_file)

        # Construct and run the cd-hit command
        cmd = ['cd-hit', "-i", input_file, "-o", out_root,"-c",cutoff]
        try:
            proc = subprocess.run(cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT)
        except FileNotFoundError as exc:
            err = "cd-hit does not appear to be in your path\n"
            raise RuntimeError(err) from exc

        # Make sure it returned successfully; surface what cd-hit printed
        # so the failure can be diagnosed.
        if proc.returncode != 0:
            err = "cd-hit failed\n"
            err += proc.stdout.decode(errors="replace")
            raise RuntimeError(err)

        # Parse cd-hit output
        new_seqs = phy.read_fasta(out_root)
    finally:
        # Remove temporary files; some may not exist if cd-hit never ran.
        if not keep_tmp:
            for path in tmp_files:
                try:
                    os.remove(path)
                except FileNotFoundError:
                    pass

    # Only grab sequences with uid that made it through cdhit
    return df[df.uid.isin(list(new_seqs.id))].copy()
def open_files(self, fasta_path, genome_path):
    """Load the genome FASTA and the tab-separated gene coordinate table.

    Stores the FASTA frame, minus its label/uid/description columns, in
    ``self.genome_fasta`` and the coordinate table, with columns 0-8
    renamed to ``*_gene`` names, in ``self.genes_coordinate``.
    """
    gene_columns = ('seqId_gene', 'source_gene', 'type_gene',
                    'start_gene', 'end_gene', 'score_gene',
                    'strand_gene', 'phase_gene', 'attributes_gene')

    self.genome_fasta = ph.read_fasta(fasta_path)
    self.genome_fasta.drop(columns=['label', 'uid', 'description'],
                           inplace=True)

    # The table has no header row, so columns arrive numbered 0..n-1.
    coordinates = pd.read_csv(genome_path, sep='\t', header=None,
                              comment='#')
    self.genes_coordinate = coordinates.rename(
        columns=dict(enumerate(gene_columns)))
import phylopandas as ph
import string
import pandas as pd

# Write up to MAX_RECORDS entries of the first FASTA file whose accession
# (second '|'-separated field of the id) does not occur among the ids of
# the second FASTA file.
MAX_RECORDS = 2000

df1 = ph.read_fasta('uniprot-membrane .fasta')
df2 = ph.read_fasta('Uniq_MP.fasta')

# Build the lookup once: set membership replaces a full scan of df2 for
# every df1 record.
known_ids = set(df2['id'])

written = 0
# The context manager guarantees the output is flushed and closed.
with open('01_2000pro.fasta', 'w') as f:
    for header, sequence in zip(df1['id'], df1['sequence']):
        accession = header.split('|')[1]
        if accession in known_ids:
            continue
        # Not present in df2: keep this record.
        f.write('>' + accession + '\n')
        f.write(sequence + '\n')
        written += 1
        if written >= MAX_RECORDS:
            break
def run(df,
        sequence_type="protein",
        min_conserved=None,
        min_flank=None,
        max_contig_noncon=8,
        min_block_length=10,
        allowed_gap="no",
        use_sim_matrix=True,
        min_initial_block=None,
        keep_tmp=False):
    """
    Run Gblocks on a phylopandas dataframe.

    Full docs for software are here:
    http://molevol.cmima.csic.es/castresana/Gblocks/Gblocks_documentation.html#Installation

    As of this writing (August 2019), Gblocks can be installed using
    bioconda (https://bioconda.github.io/).

    sequence_type: type of sequence
        default: protein
        allowed: ("protein","dna","codon")
        gblock flag: -t
    min_conserved: minimum number of sequences for conserved position
        default: num_seqs*0.5 + 1
        allowed: (num_seqs*0.5 + 1,num_seqs)
        gblock flag: -b1
    min_flank: minimum number of sequences for a flank position
        default: 0.85*num_seqs
        allowed: (min_conserved,num_seqs)
        gblock flag: -b2
    max_contig_noncon: maximum number of contiguous non-conserved positions
        default: 8
        allowed: any integer > 0
        gblock flag: -b3
    min_block_length: minimum length of a block
        default: 10
        allowed: any integer > 2
        gblock flag: -b4
    allowed_gap: allowed gap positions
        default: "no"
        allowed: ("no","half","all")
        gblock flag: -b5
    use_sim_matrix: use similarity matrix
        default: True
        allowed: True,False (True only allowed for proteins)
        gblock flag: -b6
    min_initial_block: minimum length of an initial block
        default: min_block_length
        allowed: any integer > 2
        gblock flag: -b0
    keep_tmp: keep temporary files and write out reports
        default: False
        allowed: False,True

    Returns df with its "sequence" column replaced (via a left merge on
    "uid") by the sequences Gblocks wrote out.

    Raises RuntimeError if Gblocks is not on the path, exits with an
    unexpected status, removes every column, or if sequences are lost.
    """

    tmp_file_root = "".join(
        [random.choice(string.ascii_letters) for i in range(5)])
    input_file = "{}.fasta".format(tmp_file_root)
    output_file = "{}.fasta-gb".format(tmp_file_root)
    output_summary = "{}.fasta-gb.htm".format(tmp_file_root)
    gblocks_stdout = "{}.stdout".format(tmp_file_root)
    gblocks_stderr = "{}.stderr".format(tmp_file_root)

    phy.seqio.write.to_fasta(df, id_col="uid", filename=input_file)

    cmd = ["Gblocks", input_file]
    num_seqs = len(df.uid)

    # Parse sequence type
    cmd.append(_qual_arg(sequence_type, "sequence_type", "t",
                         {"protein": "p", "dna": "d", "codon": "c"}))

    # Parse min conserved
    min_allowed = int(math.ceil(0.5 * num_seqs) + 1)
    max_allowed = num_seqs
    if min_conserved is None:
        min_conserved = min_allowed
    cmd.append(_quant_arg(min_conserved, "min_conserved", "b1",
                          min_allowed, max_allowed))

    # Parse min flank
    min_allowed = min_conserved
    max_allowed = sys.maxsize
    if min_flank is None:
        min_flank = int(math.ceil(0.85 * num_seqs))
    cmd.append(_quant_arg(min_flank, "min_flank", "b2",
                          min_allowed, max_allowed))

    # Parse max_contig_noncon
    cmd.append(_quant_arg(max_contig_noncon, "max_contig_noncon", "b3",
                          1, sys.maxsize))

    # Parse min_block_length
    cmd.append(_quant_arg(min_block_length, "min_block_length", "b4",
                          2, sys.maxsize))

    # Parse allowed_gap
    cmd.append(_qual_arg(allowed_gap, "allowed_gap", "b5",
                         {"no": "n", "half": "h", "all": "a"}))

    # parse use_sim_matrix (silently disabled for non-protein input)
    use_sim_matrix = bool(use_sim_matrix)
    if use_sim_matrix and sequence_type != "protein":
        use_sim_matrix = False
    cmd.append(_qual_arg(use_sim_matrix, "use_sim_matrix", "b6",
                         {True: "y", False: "n"}))

    # parse min_initial_block
    if min_initial_block is None:
        min_initial_block = min_block_length
    cmd.append(_quant_arg(min_initial_block, "min_initial_block", "b0",
                          2, sys.maxsize))

    if keep_tmp:
        cmd.append("-d=y")
    else:
        cmd.append("-d=n")

    # Run gblocks. The context manager closes both capture files even
    # when the executable is missing.
    with open(gblocks_stdout, "w") as stdout, \
            open(gblocks_stderr, "w") as stderr:
        try:
            result = subprocess.run(cmd, stdout=stdout, stderr=stderr)
        except FileNotFoundError as exc:
            err = "Gblocks does not appear to be in your path\n"
            raise RuntimeError(err) from exc

    # As far as I can tell, Gblocks *always* returns 1. At least on my
    # MacBook (Mojave 10.14.6, bioconda build.)
    if result.returncode != 1:
        err = "Gblocks failed for some reason\n"
        err += "\n\n------stdout-----\n\n"
        with open(gblocks_stdout) as handle:
            err += handle.read()
        err += "\n\n------stderr-----\n\n"
        with open(gblocks_stderr) as handle:
            err += handle.read()
        raise RuntimeError(err)

    # Parse output
    new_seqs = phy.read_fasta(output_file)
    new_seqs = pd.DataFrame({
        "uid": new_seqs.id,
        "sequence": new_seqs.sequence
    })

    # Positional access: the caller's frame may have any index (e.g. a
    # filtered frame that no longer has a row labelled 0).
    num_old_columns = len(df.iloc[0].sequence)
    num_new_columns = len(new_seqs.iloc[0].sequence)

    if num_new_columns == 0:
        err = "\n\nGblocks removed all columns from alignment.\n"
        err += "Trying building a better alignment, editing it manually,\n"
        err += "or altering one or more Gblocks settings to be less aggressive.\n"
        err += "See help string on this function for details.\n\n"
        raise RuntimeError(err)

    print("Gblocks reduced the number of columns from {} to {}.\n".format(
        num_old_columns, num_new_columns))

    # Drop the sequence information from the original frame
    to_merge = df.copy()
    to_merge = to_merge.drop(labels=["sequence"], axis=1)

    # Merge sequence information back in, this time aligned.
    output = pd.merge(to_merge, new_seqs, on=['uid'], how='left')

    # Make sure no na got introduced
    if np.sum(pd.isnull(output.sequence)) > 0:
        err = "an unknown error caused sequences to be lost during alignment.\n"
        err += "temporary files {} and {} saved.\n".format(
            input_file, output_file)
        raise RuntimeError(err)

    # Remove temporary files
    if not keep_tmp:
        os.remove(input_file)
        os.remove(output_file)
        os.remove(output_summary)
        os.remove(gblocks_stdout)
        os.remove(gblocks_stderr)

    return output
import phylopandas as ph
import string
from Bio import SeqIO
import pandas as pd
import numpy as np

# Write a random subset of the input records to a new FASTA file.
df1 = ph.read_fasta('1568122732.fas.db2novel')
print(df1.shape[0])

# Randomly pick 271 distinct row labels (sampling without replacement);
# numpy raises ValueError if the file holds fewer records than that.
sample_indices = np.random.choice(range(len(df1)), 271, replace=False)

# The context manager guarantees the output is flushed and closed.
with open('Negative_Samples.fasta', 'w') as f:
    for idx in sample_indices:
        record = df1.loc[idx]
        f.write('>' + record['id'] + '\n')
        f.write(record['sequence'] + '\n')