# One-off decompression of the downloaded MAG archives (left disabled).
#os.system('gunzip dataflow/01-nucl/mags/*.gz')

# Rename the headers and concatenate all of the files.
files = df_mags['file_unzip'].tolist()
files_rename = []
for file in files:
    file_obj = sc.Fasta(file, 'dataflow/01-nucl/mags/')
    # Keep everything before the first '.fa' and add the renamed suffix.
    outname = f"{file.partition('.fa')[0]}_rename.fasta"
    files_rename.append(outname)
    file_obj.setOutputName(outname)
    file_obj.setOutputLocation('dataflow/01-nucl/mags/')
    # The header-renaming step itself is disabled, so this loop only
    # collects the expected "*_rename.fasta" names for the concat below.
    #file_obj.headerrename()

sg.concat(
    inputfolder='dataflow/01-nucl/mags/',
    outputpath='dataflow/01-nucl/stewart2019_mags.fasta',
    filenames=files_rename,
)

# Predict the ORFs (nucl and prot) for all the MAGs: one prodigal run per
# output, the first with type='nucl' and the second with prodigal's defaults.
for orf_name, orf_dir, prodigal_kwargs in (
    ('stewart2019_mags_genes.fasta', 'dataflow/01-nucl/', {'type': 'nucl'}),
    ('stewart2019_mags_prot.fasta', 'dataflow/01-prot/', {}),
):
    file_obj = sc.Fasta('stewart2019_mags.fasta', 'dataflow/01-nucl/')
    file_obj.setOutputName(orf_name)
    file_obj.setOutputLocation(orf_dir)
    file_obj.runprodigal(**prodigal_kwargs)
'4309680-submission.assembly_59', '3964017-submission.assembly_7', '3643350-assembly_6', '3394949-submission.assembly_17', 'RUG117_52' ] file_obj = sc.Fasta('rumen_genomes.fasta', 'dataflow/01-nucl/') file_obj.setOutputName('subclade_island.fasta') file_obj.setOutputLocation('dataflow/01-nucl/') file_obj.subsetfasta(seqlist=genes, headertag='none') # 4309680-submission.assembly_59 was then blasted against the # NCBI nucleotide collection (nr/nt) using web-based blastn and the full-length sequence for each of the top 50 hits # was downloaded and concatenated into island2_pathogens.fasta. With the contigs selected above into rumen_genomes_island2_pathogens.fasta. files = ['island2_pathogens.fasta', 'subclade_island.fasta'] sg.concat(inputfolder='dataflow/01-nucl/', outputpath='dataflow/01-nucl/rumen_genomes_island2_pathogens.fasta', filenames=files) # a blast database was then made with the contigs of interest, including 4309680-submission.assembly_59 file = "subclade_island.fasta" indir = 'dataflow/01-nucl/' blastdir = 'dataflow/02-blast/' blastdbdir = 'dataflow/02-blast-db/' file_obj = sc.Fasta(file, indir) file_obj.setOutputName(file) file_obj.setOutputLocation(blastdbdir) file_obj.runmakeblastdb(dbtype='nucl') file = "rumen_genomes_island2_pathogens.fasta"
from modules import seq_gen_lin as sg

# Genomes that were downloaded (Figure 1 and Figure 2) were combined into
# fig1_fig3_ncbi_nucl_hits.fasta. Note: the file name was chosen before
# Figure 3 was moved to Figure 2. These were then combined with the rumen
# genomes (pathogens_rumen.fasta) and made into a BLAST database.
file = "fig1_fig3_ncbi_nucl_hits.fasta"
file_obj = sc.Fasta(file, 'dataflow/01-nucl/')
file_obj.setOutputName(file)
# prodigal output keeps the input file name but is written to 01-prot/.
file_obj.setOutputLocation('dataflow/01-prot/')
file_obj.runprodigal()

# `file` still names the NCBI hits file at this point.
seqs_concatn = ['rumen_genomes.fasta', file]
sg.concat(
    inputfolder='dataflow/01-prot/',
    outputpath='dataflow/01-prot/pathogens_rumen.fasta',
    filenames=seqs_concatn,
)

# Build a protein BLAST database from the combined file.
file = "pathogens_rumen.fasta"
blastdbdir = 'dataflow/02-blast-db/'
file_obj = sc.Fasta(file, 'dataflow/01-prot/')
file_obj.setOutputName(file)
file_obj.setOutputLocation(blastdbdir)
file_obj.runmakeblastdb(dbtype='prot')

# The two versions of ANT6 (v1_v2_4309680.fasta) were then blasted against
# the pathogen and rumen genomes; the settings below feed the code that
# follows this excerpt.
indir = 'dataflow/01-prot/'
blastdir = 'dataflow/02-blast/'
file = "v1_v2_4309680.fasta"
file_obj = sc.Fasta(file, 'dataflow/01-nucl/') outputfilename = file.split(".f")[0] + '_extractedCONTIGs_all_rumen' + '.fasta' file_obj.setOutputName(outputfilename) file_obj.setOutputLocation('dataflow/01-nucl/') file_obj.extractORFs_gff3( gff3_table_loc='dataflow/00-meta/resistance_blast_hit_cotigs_all_rumen.csv' ) files = [ "resistance_island_blast_hits_concatenated_extractedCONTIGs_3rumen.fasta", "rumen_genomes_extractedCONTIGs_all_rumen.fasta" ] sg.concat( inputfolder='dataflow/01-nucl/', outputpath='dataflow/01-nucl/rumen_genomes_extractedCONTIGs_all.fasta', filenames=files) file = "rumen_genomes_extractedCONTIGs_all.fasta" indir = 'dataflow/01-nucl/' blastdir = 'dataflow/02-blast/' file_obj = sc.Fasta(file, indir) file_obj.setOutputLocation(blastdir) outputfilename = "resistance_island_mapping2.txt" blastdb = "rumen_genomes_resistance_genes.fasta" file_obj.setOutputName(outputfilename) file_obj.runblast(blast='blastn', db=blastdb,
from modules import seq_gen_lin as sg

# Concatenate all of the downloaded pathogen assemblies (one sub-folder per
# species under `head_dir`) and then make a BLAST database for each.
dirs = [
    'staphylococcus_aureus',
    'campylobacter_jejuni',
    'campylobacter_coli',
    'clostridioides_difficile',
    'acinetobacter_baumannii',
    'streptococcus_pneumoniae',
]
head_dir = 'dataflow/01-nucl/'

# The loop variable is deliberately not called `dir`, so the builtin stays
# usable for the rest of the script.
for species in dirs:
    path_dir = head_dir + species + '/'
    # Decompress the downloaded archives in place.
    unzip_command = 'gunzip ' + path_dir + '*.gz'
    os.system(unzip_command)
    # os.listdir() returns entries in arbitrary order; sort them so the
    # concatenated FASTA is identical from run to run.
    lis = sorted(f for f in os.listdir(path_dir) if f.endswith(".fna"))
    output_file = head_dir + species + '.fasta'
    sg.concat(inputfolder=path_dir, outputpath=output_file, filenames=lis)

# The databases are built for exactly the species concatenated above, so the
# list is derived from `dirs` rather than repeated by hand.
files = list(dirs)
for file in files:
    # NOTE(review): the concatenated files were written as "<name>.fasta",
    # but the bare name is passed here -- confirm sc.Fasta resolves it.
    file_obj = sc.Fasta(file, 'dataflow/01-nucl/')
    file_obj.setOutputName(file)
    file_obj.setOutputLocation('dataflow/02-blast-db/')
    file_obj.runmakeblastdb(dbtype='nucl')