def detect_prophage(prefix_name, faa_file, base_dir='.', timing_log=None, threads=0):
    """
    Search a protein (.faa) file for prophage elements.

    :param prefix_name: sample prefix used to name outputs
    :param faa_file: protein FASTA, optionally gzipped
    :param base_dir: working directory
    :param timing_log: log file (unused here, kept for interface parity)
    :param threads: number of CPU cores; 0 means use the default
    :return: path to the prophage TSV report
    """
    # TODO: include overwrite mode
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    out_dir = os.path.join(base_dir, 'element_finder_' + prefix_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    prophage_out = os.path.join(out_dir, prefix_name + '_prophage.tsv')
    # Reuse a previous result if present
    if os.path.isfile(prophage_out):
        return prophage_out

    # cmd = 'abricate --quiet --threads {threads} --nopath --db plasmidfinder {infile} > {outfile}'.format(threads=threads,infile=read_data['assembly'],outfile=oriREP_out)
    # cmd = "bash -c '{}'".format(cmd)
    # if run_command(cmd) != 0:
    #     return None

    # The search tool needs a plain-text FASTA: decompress gzipped input
    # into a temporary file inside the output folder.
    temp_faa = os.path.join(out_dir, prefix_name + '.faa')
    query_faa = faa_file
    if faa_file.endswith('.gz'):
        query_faa = temp_faa
        run_command('gunzip -c {} > {}'.format(faa_file, query_faa))

    element_finder.search_prophage(sample=query_faa, output=prophage_out, threads=threads)

    # Drop the temporary decompressed copy, if one was created
    if os.path.exists(temp_faa):
        os.remove(temp_faa)
    return prophage_out
def detect_integron(prefix_name, assembly, base_dir='.', timing_log=None, threads=0):
    """
    Search an assembly for integrons using the INTEGRALL database.

    :param prefix_name: sample prefix used to name outputs
    :param assembly: assembly FASTA, optionally gzipped
    :param base_dir: working directory
    :param timing_log: log file (unused here, kept for interface parity)
    :param threads: number of CPU cores; 0 means use the default
    :return: path to the integron TSV report
    """
    # TODO: include overwrite mode
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    #path_out = os.path.join(base_dir, 'integron_finder_' + read_data['sample_id'])
    out_dir = os.path.join(base_dir, prefix_name + '_integrall')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    integron_out = os.path.join(out_dir, prefix_name + '_integron.tsv')
    # Reuse a previous result if present
    if os.path.isfile(integron_out):
        return integron_out

    # FIXME: this step may not be necessary
    temp_fna = os.path.join(out_dir, prefix_name + '.fasta')
    query_fna = assembly
    if assembly.endswith('.gz'):
        query_fna = temp_fna
        run_command('gunzip -c {} > {}'.format(assembly, query_fna))

    # cmd = 'integron_finder {sequence} --func-annot --local-max --mute --outdir {outdir}'.format(sequence=read_data['assembly'],outdir=path_out)
    # cmd = "bash -c '{}'".format(cmd)
    # if run_command(cmd,timing_log) != 0:
    #     return None
    element_finder.search_integrall(sample=query_fna, output=integron_out, threads=threads)

    # Drop the temporary decompressed copy, if one was created
    if os.path.exists(temp_fna):
        os.remove(temp_fna)
    return integron_out
def run_roary(gff_folder, overwrite=False, threads=0, base_dir='.', timing_log=None):
    """
    Run roary to make a pangenome analysis (using prokka results from a previous step).

    :param gff_folder: folder of per-sample GFF files (plain or .gff.gz)
    :param overwrite: rerun even if a previous result exists
    :param threads: number of CPU cores; 0 means use the default
    :param base_dir: working directory
    :param timing_log: file to log timing
    :return: path to the roary output folder
    :raises Exception: when gunzip, roary or gzip fails
    """
    # BUG FIX: threads=0 (the default) was previously passed straight to
    # 'roary -p 0'; normalise it like every other step in this pipeline.
    if threads == 0:
        threads = NUM_CORES_DEFAULT
    roary_folder = os.path.join(base_dir, 'pangenome/roary')
    temp_folder = os.path.join(base_dir, 'pangenome/temp_roary')
    roary_output = os.path.join(roary_folder, 'summary_statistics.txt')
    if os.path.isfile(roary_output) and (not overwrite):
        logger.info('roary has run and the input has not changed, skip roarying')
        return roary_folder
    if not os.path.isdir(temp_folder):
        os.makedirs(temp_folder)

    # Collect the GFF inputs, decompressing gzipped ones into temp_folder
    gff_list = []
    for filename in os.listdir(gff_folder):
        if filename.endswith('.gz'):
            sample_id = filename.replace('.gff.gz', '')
            gff_file = os.path.join(temp_folder, sample_id + '.gff')
            if run_command('gunzip -c {} > {}'.format(
                    os.path.join(gff_folder, filename), gff_file)) != 0:
                raise Exception('Cannot get {}'.format(
                    os.path.join(gff_folder, filename)))
            gff_list.append(gff_file)
        else:
            gff_list.append(os.path.join(gff_folder, filename))

    # Make sure the directory is not there or roary will add timestamp
    if os.path.isfile(roary_folder):
        os.remove(roary_folder)
    if os.path.exists(roary_folder):
        shutil.rmtree(roary_folder)

    cmd = 'roary -p {} -f {} -v '.format(threads, roary_folder) + ' '.join(gff_list)
    ret = run_command(cmd, timing_log)
    if ret != 0:
        raise Exception('roary fail to run!')

    # Compress the large presence/absence matrix in place
    cmd = 'gzip ' + os.path.join(roary_folder, 'gene_presence_absence.csv')
    ret = run_command(cmd)
    if ret != 0:
        raise Exception('Error running {}'.format(cmd))
    shutil.rmtree(temp_folder)
    return roary_folder
def qc_reads(prefix_name, reads, base_dir='.', threads=0, timing_log=None, **kargs):
    """
    Run the QC process for read input using fastqc and multiqc.

    :param prefix_name: sample prefix used to name outputs
    :param reads: dict with 'pe1'/'pe2', 'se' or 'long-read' entries
    :param base_dir: working directory
    :param threads: number of CPU cores; 0 means use the default
    :param timing_log: file to log timing
    :return: path to the multiqc fastqc summary file, or None on failure
    """
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    out_fastqc = os.path.join(base_dir, prefix_name + '_fastqc')
    if not os.path.exists(out_fastqc):
        os.makedirs(out_fastqc)

    # Build the fastqc invocation that matches the kind of input reads;
    # if no recognised key is present, fastqc is simply skipped.
    fastqc_cmd = None
    if 'pe1' in reads and 'pe2' in reads:
        fastqc_cmd = 'fastqc -t {threads} -o {outdir} {pe1} {pe2}'.format(
            threads=threads, outdir=out_fastqc,
            pe1=reads['pe1'], pe2=reads['pe2'])
    elif 'se' in reads:
        fastqc_cmd = 'fastqc -t {threads} -o {outdir} {reads}'.format(
            threads=threads, outdir=out_fastqc, reads=reads['se'])
    elif 'long-read' in reads:
        fastqc_cmd = 'fastqc -t {threads} -o {outdir} {reads}'.format(
            threads=threads, outdir=out_fastqc, reads=reads['long-read'])
    if fastqc_cmd is not None and run_command(fastqc_cmd, timing_log) != 0:
        return None

    out_multiqc = os.path.join(base_dir, prefix_name + '_multiqc')
    if not os.path.exists(out_multiqc):
        os.makedirs(out_multiqc)

    # Aggregate the fastqc reports with multiqc
    multiqc_cmd = 'multiqc -o {outdir} {indir}'.format(
        outdir=out_multiqc, indir=out_fastqc)
    if run_command(multiqc_cmd, timing_log) != 0:
        return None
    #read_data['fastqc']=out_fastqc
    return os.path.join(out_multiqc, 'multiqc_data', 'multiqc_fastqc.txt')
def detect_virulome(prefix_name, assembly, base_dir='.', threads=0, timing_log=None):
    """
    Run in-house script to identify virulent genes using VFDB

    Parameters
    ----------
    prefix_name:
        name to attach to output
    assembly: str
        input sequence
    threads: int
        number of threads to use
    timing_log: str
        log file

    Returns
    -------
        path to virulent gene file
    """
    #TODO: to include overwrite
    if threads == 0:
        threads = NUM_CORES_DEFAULT
    path_out = os.path.join(base_dir, prefix_name + '_element_finder')
    if not os.path.exists(path_out):
        os.makedirs(path_out)
    vir_out = os.path.join(path_out, prefix_name + '_virulome.tsv')
    # Reuse a previous result if present
    if os.path.isfile(vir_out):
        return vir_out
    # cmd = 'abricate --quiet --threads {threads} --nopath --db vfdb {infile} > {outfile}'.format(threads=threads,infile=read_data['assembly'],outfile=vir_out)
    # cmd = "bash -c '{}'".format(cmd)
    # if run_command(cmd) != 0:
    #     return None
    gunzip_fna = assembly
    if assembly.endswith('.gz'):  # FIXME: review this step
        gunzip_fna = os.path.join(path_out, prefix_name + '.fasta')
        cmd = 'gunzip -c {} > {}'.format(assembly, gunzip_fna)
        run_command(cmd)
    element_finder.search_virulome(sample=gunzip_fna, output=vir_out, threads=threads)
    # BUG FIX: the condition was inverted ("if not os.path.exists(...)"),
    # which attempted to remove a nonexistent file (FileNotFoundError) and
    # left the temporary decompressed FASTA behind when it did exist.
    if os.path.exists(os.path.join(path_out, prefix_name + '.fasta')):
        os.remove(os.path.join(path_out, prefix_name + '.fasta'))
    return vir_out
def detect_insertion_sequence(prefix_name, assembly, base_dir='.', threads=0):
    """
    Run isescan to search for insertion sequences (IS) in an assembly.

    :param prefix_name: sample prefix used to name outputs
    :param assembly: assembly FASTA, optionally gzipped
    :param base_dir: working directory
    :param threads: number of CPU cores; 0 means use the default
    :return: path to the isescan .raw output, or None on failure
    """
    # TODO: include overwrite mode
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    out_dir = os.path.join(base_dir, prefix_name + "_isescan")
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    isescan_out = os.path.join(out_dir, prefix_name + '_is.tsv')
    # Reuse a previous result if present
    if os.path.isfile(isescan_out):
        return isescan_out

    # FIXME: this step is not needed
    temp_fna = os.path.join(out_dir, prefix_name + '.fasta')
    query_fna = assembly
    if assembly.endswith('.gz'):
        query_fna = temp_fna
        run_command('gunzip -c {} > {}'.format(assembly, query_fna))

    isescan_cmd = 'isescan.py --nthread {threads} --seqfile {asm} --output {output} '.format(
        threads=threads, asm=query_fna, output=out_dir)
    if run_command(isescan_cmd) != 0:
        return None

    #if os.path.exists(path_out+'/prediction'):
    #    shutil.rmtree(path_out+'/prediction')
    #shutil.copytree('prediction', path_out+'/prediction')
    #read_data['is'] = path_out+'/prediction'

    # isescan writes its report somewhere under the output folder;
    # locate the .raw file it produced.
    isout = None
    for root, dirs, files in os.walk(out_dir, topdown=False):
        for name in files:
            if name.endswith('.raw'):
                isout = os.path.join(root, name)

    #if os.path.exists('prediction'):
    #    shutil.rmtree('prediction')
    if os.path.exists(temp_fna):
        os.remove(temp_fna)
    return isout
def assembly_eval(prefix_name, assembly, base_dir='.', threads=0, timing_log=None, **kargs):
    """
    Evaluate an assembly with quast.

    :param prefix_name: sample prefix used to name outputs
    :param assembly: path to the assembly FASTA
    :param base_dir: working directory
    :param threads: number of CPU cores; 0 means use the default
    :param timing_log: file to log timing
    :return: path to the quast report file, or None on failure
    """
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    quast_dir = os.path.join(base_dir, prefix_name + '_quast')
    if not os.path.exists(quast_dir):
        os.makedirs(quast_dir)

    quast_cmd = 'quast.py -t {threads} -o {outdir} {input}'.format(
        threads=threads, outdir=quast_dir, input=assembly)
    if run_command(quast_cmd, timing_log) != 0:
        return None
    return os.path.join(quast_dir, 'report.tsv')
def species_identification_kraken(prefix_name, assembly, db='db/kraken2/k2std', base_dir='.', timing_log=None, threads=0):
    """
    Run kraken2 to identify the species of an assembly.

    :param prefix_name: sample prefix used to name outputs
    :param assembly: path to the assembly FASTA
    :param db: path to the kraken2 database
    :param base_dir: working directory
    :param timing_log: file to log timing
    :param threads: number of CPU cores; 0 means use the default
    :return: path to the kraken2 report, or None on failure
    """
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    out_dir = os.path.join(base_dir, prefix_name + '_kraken2')
    report_file = os.path.join(out_dir, prefix_name + '_kraken2.tsv')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    kraken_cmd = 'kraken2 --db {db} --use-names --threads {threads} --report {report} {asm}'.format(
        db=db, threads=threads, report=report_file, asm=assembly)
    kraken_cmd = "bash -c '{}'".format(kraken_cmd)
    if run_command(kraken_cmd, timing_log) != 0:
        return None
    return report_file
def run_protein_alignment(roary_folder, collection_dir, threads=8, overwrite=False, timing_log=None):
    """
    Align protein sequences of each gene cluster with mafft, run in parallel.

    Parameters
    ----------
    roary_folder: str
        folder containing roary output (gene_presence_absence.Rtab)
    collection_dir: str
        working directory of the collection
    threads: int
        number of parallel mafft jobs
    overwrite: bool
        whether to overwrite existing result even if input did not change
    timing_log: str
        file to log timing

    Returns
    -------
    str
        path to the alignments directory
    """
    alignment_dir = os.path.join(collection_dir, 'alignments')
    # BUG FIX: the directory was assumed to exist; open(cmds_file) below
    # would fail on a fresh collection otherwise.
    if not os.path.exists(alignment_dir):
        os.makedirs(alignment_dir)
    gene_cluster_file = roary_folder + '/gene_presence_absence.Rtab'
    gene_df = pd.read_csv(gene_cluster_file, sep='\t', index_col='Gene')
    gene_df.fillna('', inplace=True)

    # Write one mafft command per gene cluster, then fan them out with parallel
    cmds_file = os.path.join(alignment_dir, "align_cmds")
    with open(cmds_file, 'w') as cmds:
        for gene_id, row in gene_df.iterrows():
            # Only align if there are at least 2 sequences
            if row.sum() < 2:
                continue
            gene_id = re.sub(r'\W+', '', gene_id)
            gene_dir = os.path.join(alignment_dir, gene_id)
            # check if done before
            gene_aln_file = os.path.join(gene_dir, gene_id + '.faa.aln.gz')
            if (not overwrite) and os.path.isfile(gene_aln_file):
                continue
            gene_seq_file = os.path.join(gene_dir, gene_id + '.faa')
            if not os.path.isfile(gene_seq_file):
                # BUG FIX: previously logged gene_aln_file here, which is
                # always missing at this point and hid the real culprit.
                logger.info('{} does not exist'.format(gene_seq_file))
                continue
            cmd = f"mafft --auto --quiet --thread 1 {gene_seq_file} | gzip > {gene_aln_file}"
            cmds.write(cmd + '\n')

    cmd = f"parallel --bar -j {threads} -a {cmds_file}"
    ret = run_command(cmd, timing_log)
    # BUG FIX: the return code was previously ignored
    if ret != 0:
        raise Exception('Error running {}'.format(cmd))
    #report['alignments'] = alignment_dir
    return alignment_dir
def run_species_phylogeny_iqtree(roary_folder, collection_dir, threads=8, overwrite=False, timing_log=None):
    """
    Run iqtree to create phylogeny tree from core gene alignment. If the list
    of samples has not changed, and none of the samples has changed, the
    existing tree will be kept unless overwrite is set to True.

    Parameters
    ----------
    roary_folder: str
        folder containing roary output (core_gene_alignment.aln.gz)
    collection_dir: str
        working directory of the collection
    threads: int
        number of threads to use
    overwrite: bool
        whether to overwrite existing result even if input did not change
    timing_log: str
        file to log timing

    Returns
    -------
    str
        path to the phylogeny folder
    """
    phylogeny_folder = os.path.join(collection_dir, 'phylogeny')
    if not os.path.exists(phylogeny_folder):
        os.makedirs(phylogeny_folder)
    #report['phylogeny'] = phylogeny_folder
    phylogeny_file = os.path.join(phylogeny_folder, 'core_gene_alignment.treefile')
    if os.path.isfile(phylogeny_file) and (not overwrite):
        logger.info(
            'phylogeny tree exists and input has not changed, skip phylogeny analysis'
        )
        return phylogeny_folder

    aln_file = os.path.join(phylogeny_folder, 'core_gene_alignment.aln.gz')
    if not os.path.isfile(aln_file):
        # BUG FIX: previously read report['roary'], but 'report' is not a
        # parameter of this function (NameError); use roary_folder instead.
        aln_file = os.path.join(roary_folder, 'core_gene_alignment.aln.gz')
    cmd = 'iqtree -s {alignment} --prefix {prefix} -B 1000 -T {threads} -czb -keep-ident'.format(
        alignment=aln_file,
        prefix=phylogeny_folder + '/core_gene_alignment',
        threads=threads)
    ret = run_command(cmd, timing_log)
    if ret != 0:
        raise Exception(
            'iqtree fail to create phylogeny tree from core gene alignment!')
    return phylogeny_folder
def detect_amr_abricate(prefix_name, assembly, base_dir='.', threads=8, overwrite=False, timing_log=None):
    """
    Run abricate against several AMR databases and merge the hits.

    Parameters
    ----------
    prefix_name:
        sample prefix used to name outputs
    assembly: str
        path to the assembly FASTA
    base_dir: str
        working directory
    threads: int
        number of threads to use
    overwrite: bool
        whether to overwrite the existing result
    timing_log: str
        log file

    Returns
    -------
        path to the combined resistome file
    """
    path_out = os.path.join(base_dir, prefix_name + '_abricate')
    if not os.path.exists(path_out):
        os.makedirs(path_out)
    # TODO: replace by consensus db later
    amr_out = os.path.join(path_out, prefix_name + '_resistome.tsv')
    if os.path.isfile(amr_out) and (not overwrite):
        logger.info('Resistome for {} exists, skip analysis'.format(prefix_name))
        return amr_out

    # Query every database; keep the outputs that succeeded
    databases = ['ncbi', 'megares', 'ecoh', 'argannot', 'card', 'resfinder']
    successful_outputs = []
    for db_name in databases:
        db_outfile = os.path.join(path_out, prefix_name + '_' + db_name + '.tsv')
        abricate_cmd = 'abricate --quiet --threads {threads} --nopath --db {db} {infile} > {outfile}'.format(
            threads=threads, db=db_name, infile=assembly, outfile=db_outfile)
        if run_command(abricate_cmd, timing_log) == 0:
            successful_outputs.append(db_outfile)

    # Fail only when every single database query failed
    if not successful_outputs:
        raise Exception('Error running amr')

    combined_tsv = pd.concat(pd.read_csv(f, sep='\t') for f in successful_outputs)
    combined_tsv.sort_values(['SEQUENCE', 'START'], ascending=[True, True], inplace=True)
    combined_tsv.to_csv(amr_out, index=False, sep='\t', encoding='utf-8-sig')
    #sample['updated'] = True
    return amr_out
def detect_mlst(prefix_name, assembly, base_dir='.', timing_log=None, threads=0):
    """
    Determine the MLST profile of an assembly.

    :param prefix_name: sample prefix used to name outputs
    :param assembly: assembly FASTA, optionally gzipped
    :param base_dir: working directory
    :param timing_log: log file (unused here, kept for interface parity)
    :param threads: number of CPU cores; 0 means use the default
    :return: path to the MLST TSV report
    """
    #TODO: include overwrite
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    out_dir = os.path.join(base_dir, prefix_name + '_mlst')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    mlst_out = os.path.join(out_dir, prefix_name + '_mlst.tsv')
    # Reuse a previous result if present
    if os.path.isfile(mlst_out):
        return mlst_out

    # FIXME: check if this is needed
    temp_fasta = os.path.join(out_dir, prefix_name + '.fasta')
    query_fasta = assembly
    if assembly.endswith('.gz'):
        query_fasta = temp_fasta
        run_command('gunzip -c {} > {}'.format(assembly, query_fasta))

    # One tab-separated line: file, scheme, sequence type, then the alleles
    result = mlst.find_mlst(query_fasta)
    with open(mlst_out, 'w') as handle:
        handle.write("%s\t%s\t%s" % (result['file'], result['scheme'], result['st']))
        for allele in result['profile']:
            handle.write("\t%s" % allele)
        handle.write("\n")

    # cmd = 'mlst --quiet --threads {threads} --nopath {infile} > {outfile}'.format(threads=threads,infile=read_data['assembly'],outfile=mlst_out)
    # cmd = "bash -c '{}'".format(cmd)
    # if run_command(cmd) != 0:
    #     return None
    if os.path.exists(temp_fasta):
        os.remove(temp_fasta)
    return mlst_out
def detect_pmlst(prefix_name, assembly, base_dir='.', threads=0):
    """
    Determine the plasmid MLST (pMLST) profile of an assembly.

    :param prefix_name: sample prefix used to name outputs
    :param assembly: assembly FASTA, optionally gzipped
    :param base_dir: working directory
    :param threads: number of CPU cores; 0 means use the default
    :return: path to the pMLST TSV report
    """
    if threads == 0:
        threads = NUM_CORES_DEFAULT

    out_dir = os.path.join(base_dir, prefix_name + '_pmlst')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    pmlst_out = os.path.join(out_dir, prefix_name + '_pmlst.tsv')
    # Reuse a previous result if present
    if os.path.isfile(pmlst_out):
        return pmlst_out

    # cmd = 'abricate --quiet --threads {threads} --nopath --db plasmidfinder {infile} > {outfile}'.format(threads=threads,infile=read_data['assembly'],outfile=oriREP_out)
    # cmd = "bash -c '{}'".format(cmd)
    # if run_command(cmd) != 0:
    #     return None

    # FIXME: review this
    temp_fasta = os.path.join(out_dir, prefix_name + '.fasta')
    query_fasta = assembly
    if assembly.endswith('.gz'):
        query_fasta = temp_fasta
        run_command('gunzip -c {} > {}'.format(assembly, query_fasta))

    # One tab-separated line: file, scheme, sequence type, then the alleles
    result = mlst.find_mlst(query_file=query_fasta,
                            blastdb='db/pmlst/blast/pmlst.fa',
                            mlstdb='db/pmlst/pubmlst',
                            num_threads=threads)
    with open(pmlst_out, 'w') as handle:
        handle.write("%s\t%s\t%s" % (result['file'], result['scheme'], result['st']))
        for allele in result['profile']:
            handle.write("\t%s" % allele)
        handle.write("\n")

    # cmd = 'mlst --quiet --threads {threads} --nopath {infile} > {outfile}'.format(threads=threads,infile=read_data['assembly'],outfile=mlst_out)
    # cmd = "bash -c '{}'".format(cmd)
    # if run_command(cmd) != 0:
    #     return None
    if os.path.exists(temp_fasta):
        os.remove(temp_fasta)
    return pmlst_out
def map_reads_to_assembly_bwamem(prefix_name, assembly, reads, base_dir='.', threads=0, memory=50, timing_log=None, **kargs):
    """
    Map reads back to an assembly with bwa mem, then sort and index with samtools.

    :param prefix_name: sample prefix used to name the SAM/BAM files
    :param assembly: path to the assembly FASTA (indexed on demand)
    :param reads: dict with 'pe1'/'pe2' (paired-end) or 'se' (single-end) paths
    :param base_dir: working directory (unused; output goes next to the assembly)
    :param threads: number of CPU cores; 0 means use the default
    :param memory: memory budget in GB (currently unused)
    :param timing_log: file to log timing
    :return: path to the sorted, indexed BAM file, or None
    """
    # BUG FIX: threads=0 (the default) was previously passed straight to
    # 'bwa mem -t 0' and 'samtools sort -@0'; normalise it like the other
    # steps in this pipeline.
    if threads == 0:
        threads = NUM_CORES_DEFAULT
    # Build the bwa index once, lazily
    if not os.path.isfile(assembly + '.sa'):
        cmd = 'bwa index ' + assembly
        ret = run_command(cmd, timing_log)
        if ret != 0:
            return None
    path_out = os.path.dirname(assembly)
    cmd = 'bwa mem -t {threads} {index}'.format(threads=threads, index=assembly)
    if 'pe1' in reads and 'pe2' in reads:
        pe_sam = os.path.join(path_out, prefix_name + '_pe.sam')
        pe_bam = os.path.join(path_out, prefix_name + '_pe.bam')
        cmd_bwa_pe = cmd + ' ' + reads['pe1'] + ' ' + reads['pe2'] + ' > ' + pe_sam
        run_command(cmd_bwa_pe, timing_log)
        cmd_st_pe = 'samtools view -u {sam} | samtools sort -@{threads} -o {bam} - ;samtools index {bam}'.format(
            sam=pe_sam, threads=threads, bam=pe_bam)
        run_command(cmd_st_pe, timing_log)
        return pe_bam
    if 'se' in reads:
        se_sam = os.path.join(path_out, prefix_name + '_se.sam')
        se_bam = os.path.join(path_out, prefix_name + '_se.bam')
        cmd_bwa_se = cmd + ' ' + reads['se'] + ' > ' + se_sam
        run_command(cmd_bwa_se, timing_log)
        cmd_st_se = 'samtools view -u {sam} | samtools sort -@{threads} -o {bam} - ;samtools index {bam}'.format(
            sam=se_sam, threads=threads, bam=se_bam)
        run_command(cmd_st_se, timing_log)
        return se_bam
def detect_amr_amrfinder(prefix_name,faa_file,fna_file,gff_file,genus=None,species=None, base_dir='.', db='db/amrfinderplus/data/latest', timing_log=None, threads=0):
    """
    Run AMR analysis, using AMRfinderPlus for searching amr genes, virulome genes and point mutaions.

    :param prefix_name: sample prefix used to name outputs
    :param faa_file: protein FASTA (optionally gzipped), e.g. prokka output
    :param fna_file: nucleotide FASTA (optionally gzipped)
    :param gff_file: annotation GFF; assumed gzipped (see NOTE below)
    :param genus: optional genus hint for organism-specific point mutations
    :param species: optional species hint; overrides genus when given
    :param base_dir: working directory
    :param db: path to the AMRFinderPlus database
    :param timing_log: file to log timing
    :param threads: number of CPU cores; 0 means use the default
    :return: (resistome, point-mutation, virulome) TSV paths, or None on failure
    """
    if threads == 0:
        threads = NUM_CORES_DEFAULT
    path_out = os.path.join(base_dir, prefix_name+'_amrfinder')
    if not os.path.exists(path_out):
        os.makedirs(path_out)
    #AMR profiling with CARD. TODO: replace by consensus db later
    # ret_out holds the raw amrfinder report; it is split into the three
    # category files below and deleted at the end.
    ret_out = os.path.join(path_out, prefix_name + '_amr.tsv')
    amr_out = os.path.join(path_out, prefix_name+ '_resistome.tsv')
    virulen_out = os.path.join(path_out, prefix_name + '_virulome.tsv')
    point_out = os.path.join(path_out, prefix_name + '_point.tsv')
    #using abricate
    # cmd = 'abricate --quiet --threads {threads} --nopath --db card {infile} > {outfile}'.format(threads=threads,infile=read_data['assembly'],outfile=amr_out)
    # cmd = "bash -c '{}'".format(cmd)
    # if run_command(cmd) != 0:
    #     return None
    #using build-in function
    #element_finder.search_amr(sample=read_data['assembly'],output=amr_out,threads=threads)
    #using AMRFinderPlus
    #process files in prokka folder, prepare for using amrfinder
    #move files from prokka to temp folder
    temp_dir = os.path.join(base_dir, 'amr_temp')
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    temp_gff_file=os.path.join(temp_dir,prefix_name+'.gff')
    source_gff_file=None
    # for root, dirs, files in os.walk(read_data['annotation']):
    #     for _file in files:
    #         if _file.endswith(('.faa')):
    #             faa_file = shutil.copyfile(os.path.join(str(root),_file), faa_file)
    #         if _file.endswith(('.fna')):
    #             fna_file = shutil.copyfile(os.path.join(str(root),_file), fna_file)
    #         if _file.endswith(('.gff')):
    #             source_gff_file = os.path.join(str(root),_file)
    source_gff_file=gff_file
    #add Name property to column 9 of gff file (AMRfinder need it!) and remove #fasta section
    if not source_gff_file==None:
        destination= open(temp_gff_file, "w" )
        #source= open( source_gff_file, "r" )
        # NOTE(review): gzip.open is used unconditionally, so the input GFF
        # is assumed to be gzipped -- confirm against callers.
        with gzip.open(source_gff_file,'rt') as source:
            for line in source:
                if line.startswith('##FASTA'):
                    # everything after ##FASTA is sequence data; drop it
                    break
                if line.startswith('##'):
                    destination.write( line )
                else:
                    # AMRFinder expects a Name= attribute in column 9
                    newline=line.replace('ID=','Name=')
                    destination.write( newline )
        #source.close()
        destination.close()
    # Decompress the protein and nucleotide inputs if they are gzipped
    gunzip_faa= faa_file;
    if faa_file.endswith('.gz'):
        gunzip_faa =os.path.join(temp_dir,prefix_name+'.faa')
        cmd = 'gunzip -c {} > {}'.format(faa_file, gunzip_faa)
        run_command(cmd)
    gunzip_fna= fna_file;
    if fna_file.endswith('.gz'):
        gunzip_fna =os.path.join(temp_dir,prefix_name+'.fna')
        cmd = 'gunzip -c {} > {}'.format(fna_file, gunzip_fna)
        run_command(cmd)
    # Default command: no organism-specific point-mutation screening
    cmd = 'amrfinder -d {database} -p {faa_file} -n {fna_file} -g {gff_file} --plus --threads {threads} -o {outfile}'\
        .format(
            database=db,
            faa_file=gunzip_faa,
            fna_file=gunzip_fna,
            gff_file=temp_gff_file,
            threads=threads,
            outfile=ret_out
        )
    #full option if has --Genus
    if not genus==None:
        organism = genus.capitalize()
        # species (when given and non-empty) overrides the genus-derived name
        if not species==None and not species=='':
            organism = species.replace(' ','_')
        # organisms supported by amrfinder's -O (organism) option
        organisms = ['Campylobacter', 'Enterococcus_faecalis', 'Enterococcus_faecium', 'Escherichia', 'Klebsiella', 'Salmonella', 'Staphylococcus_aureus', 'Staphylococcus_pseudintermedius', 'Vibrio_cholerae']
        if organism in organisms:
            cmd = 'amrfinder -d {database} -p {faa_file} -O {organism} -n {fna_file} -g {gff_file} --plus --threads {threads} -o {outfile}'\
                .format(
                    database=db,
                    faa_file=gunzip_faa,
                    organism=organism,
                    fna_file=gunzip_fna,
                    gff_file=temp_gff_file,
                    threads=threads,
                    outfile=ret_out
                )
        else:
            # unsupported organism: fall back to the generic command
            cmd = 'amrfinder -d {database} -p {faa_file} -n {fna_file} -g {gff_file} --plus --threads {threads} -o {outfile}'\
                .format(
                    database=db,
                    faa_file=gunzip_faa,
                    fna_file=gunzip_fna,
                    gff_file=temp_gff_file,
                    threads=threads,
                    outfile=ret_out
                )
    cmd = "bash -c '{}'".format(cmd)
    if run_command(cmd,timing_log) != 0:
        return None
    #clean up:
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    #proccess output files: split the raw report into resistome/point/virulome
    virulen=[]
    amr=[]
    point=[]
    header=[]
    with open(ret_out) as tsvfile:
        reader = csv.DictReader(tsvfile, dialect='excel-tab')
        for row in reader:
            header=row.keys()
            if row['Element type']=='VIRULENCE':
                virulen.append(row)
            elif row['Element subtype']=='POINT':
                point.append(row)
            else:
                amr.append(row)
    with open(amr_out, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header,delimiter='\t')
        writer.writeheader()
        for row in amr:
            writer.writerow(row)
    with open(point_out, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header,delimiter='\t')
        writer.writeheader()
        for row in point:
            writer.writerow(row)
    with open(virulen_out, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header,delimiter='\t')
        writer.writeheader()
        for row in virulen:
            writer.writerow(row)
    # the raw combined report is no longer needed
    if os.path.exists(ret_out):
        os.remove(ret_out)
    return amr_out,point_out,virulen_out
def annotate_prokka(prefix_name, assembly, genus=None, species=None, strain=None, gram=None, base_dir='.', overwrite=False, timing_log=None, threads=0):
    """
    Run the annotation process using prokka and gzip the outputs.

    :param prefix_name: sample prefix used for prokka's --prefix/--locus
    :param assembly: assembly FASTA, optionally gzipped
    :param genus, species, strain, gram: optional taxonomy hints for prokka
    :param base_dir: working directory
    :param overwrite: rerun even if previous gff/gbk outputs exist
    :param timing_log: file to log timing
    :param threads: number of CPU cores; 0 means use the default
    :return: paths to the gzipped gff, faa, ffn, fna and gbk files
    :raises Exception: when prokka returns a non-zero exit code
    """
    if threads == 0:
        threads = NUM_CORES_DEFAULT
    path_out = os.path.join(base_dir, prefix_name + '_prokka')
    if not os.path.exists(path_out):
        os.makedirs(path_out)

    annotation_gbk = os.path.join(path_out, prefix_name + '.gbk.gz')
    annotation_gff = path_out + '/' + str(prefix_name) + '.gff.gz'
    annotation_faa = path_out + '/' + str(prefix_name) + '.faa.gz'
    annotation_ffn = path_out + '/' + str(prefix_name) + '.ffn.gz'
    annotation_fna = path_out + '/' + str(prefix_name) + '.fna.gz'
    if os.path.isfile(annotation_gff) and os.path.isfile(annotation_gbk) and (
            not overwrite):
        # Dont run again if gff/gbk file exists
        logger.info('GFF and GBK files found, skip annotating')
        return annotation_gff, annotation_faa, annotation_ffn, annotation_fna, annotation_gbk

    gunzip_fasta = assembly
    if assembly.endswith('.gz'):
        # the '.fin' extension is not in the keep-list below, so the cleanup
        # loop removes this temporary decompressed copy automatically
        gunzip_fasta = os.path.join(path_out, prefix_name + '.fin')
        cmd = 'gunzip -c {} > {}'.format(assembly, gunzip_fasta)
        run_command(cmd)

    cmd = 'prokka --force --cpus {threads} --addgenes --mincontiglen 200'.format(
        threads=threads)
    cmd += ' --prefix {sample_id} --locus {sample_id} --outdir {path} '.format(
        sample_id=prefix_name, path=path_out)
    if genus:
        cmd += ' --genus ' + genus
    if species:
        species = species.replace(' ', '_')
        cmd += ' --species ' + species
    if strain:
        cmd += ' --strain ' + strain
    if gram:
        cmd += ' --gram ' + gram
    cmd += ' ' + gunzip_fasta
    cmd = "bash -c '{}'".format(cmd)
    ret = run_command(cmd, timing_log)
    if ret != 0:
        # BUG FIX: the message previously had one placeholder for two
        # arguments ('non-zero ()!'), silently dropping the return code.
        raise Exception('Command {} returns non-zero ({})!'.format(cmd, ret))

    # gzip the outputs we keep; delete everything else prokka produced
    for file_name in glob.glob(os.path.join(path_out, '*')):
        ext = file_name[-3:]
        if ext in ['gff', 'gbk', 'ffn', 'faa', 'fna']:  # fna?
            run_command('gzip {}'.format(file_name))
        else:
            os.remove(file_name)
    return annotation_gff, annotation_faa, annotation_ffn, annotation_fna, annotation_gbk
def run_gene_phylogeny_iqtree(roary_folder, collection_dir, threads=8, overwrite=False, timing_log=None):
    """
    Run phylogenetic analysis of gene clusters. If the list of samples has
    not changed, and none of the samples has changed, the existing tree will
    be kept unless overwrite is set to True

    Parameters
    ----------
    roary_folder: str
        folder containing roary output (Rtab matrix and per-gene alignments)
    collection_dir: str
        working directory of the collection
    threads: int
        number of threads to use
    overwrite: bool
        whether to overwrite existing result even if input did not change
    timing_log: str
        file to log timing
    Returns
    -------
    str
        path to the alignments directory
    """
    alignment_dir = os.path.join(collection_dir, 'alignments')
    gene_cluster_file = roary_folder + '/gene_presence_absence.Rtab'
    gene_df = pd.read_csv(gene_cluster_file, sep='\t', index_col='Gene')
    gene_df.fillna('', inplace=True)

    # Write one iqtree command per gene cluster, then fan them out with parallel
    cmds_file = os.path.join(alignment_dir, "phylo_cmds")
    with open(cmds_file, 'w') as cmds:
        for gene_id, row in gene_df.iterrows():
            # Only analyse if there are at least 3 genes
            if row.sum() < 3:
                continue
            gene_id = re.sub(r'\W+', '', gene_id)
            gene_dir = os.path.join(alignment_dir, gene_id)
            if not os.path.exists(gene_dir):
                os.makedirs(gene_dir)
            # check if done before
            iqtree_output = os.path.join(gene_dir, gene_id + '.treefile')
            if (not overwrite) and os.path.isfile(iqtree_output):
                continue
            # NOTE(review): the roary alignment is moved to a '.gz' name
            # without being compressed -- confirm downstream readers accept it.
            gene_aln_file_roary = os.path.join(roary_folder,
                                               'pan_genome_sequences',
                                               gene_id + '.fa.aln')
            gene_aln_file = os.path.join(gene_dir, gene_id + '.fna.aln.gz')
            if os.path.isfile(gene_aln_file_roary):
                shutil.move(gene_aln_file_roary, gene_aln_file)
            if not os.path.isfile(gene_aln_file):
                logger.info('{} does not exist'.format(gene_aln_file))
                continue
            # First try with bootstrapping; fall back to a plain run on failure
            cmd = f"iqtree -s {gene_aln_file} --prefix {gene_dir+'/'+gene_id} -m GTR -quiet -T 1 -B 1000 2> /dev/null"
            cmd += f" || iqtree -s {gene_aln_file} --prefix {gene_dir+'/'+gene_id} -m GTR -quiet -T 1"
            # translate to protein alignment
            #protein_aln_file = os.path.join(gene_dir, gene_id + '.faa.aln')
            #with open(protein_aln_file, 'w') as fh:
            #    for record in SeqIO.parse(gene_aln_file, 'fasta'):
            #        trans = translate_dna(str(record.seq))
            #        new_record = SeqRecord(Seq(trans), id=record.id,)
            #        SeqIO.write(new_record, fh, 'fasta')
            #cmd = f"iqtree -s {protein_aln_file} --prefix {gene_dir+'/'+gene_id} -m LG -quiet -T 1"
            #cmd = f"fasttree -nt -gtr -quiet {gene_aln_file} > {gene_dir+'/'+gene_id+'.treefile'} && echo '{gen_list_string}' > {gene_list_json}"
            cmds.write(cmd + '\n')

    cmd = f"parallel --bar -j {threads} -a {cmds_file}"
    ret = run_command(cmd, timing_log)
    return alignment_dir
def run_phylogeny_parsnp(base_dir, ref_genome, genome_dir='.', threads=0, timing_log=None):
    """
    Run parsnp to create phylogeny tree

    :param base_dir: working directory
    :param ref_genome: path to reference genome; if None, one genome in the
        genome directory will be chosen to be the reference
    :param genome_dir: directory of (possibly gzipped) genome FASTA files
    :param threads: number of core CPU; 0 means use the default
    :param timing_log: file to log timing (new optional parameter; the old
        code referenced an undefined 'timing_log' name)
    :return: path to the phylogeny folder
    :raises Exception: when parsnp fails
    """
    # BUG FIX: threads=0 (the default) was passed straight to 'parsnp -p 0'
    if threads == 0:
        threads = NUM_CORES_DEFAULT
    phylogeny_folder = os.path.join(base_dir, 'pangenome/phylogeny')
    if not os.path.exists(phylogeny_folder):
        os.makedirs(phylogeny_folder)
    else:
        # an existing folder is treated as a finished previous run
        return phylogeny_folder
    temp_folder = os.path.join(phylogeny_folder, 'temp_phylo')
    if not os.path.isdir(temp_folder):
        os.makedirs(temp_folder)

    def _stage_genome(src, dst):
        # BUG FIX: the old code ran gunzip AND an unconditional cat/zcat on
        # every file, so the second command clobbered the decompressed copy
        # (and the zcat variant used a bare filename). Pick one based on
        # the extension instead.
        if src.endswith('.gz'):
            run_command('gunzip -c {} > {}'.format(src, dst))
        else:
            run_command('cat {} > {}'.format(src, dst))

    # Stage every genome into the temp folder; take the first one as the
    # reference when none was given.
    sample_list = []
    files = os.listdir(genome_dir)
    if ref_genome is None:
        for i, f in enumerate(files):
            fasta_file = os.path.join(temp_folder, os.path.basename(f))
            _stage_genome(os.path.join(genome_dir, f), fasta_file)
            if i == 0:
                ref_genome = fasta_file
            else:
                sample_list.append(fasta_file)
    else:
        for f in files:
            fasta_file = os.path.join(temp_folder, os.path.basename(f))
            _stage_genome(os.path.join(genome_dir, f), fasta_file)
            sample_list.append(fasta_file)

    myCmd = 'parsnp -r {} -d {} -o {} -p {}'.format(ref_genome,
                                                    ' '.join(sample_list),
                                                    phylogeny_folder, threads)
    # BUG FIX: was 'et = run_command(...)', leaving 'ret' undefined below
    ret = run_command(myCmd, timing_log)
    if ret != 0:
        raise Exception('Error running parsnp')
    run_command('gzip {}'.format(os.path.join(phylogeny_folder, 'parsnp.xmfa')))
    run_command('gzip {}'.format(os.path.join(phylogeny_folder, 'parsnp.ggr')))
    shutil.rmtree(temp_folder)
    return phylogeny_folder
def run_alignment_by_parsnp(roary_folder, ffn_dir, base_dir, overwrite=False, timing_log=None, threads=0):
    """
    Run aligment process to create both multi-alignment and phylogeny tree
    for each gene in gene clusters

    :param roary_folder: folder containing roary output (gene_presence_absence.csv.gz)
    :param ffn_dir: path to folder of .ffn.gz files (output of prokka)
    :param base_dir: working directory
    :param overwrite: rerun even if an alignments folder already exists
    :param timing_log: file to log timing (currently unused)
    :param threads: number of CPU cores passed to parsnp
    :return: path to the alignments directory
    """
    gene_cluster_file = roary_folder + '/gene_presence_absence.csv.gz'

    # Index every CDS sequence from all samples by its sequence id so the
    # per-gene FASTA files can be written below.
    dict_cds = {}
    for root, dirs, files in os.walk(ffn_dir):
        for _file in files:
            if _file.endswith('.ffn.gz'):
                with gzip.open(os.path.join(root, _file), 'rt') as fn:
                    for seq in SeqIO.parse(fn, 'fasta'):
                        dict_cds[seq.id] = seq

    #make folder contains sequences for each gene
    alignment_dir = os.path.join(base_dir, 'alignments')
    # an existing folder is treated as a finished previous run
    if (not overwrite) and os.path.exists(alignment_dir):
        return alignment_dir
    if not os.path.exists(alignment_dir):
        os.makedirs(alignment_dir)

    gene_df = pd.read_csv(gene_cluster_file, dtype=str)
    gene_df.fillna('', inplace=True)
    # columns 15+ of roary's presence/absence matrix are the per-sample columns
    sample_columns = list(gene_df.columns)[14:]
    for _, row in gene_df.iterrows():
        gene_id = row['Gene']
        gene_list = []
        for sample_column in sample_columns:
            if row[sample_column]:
                # roary can pool together genes from the same sample and tab-separate them
                for sample_gene in row[sample_column].split('\t'):
                    gene_list.append(sample_gene)
                # TODO: make sure all samples in this gene have not updated
        gene_list = sorted(gene_list)
        # Only analyse if there are more than 3 genes
        if len(gene_list) < 3:
            logger.info('There are too few genes for {} skipping'.format(gene_id))
            continue

        gene_dir = os.path.join(alignment_dir, gene_id)
        # Check if done before: the saved gene list doubles as a "done" marker
        gene_list_json = os.path.join(gene_dir, 'gene_list.json')
        # if os.path.isfile(os.path.join(gene_dir, 'parsnp.tree')) and (not overwrite):
        if os.path.isfile(gene_list_json):
            with open(gene_list_json) as fn:
                existing_gene_list = json.load(fn)
            if gene_list == existing_gene_list:
                logger.info('Phylogeny for gene {} done, skipping'.format(gene_id))
                continue  # for _, row

        # Write one FASTA per member gene for parsnp to consume
        gene_file_dir = os.path.join(gene_dir, 'files')
        if not os.path.exists(gene_file_dir):
            os.makedirs(gene_file_dir)
        gene_files = []
        for sample_gene in gene_list:
            gene_file = os.path.join(gene_file_dir, sample_gene + '.fasta')
            SeqIO.write(dict_cds[sample_gene], gene_file, 'fasta')
            gene_files.append(gene_file)

        # Use the first gene as the reference
        cmd = 'parsnp -d {} -r {} -o {} -p {}'.format(
            ' '.join(gene_files[1:]), gene_files[0], gene_dir, threads)
        ret = run_command(cmd)
        # if ret != 0:
        #     raise Exception('error')
        with open(gene_list_json, 'w') as fn:
            json.dump(gene_list, fn)
        #run_command('gzip {}'.format(os.path.join(gene_dir, 'parsnp.xmfa')))
        #run_command('gzip {}'.format(os.path.join(gene_dir, 'parsnp.ggr')))
        if os.path.exists(gene_file_dir):
            shutil.rmtree(gene_file_dir)
        #clean up
        run_command('rm -f ' + os.path.join(gene_dir, '*.ini ') +
                    os.path.join(gene_dir, '*block* '))
        shutil.rmtree(os.path.join(gene_dir, 'blocks'), True)
        shutil.rmtree(os.path.join(gene_dir, 'tmp'), True)
    return alignment_dir