def cleanupAndWrite(assembly_file, output_file, length=None, coverage=None,
                    image_file=None, base_ID=None):
    """Filter the contigs of a SKESA assembly and write the survivors.

    The input and output sequence formats (and compression flags) are guessed
    from the file names by ``utilities.guessFileFormat``; no sanity checks are
    performed on the arguments.

    Args:
        assembly_file: Path of the assembly to read.
        output_file: Path the retained contigs are written to.  Contigs that
            are dropped go to a sibling file whose name is built with
            ``utilities.appendToFilename(output_file, '_discarded')``.
        length: Minimum-length threshold handed to ``cleanup_SKESA``
            (``None`` is treated as 0).
        coverage: Minimum-coverage threshold handed to ``cleanup_SKESA``
            (``None`` is treated as 0).
        image_file: Forwarded to ``cleanup_SKESA`` as ``export_contig_graph``.
        base_ID: When not ``None``, contigs are first passed through
            ``seq_utilities.standardize_contig_names`` with this ID.

    Returns:
        0 when the cleaned assembly was written, 1 when ``cleanup_SKESA``
        returned ``None``.
    """
    ## Load the assemblies
    assembly_format, assembly_compressed = utilities.guessFileFormat(assembly_file)
    output_format, output_compressed = utilities.guessFileFormat(output_file)
    if assembly_format != output_format:
        print("Warning on cleanup: input and output formats do not match ({} and {})".format(assembly_format, output_format))
    with utilities.flexible_handle(assembly_file, assembly_compressed, 'rt') as fin:
        seqs = list(SeqIO.parse(fin, assembly_format))

    if base_ID is not None:
        # Only the renamed contigs are needed; the second return value is unused here.
        seqs, _ = seq_utilities.standardize_contig_names(seqs, base_ID)

    if length is None:
        length = 0
    if coverage is None:
        coverage = 0

    print("Removing low quality contigs from SKESA assembly. Length < {}; coverage < {}".format(length, coverage))
    discard_file = utilities.appendToFilename(output_file, '_discarded')  ##ext is same as output file
    updated_seqs = cleanup_SKESA(seqs,
                                 minimum_length=length,
                                 minimum_coverage=coverage,
                                 discard_file=discard_file,
                                 export_contig_graph=image_file)
    if updated_seqs is None:
        print("Unable to clean and orient the assembly: \n\t" + assembly_file)
        return 1

    print("Retained {} of {} contigs.".format(len(updated_seqs), len(seqs)))
    # Write through the same helper used for reading so that the compression
    # implied by the output file name is honored instead of producing a
    # plain-text file with a misleading extension.
    with utilities.flexible_handle(output_file, output_compressed, 'wt') as fout:
        SeqIO.write(updated_seqs, fout, output_format)
    print('Saved reoriented assembly at {}'.format(output_file))
    return 0
def cleanup_SPADES(contigs, minimum_length, minimum_coverage,
                   export_contig_data=None, discard_file=None,
                   export_contig_graph=None):
    """Keep only the contigs that exceed both a length and a coverage threshold.

    The per-contig table comes from ``AssemblyStats.parse_SPADES``; this
    function reads its ``Contig_Size``, ``Coverage`` and ``Contig`` columns.

    Args:
        contigs: Contigs handed to ``AssemblyStats.parse_SPADES``.
        minimum_length: A contig is kept only if ``Contig_Size`` is strictly
            greater than this value.
        minimum_coverage: A contig is kept only if ``Coverage`` is strictly
            greater than this value.
        export_contig_data: Forwarded unchanged to ``parse_SPADES``.
        discard_file: When not ``None``, the rejected contigs are written
            there in the format guessed from the file name.
        export_contig_graph: Forwarded unchanged to ``parse_SPADES``.

    Returns:
        List of the ``Contig`` entries that passed both filters.

    Raises:
        ValueError: If any value in the ``Coverage`` column is null, since the
            coverage filter cannot be applied.
        Exception: Any error raised while writing ``discard_file`` is printed
            and re-raised.
    """
    contig_table = AssemblyStats.parse_SPADES(
        contigs,
        export_contig_graph=export_contig_graph,
        export_contig_data=export_contig_data)
    good_length = contig_table['Contig_Size'] > minimum_length

    # Fail loudly rather than silently filtering on length alone.
    if contig_table.Coverage.isnull().any():
        raise ValueError(
            "Null value in coverage table during Assembly Cleanup. Unable to filter"
        )

    good_coverage = contig_table['Coverage'] > minimum_coverage
    good_contig_bool = good_coverage & good_length
    good_contigs = contig_table[good_contig_bool]['Contig'].tolist()

    if discard_file is not None:
        try:
            discard_contigs = contig_table[~good_contig_bool]['Contig'].tolist()
            SeqIO.write(discard_contigs, discard_file,
                        utilities.guessFileFormat(discard_file)[0])
        except Exception as e:
            # Surface the reason on stdout before propagating to the caller.
            print(e)
            raise
    return good_contigs
def seq_guess_and_write(seqs, filename):
    """Write ``seqs`` to ``filename`` in the format implied by the file name.

    The format and compression flag come from ``utilities.guessFileFormat``.
    When no format can be inferred, a message is printed and nothing is
    written.
    """
    seq_format, compressed = utilities.guessFileFormat(filename)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + filename)
        return
    with utilities.flexible_handle(filename, compressed, 'wt') as handle:
        SeqIO.write(seqs, handle, seq_format)
def main():
    """Command-line entry point: run an ``AmpliconExtractor`` over a set of genomes.

    Parses the command-line options, passes the remaining positional
    arguments to ``genomeOrganizer.placeAssembliesIntoDataFrame`` and, when
    that returns a table, evaluates every row with the extractor.  Once the
    extractor exists, ``sys.stdout`` is replaced by a ``utilities.Logger``
    and is not restored.
    """
    ### Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--primer_file',
                        help='Location of primer information',
                        required=True)
    parser.add_argument('-r', '--repository',
                        help='Location of genome assembly repository')
    parser.add_argument('--keep_temp', action='store_true',
                        help='Keep temporary BLAST files')
    parser.add_argument('--version', '-V', action='version',
                        version='%(prog)s {}.{}'.format(script_version, script_subversion))
    parser.add_argument('args', nargs=argparse.REMAINDER)
    args = parser.parse_args()

    script_dir = os.path.dirname(__file__)
    argv = [os.path.basename(__file__)] + args.args
    default_settings = os.path.join(script_dir, genomeOrganizer.SETTING_FILE)
    repository = args.repository if args.repository else None

    gd = genomeOrganizer.placeAssembliesIntoDataFrame(argv,
                                                      GO_settings=default_settings,
                                                      repository=repository)
    if gd is None:
        return

    extractor = AmpliconExtractor(args.primer_file, generate_output=True)

    ##TODO find a better default location for the log
    if extractor.outDir is not None:
        logFile = os.path.join(extractor.outDir, "AmpliconExtractor.log")
    else:
        logFile = "AmpliconExtractor.log"
    sys.stdout = utilities.Logger(logFile)

    if extractor.outDir is not None:
        utilities.safeOverwriteTable(genomeOrganizer.default_list(extractor.outDir), gd, 'tab')

    for _, genome in gd.iterrows():
        file_format, compressed = utilities.guessFileFormat(genome.loc['Filename'])
        extractor.evaluateGenome(genome.loc['Lab_ID'],
                                 genome.loc['Filename'],
                                 file_format,
                                 compressed,
                                 keep_temp=args.keep_temp)

    ## NOTE: column sorting (as done in LocusExtractor) is not applied here.
    extractor.finish(keep_temp=args.keep_temp)
    print("Finished. Results saved at {}".format(extractor.outDir))
def seqs_guess_and_parse2list(filename):
    """Parse every record in ``filename`` into a list.

    The format and compression flag come from ``utilities.guessFileFormat``.

    Returns:
        A list of the parsed records, or ``None`` (after printing a message)
        when the format cannot be inferred from the file name.
    """
    seq_format, compressed = utilities.guessFileFormat(filename)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + filename)
        return None
    with utilities.flexible_handle(filename, compressed, 'rt') as handle:
        records = list(SeqIO.parse(handle, seq_format))
    return records
def seqs_guess_and_parse2dict(filename):
    """Parse every record in ``filename`` into a dictionary via ``SeqIO.to_dict``.

    The format and compression flag come from ``utilities.guessFileFormat``.
    Existence of the file is not checked up front.

    Returns:
        The dictionary built by ``SeqIO.to_dict``, or ``None`` (after printing
        a message) when the format cannot be inferred from the file name.

    Raises:
        TypeError: If ``filename`` is not a ``str``.
    """
    if not isinstance(filename, str):
        raise TypeError("Filename must be string, is {}".format(type(filename)))

    seq_format, compressed = utilities.guessFileFormat(filename)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + filename)
        return None
    with utilities.flexible_handle(filename, compressed, 'rt') as handle:
        records_by_id = SeqIO.to_dict(SeqIO.parse(handle, seq_format))
    return records_by_id
def describeSequences(sequenceFile):
    """Summarize a sequence file: size on disk, record count and total length.

    Returns:
        A ``defaultdict(int)`` that always holds ``'FileSize'``.  For each
        parsed record ``'Sequences'`` is incremented by one and
        ``'Nucleotides'`` by ``len(record)``; those keys are only created once
        a record has been seen.  When the format cannot be inferred, a message
        is printed and only ``'FileSize'`` is set.
    """
    summary = defaultdict(int)
    summary['FileSize'] = os.path.getsize(sequenceFile)

    ## guess and parse
    seq_format, compressed = utilities.guessFileFormat(sequenceFile)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + sequenceFile)
        return summary

    with utilities.flexible_handle(sequenceFile, compressed, 'rt') as handle:
        for record in SeqIO.parse(handle, seq_format):
            summary['Sequences'] += 1
            summary['Nucleotides'] += len(record)
    ##TODO: add Q30 and such?
    return summary
def cleanupAndWrite(assembly_file, output_file, circle_new_start=None,
                    reverse_contig=None, closed_circle=None, broken_circle=None,
                    circularize_with_Ns=0, length=None, coverage=None,
                    report_file=None, reference=None, assembler=None,
                    working_dir=None):
    """Clean and/or reorient an assembly and write the result to ``output_file``.

    Two mutually exclusive modes are supported:

    * Explicit shift -- when ``circle_new_start`` or ``reverse_contig`` is
      truthy, the assembly must hold exactly one contig, which is passed to
      ``shiftCirclarChromosome`` (``N_padding=0`` for ``closed_circle``,
      ``N_padding=-1`` for ``broken_circle``).  ``reference`` is not consulted
      in this mode.
    * Criteria based -- otherwise, non-circular assemblies are filtered
      (by length only when ``assembler`` is ``None``; through
      ``cleanup_SPADES`` when it is ``'spades'`` in any letter case) and, when
      ``reference`` names an existing file, the contigs are reoriented with
      ``reorientClosedChromosome`` (circular) or ``reorientContigs``.

    No sanity checks are performed on the argument combination.

    Args:
        assembly_file: Path of the assembly to read; format and compression
            are guessed from the name.
        output_file: Path of the file to write; format and compression are
            guessed from the name.
        circle_new_start: Forwarded to ``shiftCirclarChromosome``.
        reverse_contig: Forwarded to ``shiftCirclarChromosome``.
        closed_circle: Truthy if the assembly is a closed circle.
        broken_circle: Truthy if the assembly is a circle that is not closed.
        circularize_with_Ns: Values > 0 request scaffolding, which is not
            implemented.
        length: Contigs must be strictly longer than this (``None`` -> 0).
        coverage: Minimum coverage for the SPAdes filter (``None`` -> 0).
        report_file: Forwarded to ``cleanup_SPADES`` as ``export_contig_data``;
            its directory also receives the ``_discarded`` file.  When
            ``None``, the directory of ``output_file`` is used instead.
        reference: Path of a reference used for reorientation.
        assembler: ``None`` or the assembler name (only SPAdes is handled).
        working_dir: Working directory for the reorientation helpers;
            defaults to ``output_file`` without its extension for
            ``reorientContigs``.

    Returns:
        0 when a cleaned assembly was written, 1 on any failure (a message is
        printed describing the reason).
    """
    ## Load the assemblies
    assembly_format, assembly_compressed = utilities.guessFileFormat(assembly_file)
    output_format, output_compressed = utilities.guessFileFormat(output_file)
    if assembly_format != output_format:
        print("Warning on cleanup: input and output formats do not match ({} and {})".format(assembly_format, output_format))
    with utilities.flexible_handle(assembly_file, assembly_compressed, 'rt') as fin:
        seqs = list(SeqIO.parse(fin, assembly_format))

    updated_seqs = None
    if circle_new_start or reverse_contig:
        # Precise manipulation of single contig
        if len(seqs) > 1:
            print("Error: User provided explicit reorientation instructions for a contig, but multiple contigs are present in assembly: \n" + assembly_file)
            return 1
        elif not seqs:
            # Guard against indexing seqs[0] on an empty assembly.
            print("Error: no contigs could be read from assembly: \n" + assembly_file)
            return 1
        elif closed_circle:
            print("Shifting closed circle...")
            updated_seqs = shiftCirclarChromosome(seqs[0], circle_new_start,
                                                  reverse_contig, N_padding=0)
        elif broken_circle:
            print("Shifting broken circle...")
            updated_seqs = shiftCirclarChromosome(seqs[0], circle_new_start,
                                                  reverse_contig, N_padding=-1)
        elif circularize_with_Ns > 0:
            print('Scaffolding not implemented')
        else:
            print('To shift a chromosome, you must specify whether the circle is closed or broken')
    else:
        ## Complex criteria for manipulation
        if closed_circle and len(seqs) > 1:
            print("Warning: Untested parameters. User specified 'closed circle' but multiple contigs are present in assembly")

        ## Remove the low-quality contigs:
        ##TODO: consider if another parameter should be passed. At least specify if it came from SPAdes
        circular = closed_circle or broken_circle  ##Circles imply high-quality sequence
        if not circular:
            if length is None:
                length = 0
            if coverage is None:
                coverage = 0
            if assembler is None:
                print("Removing short contigs from assembly.")
                updated_seqs = [x for x in seqs if len(x) > length]
            elif assembler.upper() == 'SPADES':
                print("Removing low quality contigs from SPADES assembly. Length < {}; coverage < {}".format(length, coverage))
                # report_file is optional; fall back to the output location so
                # the discard file still has a directory to live in.
                report_anchor = report_file if report_file is not None else output_file
                raw_filename = os.path.join(os.path.dirname(report_anchor),
                                            os.path.basename(assembly_file))
                image_file = None  ##Note: plotting has been moved to the calculateStats routine
                discard_file = utilities.appendToFilename(raw_filename, '_discarded')  ##ext is same as assembly file
                updated_seqs = cleanup_SPADES(seqs,
                                              minimum_length=length,
                                              minimum_coverage=coverage,
                                              export_contig_data=report_file,
                                              discard_file=discard_file,
                                              export_contig_graph=image_file)
            else:
                print("Error: assembler ({}) unknown for non-circular assembly. Not attempting to cleanup contigs in file: \n{}".format(assembler, assembly_file))
                return 1

        ## Reorient to reference if requested
        if reference:
            input_seqs = updated_seqs if updated_seqs is not None else seqs
            if os.path.isfile(reference):
                if circular:
                    if len(input_seqs) > 1:
                        print('Warning: multiple contigs in "circular" assembly. Only one contig will be reoriented and I cannot tell you which one. Untested.')
                    if len(input_seqs) > 0:
                        N_padding = -1  ##Do not religate
                        if closed_circle:
                            N_padding = 0
                        elif circularize_with_Ns > 0:
                            print('Scaffolding not implemented')
                            return 1
                        print("Reorienting circular chromosome to reference...")
                        #Note: only treated as closed if N_padding >= 0
                        updated_seqs = reorientClosedChromosome(input_seqs,
                                                                reference,
                                                                N_padding=N_padding,
                                                                working_dir=working_dir)
                    else:  ## Len == 0
                        print("None of {} contigs passed your exclusion criteria. Exiting ".format(len(seqs)))
                        return 1
                else:
                    if working_dir is None:
                        working_dir = os.path.splitext(output_file)[0]
                    draft_name = os.path.splitext(os.path.basename(assembly_file))[0]
                    print("Reorienting contigs")
                    reorder_stats = reorientContigs(input_seqs,
                                                    reference,
                                                    working_dir,
                                                    name=draft_name,
                                                    input_format=assembly_format)  ##Will be genbank format
                    if isinstance(reorder_stats, dict) and ('ReorderedDraft' in reorder_stats):
                        ##Excessive to reload... but it fits in this flow
                        updated_seqs = seq_utilities.seqs_guess_and_parse2list(reorder_stats['ReorderedDraft'])
                    else:
                        updated_seqs = None
            else:
                print("Unable to realign to reference because there is no refernce file: {}".format(reference))

    if updated_seqs is None:
        print("Unable to clean and orient the assembly: \n\t" + assembly_file)
        return 1

    # Write through the same helper used for reading so that the compression
    # implied by the output file name is honored instead of producing a
    # plain-text file with a misleading extension.
    with utilities.flexible_handle(output_file, output_compressed, 'wt') as fout:
        SeqIO.write(updated_seqs, fout, output_format)
    print('Saved cleaned assembly at {}'.format(output_file))
    return 0
def calculateStats(filelist_or_frame, out_file=None, ass_format=None, image_dir=None, save_details=False):
    """Compute per-assembly summary statistics for a collection of files.

    One row is produced for every file supplied, including files that cannot
    be parsed (those get a ``Note`` and zero ``Contig_Count`` /
    ``Bases_In_Contigs``).

    Args:
        filelist_or_frame: Either a ``list`` of file names or a
            ``pandas.DataFrame`` with a ``Filename`` column.
        out_file: When not ``None``, the result table is written there as CSV.
            If writing fails, the CSV is printed to stdout instead.
        ass_format: Assembler name to use for every file.  When it is not a
            string, the assembler is guessed from the substrings ``'spades'``
            or ``'skesa'`` in each file name (otherwise ``None``).
        image_dir: When a string, the directory is created via
            ``utilities.safeMakeDir`` and a contig size/coverage scatterplot
            is saved there for each assembly (only if plotting is available
            and both columns exist).
        save_details: When truthy, the per-contig table of each assembly is
            saved next to it as ``*_contigs.xlsx``.

    Returns:
        A ``pandas.DataFrame`` with one row per file (not merged with the
        input frame), or ``None`` if no row was produced.

    Raises:
        ValueError: If the input is neither a list nor a DataFrame, or is
            empty.
        Exception: Any error while assessing an individual file is reported
            on stdout and re-raised.
    """
    if isinstance(filelist_or_frame, list):
        filelist = filelist_or_frame
        fileframe = None
    elif isinstance(filelist_or_frame, pd.DataFrame):
        filelist = filelist_or_frame.Filename
        fileframe = filelist_or_frame
    else:
        raise ValueError("can only calculate stats on a list of filenames or a DataFrame with a Filename field")
    if len(filelist) == 0:
        raise ValueError("AssemblyStats CalculateStats requires a list of files with length > 0. Contact developer")

    assFrame = None
    if isinstance(image_dir, str):
        utilities.safeMakeDir(image_dir)

    assemblyList = []
    for filename in filelist:
        if isinstance(ass_format, str):
            assembler = ass_format
        elif 'spades' in filename:
            assembler = 'spades'
            print("Guessing assembler as {}".format(assembler))
        elif 'skesa' in filename:
            assembler = 'skesa'
            print("Guessing assembler as {}".format(assembler))
        else:
            assembler = None

        genome_format, _ = utilities.guessFileFormat(filename)
        ##This will report data for all files provided. Junk files will have 0 contigs and 0 size
        AssInfo = {'Filename': filename}
        if genome_format is None:
            AssInfo['Note'] = 'Could not identify genome format'
        else:
            try:
                contig_list = seq_utilities.seqs_guess_and_parse2list(filename)
                if isinstance(contig_list, list) and len(contig_list) > 0:
                    contigFrame = getContigStats(contig_list,
                                                 hasQual=(genome_format == 'fastq'),
                                                 assembler=assembler)
                    ##Note: Coverage is cast to float upstream, but has been seen to arrive as string in this frame.
                    if 'Coverage' in contigFrame.columns:
                        contigFrame['Coverage'] = contigFrame['Coverage'].astype(float)
                    if 'Contig_Size' in contigFrame.columns:
                        contigFrame['Contig_Size'] = contigFrame['Contig_Size'].astype(int)
                    if save_details:
                        contig_file = utilities.setExt(utilities.appendToFilename(filename, '_contigs'), '.xlsx')
                        contigFrame.to_excel(contig_file)
                    assert len(contig_list) == len(contigFrame), "Not all contigs are in dataframe"

                    ## Optional scatterplot of contig size vs. coverage
                    if isinstance(image_dir, str) and os.path.isdir(image_dir) and has_plt:
                        if ('Coverage' in contigFrame.columns) and ('Contig_Size' in contigFrame.columns):
                            tempFrame = contigFrame[['Coverage', 'Contig_Size']].copy()
                            # Pre-bind so the failure message and cleanup never
                            # hit an unbound name if an early step raises.
                            image_file = None
                            fig = None
                            try:
                                raw_filename = os.path.join(image_dir, os.path.basename(filename))
                                image_file = utilities.setExt(raw_filename, 'png')
                                if isinstance(image_file, str):
                                    axes = tempFrame.plot(kind='scatter', x='Contig_Size', y='Coverage', logx=True, logy=True)
                                    fig = axes.get_figure()
                                    fig.savefig(image_file)
                            except Exception as e:
                                print('Failed to save contig stats scatterplot at {}'.format(image_file))
                                for c in tempFrame.columns:
                                    print(tempFrame[c])
                                utilities.printExceptionDetails(e)
                            finally:
                                # Release the figure whether or not saving
                                # worked, so figures do not pile up across files.
                                if fig is not None:
                                    try:
                                        plt.close(fig)
                                    except Exception:
                                        print("Failed to close image...")
                        elif assembler in ['skesa', 'spades']:
                            print("Unable to produce contig stats scatterplot because necessary fields are not present ('Contig_Size' and 'Coverage')")

                    ## Size-based statistics
                    AssInfo['Contig_Count'] = str(len(contig_list))
                    contigSizes = contigFrame['Contig_Size'].astype(int)
                    assemblySize = sum(contigSizes)
                    AssInfo['Bases_In_Contigs'] = str(assemblySize)
                    largeContigs = contigSizes > 10000
                    AssInfo['Large_Contig_Count'] = str(sum(largeContigs))
                    AssInfo['Small_Contig_Count'] = str(sum(~largeContigs))
                    AssInfo['Bases_In_Large_Contigs'] = str(sum(contigSizes[largeContigs]))
                    AssInfo['Bases_In_Small_Contigs'] = str(sum(contigSizes[~largeContigs]))
                    emptyContigs = contigSizes == 0
                    if sum(emptyContigs) > 0:
                        print('\n#### WARNING #### EMPTY CONTIGS ########\n')
                        print('\n\t'.join(contigFrame[emptyContigs].Contig_Name.tolist()))
                        print('\n########################################\n')

                    ## Coverage-based statistics
                    if 'Coverage' in contigFrame.columns:
                        contigCoverage = contigFrame['Coverage']  ##should be float, but seems to get converted to a string with some versions
                        if len(contigCoverage[largeContigs]) > 0:
                            min_c = min(contigCoverage[largeContigs])
                            AssInfo['Min_Coverage_Large_Contigs'] = str(min_c)
                            max_c = max(contigCoverage[largeContigs])
                            AssInfo['Max_Ratio_of_Coverage_Large_Contigs'] = '{:0.2f}'.format(max_c / min_c)
                            lowC_contigs = contigFrame['Coverage'] < (min_c / 2)
                            AssInfo['Low_Coverage_Contig_Count'] = sum(lowC_contigs)
                            AssInfo['Low_Coverage_Contig_Bases'] = sum(contigFrame.loc[lowC_contigs, 'Contig_Size'])
                        else:
                            AssInfo['Min_Coverage_Large_Contigs'] = 'N/A'
                            AssInfo['Max_Ratio_of_Coverage_Large_Contigs'] = 'N/A'
                            AssInfo['Low_Coverage_Contig_Count'] = 'N/A'
                            AssInfo['Low_Coverage_Contig_Bases'] = 'N/A'
                        # Size-weighted mean coverage over all contigs
                        coverageProduct = contigFrame['Contig_Size'].astype(int) * contigFrame['Coverage']
                        coverageProductSum = sum(coverageProduct)
                        meanCoverage = coverageProductSum / assemblySize
                        AssInfo['Mean_Coverage'] = meanCoverage
                        lowC_contigs = contigFrame['Coverage'] < (meanCoverage / 2)
                        AssInfo['HalfCov_Contig_Count'] = sum(lowC_contigs)
                        AssInfo['HalfCov_Contig_Bases'] = sum(contigFrame.loc[lowC_contigs, 'Contig_Size'])

                    if feature_head in contigFrame:
                        featureCounts = contigFrame[feature_head].astype(int)
                        AssInfo[feature_head] = sum(featureCounts)

                    ### Sum ambiguous nucleotides
                    ambigCounts = contigFrame['Ambiguous_nucleotides'].astype(int)
                    AssInfo['Ambiguous_nucleotides'] = sum(ambigCounts)

                    ## Import the quality scores
                    for c in contigFrame.columns:
                        if c.startswith(quality_head):
                            AssInfo[c] = str(sum(contigFrame[c]))

                    ##Calculate N50, N75 and N90
                    N_stats = calcN50_stats(contigSizes.tolist(), thresholds=[50, 75, 90])
                    for n, size in N_stats.items():
                        header = "N{}".format(n)
                        AssInfo[header] = str(size)
                else:
                    print("failed to parse file: " + filename)
                    AssInfo['Note'] = 'No sequences parsed from file'
            except Exception as e:
                print("Warning: failed to assess file: " + filename)
                print("Exception: {}".format(e))
                raise

        # Files that could not be assessed still get a row with zeroed totals.
        if 'Bases_In_Contigs' not in AssInfo:
            AssInfo['Bases_In_Contigs'] = 0
        if 'Contig_Count' not in AssInfo:
            # Same key that successful files populate, so the default lands in the right column.
            AssInfo['Contig_Count'] = 0
        assemblyList.append(AssInfo)

    if len(assemblyList) > 0:
        print("Stats for {} assemblies.".format(len(assemblyList)))
        assFrame = pd.DataFrame(assemblyList)
        if isinstance(fileframe, pd.DataFrame):
            saveFrame = pd.merge(fileframe, assFrame, on='Filename')
        else:
            saveFrame = assFrame.set_index('Filename')
        if out_file is not None:
            try:
                saveFrame.to_csv(out_file)
            except Exception as e:
                # Best effort: do not lose the results if the target is unwritable.
                print(saveFrame.to_csv())
                print()
                print("Failed to print to target file {}. \nPrinted results to screen (above)".format(out_file))
                utilities.printExceptionDetails(e)
    else:
        print("Failed to evaluate assemblies...")
        print("attempted to evaluate the following files:" + "\n".join(filelist))
    return assFrame