Exemple #1
0
def cleanupAndWrite(assembly_file,output_file,length=None,coverage=None,image_file=None,base_ID=None):
    """Filter a SKESA assembly and write the retained contigs to ``output_file``.

    The sequence format and compression flag of both paths are obtained from
    ``utilities.guessFileFormat``; a format mismatch only prints a warning.
    No further sanity checks are performed on the arguments.

    Args:
        assembly_file: Path of the assembly to read.
        output_file: Path the retained contigs are written to. The discard
            file name is derived from it by appending ``_discarded``.
        length: Passed to ``cleanup_SKESA`` as ``minimum_length``
            (``None`` is treated as 0).
        coverage: Passed to ``cleanup_SKESA`` as ``minimum_coverage``
            (``None`` is treated as 0).
        image_file: Passed to ``cleanup_SKESA`` as ``export_contig_graph``.
        base_ID: When not ``None``, contigs are first renamed through
            ``seq_utilities.standardize_contig_names``.

    Returns:
        0 once the output has been written, 1 if ``cleanup_SKESA`` returned
        ``None``.
    """
    in_format, in_compressed = utilities.guessFileFormat(assembly_file)
    out_format, out_compressed = utilities.guessFileFormat(output_file)
    if in_format != out_format:
        print("Warning on cleanup: input and output formats do not match ({} and {})".format(in_format, out_format))

    # Load every record up front; the count is reported after filtering.
    with utilities.flexible_handle(assembly_file, in_compressed, 'rt') as handle:
        seqs = list(SeqIO.parse(handle, in_format))

    if base_ID is not None:
        # Only the renamed contigs are used; the second return value is ignored.
        seqs, _ = seq_utilities.standardize_contig_names(seqs, base_ID)

    length = 0 if length is None else length
    coverage = 0 if coverage is None else coverage

    # This variant always filters with the SKESA routine.
    print("Removing low quality contigs from SKESA assembly. Length < {}; coverage < {}".format(length, coverage))
    discard_file = utilities.appendToFilename(output_file, '_discarded')  # same extension as the output file
    kept = cleanup_SKESA(seqs,
                         minimum_length=length,
                         minimum_coverage=coverage,
                         discard_file=discard_file,
                         export_contig_graph=image_file)
    if kept is None:
        print("Unable to clean and orient the assembly: \n\t" + assembly_file)
        return 1

    print("Retained {} of {} contigs.".format(len(kept), len(seqs)))
    # The output is always written as plain text, whatever the extension says.
    with open(output_file, 'wt') as fout:
        SeqIO.write(kept, fout, out_format)
    print('Saved reoriented assembly at {}'.format(output_file))
    if out_compressed:
        print("Warning. Compression not implemented. The file extension is misleading")
    return 0
Exemple #2
0
def cleanup_SPADES(contigs,
                   minimum_length,
                   minimum_coverage,
                   export_contig_data=None,
                   discard_file=None,
                   export_contig_graph=None):
    """Split SPAdes contigs into retained and discarded sets by size and coverage.

    The contigs are tabulated with ``AssemblyStats.parse_SPADES``; a contig is
    retained when its ``Contig_Size`` is strictly greater than
    ``minimum_length`` AND its ``Coverage`` is strictly greater than
    ``minimum_coverage``.

    Args:
        contigs: Contig records, forwarded to ``AssemblyStats.parse_SPADES``.
        minimum_length: Exclusive lower bound on ``Contig_Size``.
        minimum_coverage: Exclusive lower bound on ``Coverage``.
        export_contig_data: Forwarded to ``AssemblyStats.parse_SPADES``.
        discard_file: When not ``None``, the rejected contigs are written
            there in the format guessed from the file name.
        export_contig_graph: Forwarded to ``AssemblyStats.parse_SPADES``.

    Returns:
        List of the retained entries of the table's ``Contig`` column.

    Raises:
        Exception: If any value in the ``Coverage`` column is null, since the
            coverage filter cannot be applied.
    """
    contig_table = AssemblyStats.parse_SPADES(
        contigs,
        export_contig_graph=export_contig_graph,
        export_contig_data=export_contig_data)
    good_length = contig_table['Contig_Size'] > minimum_length
    # Refuse to filter on incomplete coverage data. The original had a
    # length-only fallback after this raise, but it was unreachable.
    if contig_table.Coverage.isnull().any():
        raise Exception(
            "Null value in coverage table during Assembly Cleanup. Unable to filter"
        )
    good_coverage = contig_table['Coverage'] > minimum_coverage
    good_contig_bool = good_coverage & good_length
    good_contigs = contig_table[good_contig_bool]['Contig'].tolist()
    if discard_file is not None:
        try:
            discard_contigs = contig_table[~good_contig_bool]['Contig'].tolist()
            SeqIO.write(discard_contigs, discard_file,
                        utilities.guessFileFormat(discard_file)[0])
        except Exception as e:
            # Report the failure, then propagate it to the caller.
            print(e)
            raise
    return good_contigs
Exemple #3
0
def seq_guess_and_write(seqs, filename):
    """Write ``seqs`` to ``filename`` in the format guessed from the file name.

    The format and compression flag come from ``utilities.guessFileFormat``.
    When no format can be inferred, a message is printed and nothing is
    written.
    """
    seq_format, compressed = utilities.guessFileFormat(filename)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + filename)
        return
    with utilities.flexible_handle(filename, compressed, 'wt') as seq_out:
        SeqIO.write(seqs, seq_out, seq_format)
Exemple #4
0
def main():
    """Command-line entry point: extract amplicons from a set of assemblies.

    Parses the command line, asks ``genomeOrganizer`` to place the remaining
    positional arguments into a table of assemblies, then runs
    ``AmpliconExtractor.evaluateGenome`` on every row. After the extractor is
    created, ``sys.stdout`` is replaced by a ``utilities.Logger``. Nothing is
    done if no assembly table is produced.
    """
    ### Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--primer_file',
                        help='Location of primer information',
                        required=True)
    parser.add_argument('-r', '--repository',
                        help='Location of genome assembly repository')
    parser.add_argument('--keep_temp',
                        action='store_true',
                        help='Keep temporary BLAST files')
    parser.add_argument('--version', '-V',
                        action='version',
                        version='%(prog)s {}.{}'.format(script_version, script_subversion))
    parser.add_argument('args', nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # Rebuild an argv-style list for genomeOrganizer from the leftover arguments.
    argv = [os.path.basename(__file__)] + args.args
    default_settings = os.path.join(os.path.dirname(__file__), genomeOrganizer.SETTING_FILE)
    repository = args.repository if args.repository else None
    gd = genomeOrganizer.placeAssembliesIntoDataFrame(argv,
                                                      GO_settings=default_settings,
                                                      repository=repository)
    if gd is None:
        return

    extractor = AmpliconExtractor(args.primer_file, generate_output=True)
    ##TODO find a better default location for the log when there is no output directory
    if extractor.outDir is not None:
        log_path = os.path.join(extractor.outDir, "AmpliconExtractor.log")
    else:
        log_path = "AmpliconExtractor.log"
    sys.stdout = utilities.Logger(log_path)
    if extractor.outDir is not None:
        utilities.safeOverwriteTable(genomeOrganizer.default_list(extractor.outDir), gd, 'tab')

    for _, row in gd.iterrows():
        genome_file = row.loc['Filename']
        file_format, compressed = utilities.guessFileFormat(genome_file)
        extractor.evaluateGenome(row.loc['Lab_ID'],
                                 genome_file,
                                 file_format,
                                 compressed,
                                 keep_temp=args.keep_temp)
    ##TODO: if output columns need sorting, factor the LocusExtractor column-ordering logic into a shared function
    extractor.finish(keep_temp=args.keep_temp)
    print("Finished. Results saved at {}".format(extractor.outDir))
Exemple #5
0
def seqs_guess_and_parse2list(filename):
    """Parse ``filename`` into a list of sequence records.

    The format and compression flag come from ``utilities.guessFileFormat``.

    Returns:
        A list with every record yielded by ``SeqIO.parse``, or ``None`` (after
        printing a message) when the format cannot be inferred.
    """
    seq_format, compressed = utilities.guessFileFormat(filename)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + filename)
        return None
    with utilities.flexible_handle(filename, compressed, 'rt') as seq_in:
        # Materialize inside the ``with`` so parsing finishes before the handle closes.
        return list(SeqIO.parse(seq_in, seq_format))
Exemple #6
0
def seqs_guess_and_parse2dict(filename):
    """Parse ``filename`` into a dictionary built by ``SeqIO.to_dict``.

    The format and compression flag come from ``utilities.guessFileFormat``.
    The existence of the file is not checked here.

    Returns:
        The dictionary of records, or ``None`` (after printing a message) when
        the format cannot be inferred.

    Raises:
        TypeError: If ``filename`` is not a ``str``.
    """
    if not isinstance(filename, str):
        found_type = type(filename)
        raise TypeError("Filename must be string, is {}".format(found_type))
    seq_format, compressed = utilities.guessFileFormat(filename)
    if seq_format is None:
        print("Cannot infer sequence format for file: " + filename)
        return None
    with utilities.flexible_handle(filename, compressed, 'rt') as seq_in:
        records = SeqIO.parse(seq_in, seq_format)
        return SeqIO.to_dict(records)
Exemple #7
0
def describeSequences(sequenceFile):
    """Summarize a sequence file: size on disk, record count and total length.

    Returns:
        A ``defaultdict(int)`` that always holds ``'FileSize'`` (from
        ``os.path.getsize``). ``'Sequences'`` and ``'Nucleotides'`` are
        accumulated per parsed record, so they are only present as explicit
        keys when at least one record was read. If the format cannot be
        inferred, a message is printed and only ``'FileSize'`` is set.
    """
    summary = defaultdict(int)
    summary['FileSize'] = os.path.getsize(sequenceFile)
    seq_format, compressed = utilities.guessFileFormat(sequenceFile)  ##guess and parse
    if seq_format is None:
        print("Cannot infer sequence format for file: " + sequenceFile)
        return summary
    with utilities.flexible_handle(sequenceFile, compressed, 'rt') as seq_in:
        for record in SeqIO.parse(seq_in, seq_format):
            summary['Sequences'] += 1
            summary['Nucleotides'] += len(record)
    ##TODO: add Q30 and such?
    return summary
Exemple #8
0
def cleanupAndWrite(assembly_file,
                    output_file,
                    circle_new_start=None,
                    reverse_contig=None,
                    closed_circle=None,
                    broken_circle=None,
                    circularize_with_Ns=0,
                    length=None,
                    coverage=None,
                    report_file=None,
                    reference=None,
                    assembler=None,
                    working_dir=None):
    """Clean up an assembly, optionally reorient it, and write the result.

    Two mutually exclusive modes are supported:

    * Explicit manipulation of a single contig (``circle_new_start`` or
      ``reverse_contig`` is truthy): the one contig is shifted/reversed with
      ``shiftCirclarChromosome``. The caller must say whether the circle is
      closed or broken.
    * Criteria-based cleanup (otherwise): for non-circular assemblies the
      contigs are filtered, then, if ``reference`` is given, the contigs are
      reoriented against it.

    No sanity checks are performed on the arguments.

    Args:
        assembly_file: Path of the assembly to read.
        output_file: Path the cleaned assembly is written to (always as plain
            text; a warning is printed if the name implies compression).
        circle_new_start: Forwarded to ``shiftCirclarChromosome``.
        reverse_contig: Forwarded to ``shiftCirclarChromosome``.
        closed_circle: Truthy if the chromosome is a closed circle
            (``N_padding=0``).
        broken_circle: Truthy if the chromosome is a broken circle
            (``N_padding=-1``).
        circularize_with_Ns: Only checked for ``> 0``; scaffolding is not
            implemented.
        length: Minimum contig length for filtering (``None`` means 0).
        coverage: Minimum coverage for SPAdes filtering (``None`` means 0).
        report_file: Passed to ``cleanup_SPADES`` as ``export_contig_data``.
            Its directory also receives the ``_discarded`` file; when it is
            ``None`` the directory of ``output_file`` is used instead.
        reference: Path of a reference file to reorient against.
        assembler: ``None`` (length filter only) or ``'SPADES'``
            (case-insensitive); any other value is an error for non-circular
            assemblies.
        working_dir: Forwarded to the reorientation routines; for non-circular
            assemblies it defaults to ``output_file`` without its extension.

    Returns:
        0 if an updated assembly was written, 1 on any failure (a message is
        printed). Note that a circular assembly with no ``reference`` produces
        no updated sequences and therefore returns 1.
    """
    ##Note: no sanity checks
    ## Load the assemblies
    assembly_format, assembly_compressed = utilities.guessFileFormat(
        assembly_file)
    output_format, output_compressed = utilities.guessFileFormat(output_file)
    if assembly_format != output_format:
        print(
            "Warning on cleanup: input and output formats do not match ({} and {})"
            .format(assembly_format, output_format))
    with utilities.flexible_handle(assembly_file, assembly_compressed,
                                   'rt') as fin:
        seqs = [c for c in SeqIO.parse(fin, assembly_format)]
    #Precise manipulation of single contig
    updated_seqs = None
    if circle_new_start or reverse_contig:
        if len(seqs) > 1:
            print(
                "Error: User provided explicit reorientation instructions for a contig, but multiple contigs are present in assembly: \n"
                + assembly_file)
            return 1
        elif closed_circle:
            print("Shifting closed circle...")
            updated_seqs = shiftCirclarChromosome(seqs[0],
                                                  circle_new_start,
                                                  reverse_contig,
                                                  N_padding=0)
        elif broken_circle:
            print("Shifting broken circle...")
            updated_seqs = shiftCirclarChromosome(seqs[0],
                                                  circle_new_start,
                                                  reverse_contig,
                                                  N_padding=-1)
        elif circularize_with_Ns > 0:
            # updated_seqs stays None, so the function reports failure below.
            print('Scaffolding not implemented')
        else:
            print(
                'To shift a chromosome, you must specify whether the circle is closed or broken'
            )
    else:  ## Complex criteria for manipulation
        if closed_circle and len(seqs) > 1:
            print(
                "Warning: Untested parameters. User specified 'closed circle' but multiple contigs are present in assembly"
            )

        ## Remove the low-quality contigs:
        ##TODO: consider if another parameter should be passed. At least specify if  it came from SPAdes
        circular = closed_circle or broken_circle  ##Circles imply high-quality sequence
        if not circular:
            if length is None:
                length = 0
            if coverage is None:
                coverage = 0
            if assembler is None:
                # Without assembler-specific metadata only length can be used.
                print("Removing short contigs from assembly.")
                updated_seqs = [x for x in seqs if len(x) > length]
            elif assembler.upper() == 'SPADES':
                print(
                    "Removing low quality contigs from SPADES assembly. Length < {}; coverage < {}"
                    .format(length, coverage))
                # report_file defaults to None and os.path.dirname(None) raises
                # TypeError, so fall back to the output file's directory for
                # the discard file.
                report_dir = os.path.dirname(
                    report_file if report_file is not None else output_file)
                raw_filename = os.path.join(report_dir,
                                            os.path.basename(assembly_file))
                image_file = None  # utilities.setExt(raw_filename, 'png') ##Note: this has been moved to the calculateStats routine
                discard_file = utilities.appendToFilename(
                    raw_filename, '_discarded')  ##ext is same as assembly file
                updated_seqs = cleanup_SPADES(seqs,
                                              minimum_length=length,
                                              minimum_coverage=coverage,
                                              export_contig_data=report_file,
                                              discard_file=discard_file,
                                              export_contig_graph=image_file)
            else:
                print(
                    "Error: assembler ({}) unknown for non-circular assembly. Not attempting to cleanup contigs in file: \n{}"
                    .format(assembler, assembly_file))
                return 1
        ## Reorient to reference if requested
        if reference:
            # Use the filtered contigs when filtering ran, else the raw ones.
            input_seqs = updated_seqs if updated_seqs is not None else seqs
            if os.path.isfile(reference):
                if circular:
                    if len(input_seqs) > 1:
                        print(
                            'Warning: multiple contigs in "circular" assembly. Only one contig will be reoriented and I cannot tell you which one. Untested.'
                        )
                    if len(input_seqs) > 0:
                        N_padding = -1  ##Do not religate
                        if closed_circle:
                            N_padding = 0
                        elif circularize_with_Ns > 0:
                            print('Scaffolding not implemented')
                            return 1
                        print(
                            "Reorienting circular chromosome to reference...")
                        updated_seqs = reorientClosedChromosome(
                            input_seqs,
                            reference,
                            N_padding=N_padding,
                            working_dir=working_dir
                        )  #Note: only treated as closed if N_padding >= 0
                    else:  ## Len == 0
                        print(
                            "None of {} contigs passed your exclusion criteria. Exiting "
                            .format(len(seqs)))
                        return 1
                else:
                    if working_dir is None:
                        working_dir = os.path.splitext(output_file)[0]
                    draft_name = os.path.splitext(
                        os.path.basename(assembly_file))[0]
                    print("Reorienting contigs")
                    reorder_stats = reorientContigs(
                        input_seqs,
                        reference,
                        working_dir,
                        name=draft_name,
                        input_format=assembly_format)  ##Will be genbank format
                    if isinstance(reorder_stats, dict) and ('ReorderedDraft'
                                                            in reorder_stats):
                        updated_seqs = seq_utilities.seqs_guess_and_parse2list(
                            reorder_stats['ReorderedDraft']
                        )  ##Excessive to reload... but it fits in this flow
                    else:
                        # Reorientation failed; report failure below.
                        updated_seqs = None

            else:
                print(
                    "Unable to realign to reference because there is no refernce file: {}"
                    .format(reference))
    if updated_seqs is None:
        print("Unable to clean and orient the assembly: \n\t" + assembly_file)
        return 1
    else:
        with open(output_file, 'wt') as fout:
            SeqIO.write(updated_seqs, fout, output_format)
        print('Saved cleaned assembly at {}'.format(output_file))
        if output_compressed:
            print(
                "Warning. Compression not implemented. The file extension is misleading"
            )
        return 0
Exemple #9
0
def calculateStats(filelist_or_frame,out_file=None,ass_format=None,image_dir=None,save_details=False):
    """Compute per-assembly summary statistics for a set of sequence files.

    Every input file yields one row, including files that cannot be parsed;
    those get a ``Note`` and zero ``Bases_In_Contigs`` / ``Contig_Count``.

    Args:
        filelist_or_frame: A list of file names, or a ``pandas.DataFrame``
            with a ``Filename`` column.
        out_file: Optional path; the table (merged with the input frame on
            ``Filename`` when a frame was given, otherwise indexed by
            ``Filename``) is written there with ``to_csv``. On failure the CSV
            is printed to the screen instead.
        ass_format: Assembler name to use for every file. When it is not a
            string, the assembler is guessed from ``'spades'`` / ``'skesa'``
            appearing in the file name.
        image_dir: Optional directory (created if needed) for a
            coverage-vs-size scatterplot per file; requires ``has_plt``.
        save_details: When true, the per-contig table of each file is saved
            next to it as ``<name>_contigs.xlsx``.

    Returns:
        A ``pandas.DataFrame`` with one row per input file (the unmerged
        statistics table).

    Raises:
        ValueError: If the input is neither a list nor a DataFrame, or is
            empty.
        Exception: Any error raised while assessing a file is reported and
            re-raised.
    """
    if isinstance(filelist_or_frame,list):
        filelist = filelist_or_frame
        fileframe = None
    elif isinstance(filelist_or_frame,pd.DataFrame):
        filelist = filelist_or_frame.Filename
        fileframe = filelist_or_frame
    else:
        raise ValueError("can only calculate stats on a list of filenames or a DataFrame with a Filename field")
    if len(filelist) == 0:
        raise ValueError("AssemblyStats CalculateStats requires a list of files with length > 0. Contact developer")
    assFrame = None
    if isinstance(image_dir,str):
        utilities.safeMakeDir(image_dir)
    if len(filelist) > 0:
        assemblyList = []
        for filename in filelist:
            if isinstance(ass_format,str):
                assembler = ass_format
            elif ('spades' in filename):
                assembler = 'spades'
                print("Guessing assembler as {}".format(assembler))
            elif ('skesa' in filename):
                assembler = 'skesa'
                print("Guessing assembler as {}".format(assembler))
            else:
                assembler = None
            genome_format,_ = utilities.guessFileFormat(filename)
            AssInfo = {'Filename':filename} ##This will report data for all files provided. Junk files will have 0 contigs and 0 size
            if genome_format is None:
                AssInfo['Note']='Could not identify genome format'
            else:
                try:
                    contig_list = seq_utilities.seqs_guess_and_parse2list(filename)
                    if isinstance(contig_list,list) and len(contig_list) > 0:
                        contigFrame = getContigStats(contig_list,hasQual = (genome_format == 'fastq'),assembler=assembler)
                        if 'Coverage' in contigFrame.columns:
                            contigFrame['Coverage'] = contigFrame['Coverage'].astype(float) ##Note: Coverage is being cast to float in getSpadesStats, but somehow becomes string in this frame.
                        if 'Contig_Size' in contigFrame.columns:
                            contigFrame['Contig_Size'] = contigFrame['Contig_Size'].astype(int)
                        if save_details:
                            contig_file = utilities.setExt(utilities.appendToFilename(filename,'_contigs'),'.xlsx')
                            contigFrame.to_excel(contig_file)
                        assert len(contig_list) == len(contigFrame), "Not all contigs are in dataframe"
                        if isinstance(image_dir,str) and os.path.isdir(image_dir):
                            if has_plt:
                                if ('Coverage' in contigFrame.columns) and ('Contig_Size' in contigFrame.columns):
                                    tempFrame = contigFrame[['Coverage','Contig_Size']].copy()
                                    # Reset per file so the handlers below never see an
                                    # unbound name or a value left over from a previous file.
                                    image_file = None
                                    fig = None
                                    try:
                                        raw_filename = os.path.join(image_dir,os.path.basename(filename))
                                        image_file = utilities.setExt(raw_filename, 'png')
                                        if isinstance(image_file,str):
                                            axes = tempFrame.plot(kind='scatter', x='Contig_Size',y='Coverage',logx=True,logy=True)
                                            fig = axes.get_figure()
                                            fig.savefig(image_file)
                                    except Exception as e:
                                        # Plotting is best-effort: report and carry on with the stats.
                                        print('Failed to save contig stats scatterplot at {}'.format(image_file))
                                        for c in tempFrame.columns:
                                            print(tempFrame[c])
                                        utilities.printExceptionDetails(e)
                                    else:
                                        if fig is not None:
                                            try:
                                                plt.close(fig)
                                            except Exception:
                                                print("Failed to close image...")
                                elif assembler in ['skesa','spades']:
                                    print("Unable to produce contig stats scatterplot because necessary fields are not present ('Contig_Size' and 'Coverage')")
                        AssInfo['Contig_Count']=str(len(contig_list))
                        contigSizes = contigFrame['Contig_Size'].astype(int)
                        assemblySize = sum(contigSizes)
                        AssInfo['Bases_In_Contigs'] = str(assemblySize)
                        # "Large" contigs are those strictly longer than 10,000 bases.
                        largeContigs = contigSizes > 10000
                        AssInfo['Large_Contig_Count'] = str(sum(largeContigs))
                        AssInfo['Small_Contig_Count'] = str(sum(~largeContigs))
                        AssInfo['Bases_In_Large_Contigs'] = str(sum(contigSizes[largeContigs]))
                        AssInfo['Bases_In_Small_Contigs'] = str(sum(contigSizes[~largeContigs]))
                        emptyContigs = contigSizes == 0
                        if sum(emptyContigs) > 0:
                            print('\n#### WARNING #### EMPTY CONTIGS ########\n')
                            print('\n\t'.join(contigFrame[emptyContigs].Contig_Name.tolist()))
                            print('\n########################################\n')
                        if 'Coverage' in contigFrame.columns:
                            contigCoverage = contigFrame['Coverage'] ##should be float, but seems to get converted to a string with some versions
                            if len(contigCoverage[largeContigs]) > 0:
                                min_c = min(contigCoverage[largeContigs])
                                AssInfo['Min_Coverage_Large_Contigs'] = str(min_c)
                                max_c = max(contigCoverage[largeContigs])
                                AssInfo['Max_Ratio_of_Coverage_Large_Contigs'] = '{:0.2f}'.format(max_c/min_c)
                                # Low coverage: below half the minimum coverage of the large contigs.
                                lowC_contigs = contigFrame['Coverage'] < (min_c / 2)
                                AssInfo['Low_Coverage_Contig_Count'] = sum(lowC_contigs)
                                AssInfo['Low_Coverage_Contig_Bases'] = sum(contigFrame.loc[lowC_contigs,'Contig_Size'])
                            else:
                                AssInfo['Min_Coverage_Large_Contigs'] =  'N/A'
                                AssInfo['Max_Ratio_of_Coverage_Large_Contigs'] = 'N/A'
                                AssInfo['Low_Coverage_Contig_Count'] = 'N/A'
                                AssInfo['Low_Coverage_Contig_Bases'] = 'N/A'
                            # Mean coverage is weighted by contig size.
                            coverageProduct = contigFrame['Contig_Size'].astype(int) * contigFrame['Coverage']
                            coverageProductSum = sum(coverageProduct)
                            meanCoverage = coverageProductSum/assemblySize
                            AssInfo['Mean_Coverage'] = meanCoverage
                            lowC_contigs = contigFrame['Coverage'] < (meanCoverage / 2)
                            AssInfo['HalfCov_Contig_Count'] = sum(lowC_contigs)
                            AssInfo['HalfCov_Contig_Bases'] = sum(contigFrame.loc[lowC_contigs,'Contig_Size'])
                        if feature_head in contigFrame:
                            featureCounts = contigFrame[feature_head].astype(int)
                            AssInfo[feature_head] = sum(featureCounts)
                        ### Sum ambiguous nucleotides
                        ambigCounts = contigFrame['Ambiguous_nucleotides'].astype(int)
                        AssInfo['Ambiguous_nucleotides']=sum(ambigCounts)
                        ## Import the quality scores
                        for c in contigFrame.columns:
                            if c.startswith(quality_head):
                                AssInfo[c] = str(sum(contigFrame[c]))
                        ##Calculate N50, N75 and N90
                        N_stats = calcN50_stats(contigSizes.tolist(),thresholds=[50,75,90])
                        for n,size in N_stats.items():
                            header = "N{}".format(n)
                            AssInfo[header] = str(size)
                    else:
                        print("failed to parse file: "+filename)
                        AssInfo['Note'] = 'No sequences parsed from file'
                except Exception as e:
                    print("Warning: failed to assess file: " + filename)
                    print("Exception: {}".format(e))
                    raise

            # Files that could not be assessed still get a row with zero counts.
            if 'Bases_In_Contigs' not in AssInfo:
                AssInfo['Bases_In_Contigs'] = 0
            if 'Contig_Count' not in AssInfo:
                # Was written to 'ContigCount', which created a stray column
                # and left 'Contig_Count' empty for unparsed files.
                AssInfo['Contig_Count'] = 0
            assemblyList.append(AssInfo)
        if len(assemblyList) > 0:
            print("Stats for {} assemblies.".format(len(assemblyList)))
            assFrame = pd.DataFrame(assemblyList)
            if isinstance(fileframe,pd.DataFrame):
                saveFrame = pd.merge(fileframe,assFrame,on='Filename')
            else:
                saveFrame = assFrame.set_index('Filename')
            if (out_file is not None):
                try:
                    saveFrame.to_csv(out_file)
                except Exception as e:
                    # Do not lose the results if the file cannot be written.
                    print(saveFrame.to_csv())
                    print()
                    print("Failed to print to target file {}. \nPrinted results to screen (above)".format(out_file))
                    utilities.printExceptionDetails(e)
        else:
            print("Failed to evaluate assemblies...")
            print("attempted to evaluate the following files:"+"\n".join(filelist))
    return assFrame