def reporter(self):
    """
    Creates the metadata report (combinedMetadata.csv in self.reportpath) by pulling specific attributes
    from the metadata objects.

    One comma-separated row is written per sample in self.metadata, preceded by a header row built from
    self.headers. GenObject.returnattr results are used as ready-made CSV fields: the comparisons against
    ',' and '-,' below rely on them already ending with the separating comma.
    """
    printtime('Creating summary report', self.starttime)
    header = '{}\n'.format(','.join(self.headers))

    def strip_semicolons(field):
        """Remove trailing semicolons from a comma-terminated CSV field"""
        # The field ends with the CSV comma, so a plain rstrip(';') would never reach the semicolons
        if field.endswith(','):
            return field[:-1].rstrip(';') + ','
        return field.rstrip(';')

    # Collect every CSV fragment in a list, and join once at the end rather than repeatedly extending a string
    fields = list()
    add = fields.append
    for sample in self.metadata:
        # Add the value of the appropriate attribute to the results
        add(GenObject.returnattr(sample, 'name'))
        # SampleName
        add(GenObject.returnattr(sample.run, 'SamplePlate'))
        # Genus
        add(GenObject.returnattr(sample.sixteens_full, 'genus'))
        # SequencingDate
        add(GenObject.returnattr(sample.run, 'Date'))
        # Analyst
        add(GenObject.returnattr(sample.run, 'InvestigatorName'))
        # SamplePurity
        add(GenObject.returnattr(sample.confindr, 'contam_status'))
        # GenomeQAML prediction
        prediction = GenObject.returnattr(sample.GenomeQAML, 'prediction')
        if prediction != ',':
            add(prediction)
        else:
            # Empty prediction: fall back to the run description (metagenome) or the run status
            try:
                description = sample.run.Description
                if description == 'metagenome':
                    add('{description},'.format(description=description))
                else:
                    add('{status},'.format(status=sample.run.status))
            except KeyError:
                add('ND,')
        # N50
        n50 = GenObject.returnattr(sample.quality_features_polished, 'n50')
        add(n50 if n50 != '-,' else 'ND,')
        # NumContigs
        add(GenObject.returnattr(sample.quality_features_polished, 'num_contigs'))
        # TotalLength
        add(GenObject.returnattr(sample.quality_features_polished, 'genome_length'))
        # MeanInsertSize
        add(GenObject.returnattr(sample.mapping, 'MeanInsertSize'))
        # InsertSizeSTD
        add(GenObject.returnattr(sample.mapping, 'StdInsertSize'))
        # AverageCoverageDepth
        add(GenObject.returnattr(sample.mapping, 'MeanCoveragedata'))
        # CoverageDepthSTD
        add(GenObject.returnattr(sample.mapping, 'StdCoveragedata'))
        # PercentGC
        add(GenObject.returnattr(sample.quality_features_polished, 'gc'))
        # MASH_ReferenceGenome
        add(GenObject.returnattr(sample.mash, 'closestrefseq'))
        # MASH_NumMatchingHashes
        add(GenObject.returnattr(sample.mash, 'nummatches'))
        # 16S_result
        add(GenObject.returnattr(sample.sixteens_full, 'sixteens_match'))
        # rMLST_Result
        try:
            # If the number of matches to the closest reference profile is 53, return the profile number
            if sample.rmlst.matches == 53:
                add(GenObject.returnattr(sample.rmlst, 'sequencetype'))
            else:
                # Create a set of all the genes present in the results (gene name split from allele)
                rmlst_gene_set = {gene.split('_')[0] for gene in sample.rmlst.results}
                # A full set of 53 genes without a profile match is a new profile; anything else is ND
                add('new,' if len(rmlst_gene_set) == 53 else 'ND,')
        except KeyError:
            add('ND,')
        # MLST_Result
        try:
            if sample.mlst.matches == 7:
                add(GenObject.returnattr(sample.mlst, 'sequencetype'))
            else:
                # Create a set of all the genes present in the results (gene name split from allele)
                mlst_gene_set = {gene.split('_')[0] for gene in sample.mlst.results}
                # All genes present, but no perfect match to a reference profile: the profile is new.
                # Otherwise indicate that the profile is ND
                add('new,' if len(mlst_gene_set) == 7 else 'ND,')
        except KeyError:
            add('ND,')
        # MLST_gene_X_alleles
        try:
            # Create a set of all the genes present in the results (gene name split from allele)
            gene_set = {gene.split('_')[0] for gene in sample.mlst.results}
            for gene in sorted(gene_set):
                # Determine all the alleles present for this gene. The gene names are compared exactly (using
                # the same split as above), so a gene whose name is contained in another gene's name does not
                # pick up the other gene's alleles
                allele_list = [allele for allele in sample.mlst.results if allele.split('_')[0] == gene]
                # Multiple alleles are separated by ';' - a single allele is written as is
                add('{},'.format(';'.join(allele_list)))
            # If there are fewer than seven matching alleles, add a ND for each missing result
            if len(gene_set) < 7:
                add((7 - len(gene_set)) * 'ND,')
        except KeyError:
            add('ND,ND,ND,ND,ND,ND,ND,')
        # CoreGenesPresent
        add(GenObject.returnattr(sample.coregenome, 'coreresults'))
        # E_coli_Serotype
        try:
            serotype = '{oset} ({opid}):{hset} ({hpid}),' \
                .format(oset=';'.join(sample.serosippr.o_set),
                        opid=sample.serosippr.best_o_pid,
                        hset=';'.join(sample.serosippr.h_set),
                        hpid=sample.serosippr.best_h_pid)
            # Make sure that the string was populated with values rather than 'NA' or '-'
            add('ND,' if serotype == '- (-):- (-),' else serotype)
        except KeyError:
            add('ND,')
        # SISTR_serovar_antigen
        add(strip_semicolons(GenObject.returnattr(sample.sistr, 'serovar_antigen')))
        # SISTR_serovar_cgMLST
        add(GenObject.returnattr(sample.sistr, 'serovar_cgmlst'))
        # SISTR_serogroup
        add(GenObject.returnattr(sample.sistr, 'serogroup'))
        # SISTR_h1
        add(strip_semicolons(GenObject.returnattr(sample.sistr, 'h1')))
        # SISTR_h2
        add(strip_semicolons(GenObject.returnattr(sample.sistr, 'h2')))
        # SISTR_serovar
        add(GenObject.returnattr(sample.sistr, 'serovar'))
        # GeneSeekr_Profile
        try:
            if sample.genesippr.report_output:
                add(';'.join(sample.genesippr.report_output) + ',')
            else:
                add('ND,')
        except KeyError:
            add('ND,')
        # Vtyper_Profile
        try:
            # Since the vtyper attribute can be empty, check first
            profile = sorted(sample.vtyper.profile)
            if profile:
                add(';'.join(profile) + ',')
            else:
                add('ND,')
        except KeyError:
            add('ND,')
        # AMR_Profile and resistant/sensitive status
        if sample.resfinder_assembled.pipelineresults:
            # Profile
            add(';'.join(sorted(sample.resfinder_assembled.pipelineresults)) + ',')
            # Resistant/Sensitive
            add('Resistant,')
        else:
            # Profile
            add('ND,')
            # Resistant/Sensitive
            add('Sensitive,')
        # Plasmid Result
        try:
            plasmid_profile = sorted(sample.plasmidextractor.plasmids)
            if plasmid_profile:
                add(';'.join(plasmid_profile) + ',')
            else:
                add('ND,')
        except KeyError:
            add('ND,')
        # TotalPredictedGenes
        add(GenObject.returnattr(sample.prodigal, 'predictedgenestotal'))
        # PredictedGenesOver3000bp
        add(GenObject.returnattr(sample.prodigal, 'predictedgenesover3000bp'))
        # PredictedGenesOver1000bp
        add(GenObject.returnattr(sample.prodigal, 'predictedgenesover1000bp'))
        # PredictedGenesOver500bp
        add(GenObject.returnattr(sample.prodigal, 'predictedgenesover500bp'))
        # PredictedGenesUnder500bp
        add(GenObject.returnattr(sample.prodigal, 'predictedgenesunder500bp'))
        # NumClustersPF
        add(GenObject.returnattr(sample.run, 'NumberofClustersPF'))
        # Percent of reads mapping to PhiX control
        add(GenObject.returnattr(sample.run, 'phix_aligned'))
        # Error rate calculated from PhiX control
        add(GenObject.returnattr(sample.run, 'error_rate'))
        # LengthForwardRead
        add(GenObject.returnattr(sample.run, 'forwardlength'))
        # LengthReverseRead
        add(GenObject.returnattr(sample.run, 'reverselength'))
        # Real time strain
        add(GenObject.returnattr(sample.run, 'Description'))
        # Flowcell
        add(GenObject.returnattr(sample.run, 'flowcell'))
        # MachineName
        add(GenObject.returnattr(sample.run, 'instrument'))
        # PipelineVersion
        add(self.commit + ',')
        # AssemblyDate
        add(datetime.now().strftime('%Y-%m-%d'))
        # Append a new line to the end of the results for this sample
        add('\n')
    data = ''.join(fields)
    # Replace any NA values with ND
    # NOTE(review): this is a plain substring replacement, so 'NA' inside a real value (e.g. a sample or
    # analyst name) is rewritten as well - confirm whether that is acceptable
    cleandata = data.replace('NA', 'ND')
    with open(os.path.join(self.reportpath, 'combinedMetadata.csv'), 'w') as metadatareport:
        metadatareport.write(header)
        metadatareport.write(cleandata)
def legacy_reporter(self):
    """
    Creates an output (legacy_combinedMetadata.csv in self.reportpath) that is compatible with the legacy
    metadata reports. This method will be removed once a new database scheme is implemented.

    The header row is taken from the keys of the first sample's ordered dictionary; one row of values is
    written per sample in self.metadata. 'NA' is removed from the text, and interior fields consisting only
    of '-' are blanked.
    """
    from collections import OrderedDict
    printtime('Creating legacy summary report', self.starttime)
    # Collect the report lines in a list, and join them once at the end
    lines = list()
    for sample in self.metadata:
        # Create an ordered dictionary of column name: value to be printed in the final report
        data = OrderedDict([
            ('SampleName', sample.name),
            ('N50', str(sample.quality_features_polished.n50)),
            ('NumContigs', str(sample.quality_features_polished.num_contigs)),
            ('TotalLength', str(sample.quality_features_polished.genome_length)),
            ('MeanInsertSize', sample.mapping.MeanInsertSize),
            # Only keep the text before the first 'X' of the coverage value
            ('AverageCoverageDepth', sample.mapping.MeanCoveragedata.split("X")[0]),
            ('ReferenceGenome', sample.mash.closestrefseq),
            ('RefGenomeAlleleMatches', '-'),
            ('16sPhylogeny', sample.sixteens_full.genus),
            ('rMLSTsequenceType', sample.rmlst.sequencetype),
            ('MLSTsequencetype', sample.mlst.sequencetype),
            ('MLSTmatches', str(sample.mlst.matchestosequencetype)),
            # returnattr appends the CSV comma, which must not end up inside the joined row
            ('coreGenome', GenObject.returnattr(sample.coregenome, 'coreresults').rstrip(',')),
            ('SeroType', '{oset}:{hset}'.format(oset=';'.join(sample.serosippr.o_set),
                                                hset=';'.join(sample.serosippr.h_set))),
            ('geneSeekrProfile', ';'.join(result for result, pid in
                                          sorted(sample.genesippr.results.items()))),
            ('vtyperProfile', ';'.join(sorted(sample.vtyper.profile))),
            ('percentGC', str(sample.quality_features_polished.gc)),
            ('TotalPredictedGenes', str(sample.prodigal.predictedgenestotal)),
            ('predictedgenesover3000bp', str(sample.prodigal.predictedgenesover3000bp)),
            ('predictedgenesover1000bp', str(sample.prodigal.predictedgenesover1000bp)),
            ('predictedgenesover500bp', str(sample.prodigal.predictedgenesover500bp)),
            ('predictedgenesunder500bp', str(sample.prodigal.predictedgenesunder500bp)),
            ('SequencingDate', sample.run.Date),
            ('Investigator', sample.run.InvestigatorName),
            ('TotalClustersinRun', str(sample.run.TotalClustersinRun)),
            ('NumberofClustersPF', str(sample.run.NumberofClustersPF)),
            ('PercentOfClusters', str(sample.run.PercentOfClusters)),
            ('LengthofForwardRead', str(sample.run.forwardlength)),
            ('LengthofReverseRead', str(sample.run.reverselength)),
            ('Project', str(sample.run.SampleProject)),
            ('PipelineVersion', self.commit),
        ])
        # The header is written once, using the column names of the first sample
        if not lines:
            lines.append(','.join(data))
        lines.append(','.join(data.values()))
    row = '\n'.join(lines)
    # Remove any NA values
    # NOTE(review): this is a plain substring replacement, so 'NA' inside a real value is removed as well -
    # confirm whether that is acceptable
    cleanrow = row.replace('NA', '')
    # Blank the fields that only contain '-'. str.replace works on non-overlapping matches, so a single pass
    # over adjacent fields (',-,-,') would leave every second '-' in place; repeat until none remain
    while ',-,' in cleanrow:
        cleanrow = cleanrow.replace(',-,', ',,')
    with open(os.path.join(self.reportpath, 'legacy_combinedMetadata.csv'), 'w') as metadatareport:
        metadatareport.write(cleanrow)