예제 #1
0
 def reporter(self):
     """
     Create the combined metadata report by pulling specific attributes from the metadata objects.

     One comma-separated row is written per sample in self.metadata, preceded by a header row built from
     self.headers. The output file is 'combinedMetadata.csv' in self.reportpath. Attribute look-ups that
     raise KeyError are reported as 'ND', and stand-alone 'NA' placeholders are rewritten to 'ND' before
     the report is written.
     """
     # Function-scope import: the module-level import block is not guaranteed to provide re
     import re
     printtime('Creating summary report', self.starttime)
     header = '{}\n'.format(','.join(self.headers))

     def strip_trailing_semicolons(field):
         """
         Remove trailing semicolons from the value of a comma-terminated CSV field. Stripping the raw
         returnattr output directly is a no-op whenever the field ends with the comma separator
         """
         if field.endswith(','):
             return field[:-1].rstrip(';') + ','
         return field.rstrip(';')

     # Collect the pieces of the report in a list, and join them once at the end; repeated string
     # concatenation in the loop is quadratic in the size of the report
     pieces = list()
     for sample in self.metadata:
         # Add the value of the appropriate attribute to the results
         pieces.append(GenObject.returnattr(sample, 'name'))
         # SampleName
         pieces.append(GenObject.returnattr(sample.run, 'SamplePlate'))
         # Genus
         pieces.append(GenObject.returnattr(sample.sixteens_full, 'genus'))
         # SequencingDate
         pieces.append(GenObject.returnattr(sample.run, 'Date'))
         # Analyst
         pieces.append(GenObject.returnattr(sample.run, 'InvestigatorName'))
         # SamplePurity
         pieces.append(GenObject.returnattr(sample.confindr, 'contam_status'))
         # GenomeQAML prediction
         prediction = GenObject.returnattr(sample.GenomeQAML, 'prediction')
         if prediction != ',':
             pieces.append(prediction)
         else:
             # No prediction: fall back to the run description for metagenomes, or the run status
             try:
                 description = sample.run.Description
                 if description == 'metagenome':
                     pieces.append('{description},'.format(description=description))
                 else:
                     pieces.append('{status},'.format(status=sample.run.status))
             except KeyError:
                 pieces.append('ND,')
         # N50
         n50 = GenObject.returnattr(sample.quality_features_polished, 'n50')
         if n50 != '-,':
             pieces.append(n50)
         else:
             pieces.append('ND,')
         # NumContigs
         pieces.append(GenObject.returnattr(sample.quality_features_polished, 'num_contigs'))
         # TotalLength
         pieces.append(GenObject.returnattr(sample.quality_features_polished, 'genome_length'))
         # MeanInsertSize
         pieces.append(GenObject.returnattr(sample.mapping, 'MeanInsertSize'))
         # InsertSizeSTD
         pieces.append(GenObject.returnattr(sample.mapping, 'StdInsertSize'))
         # AverageCoverageDepth
         pieces.append(GenObject.returnattr(sample.mapping, 'MeanCoveragedata'))
         # CoverageDepthSTD
         pieces.append(GenObject.returnattr(sample.mapping, 'StdCoveragedata'))
         # PercentGC
         pieces.append(GenObject.returnattr(sample.quality_features_polished, 'gc'))
         # MASH_ReferenceGenome
         pieces.append(GenObject.returnattr(sample.mash, 'closestrefseq'))
         # MASH_NumMatchingHashes
         pieces.append(GenObject.returnattr(sample.mash, 'nummatches'))
         # 16S_result
         pieces.append(GenObject.returnattr(sample.sixteens_full, 'sixteens_match'))
         # rMLST_Result
         try:
             # If the number of matches to the closest reference profile is 53, return the profile number
             if sample.rmlst.matches == 53:
                 pieces.append(GenObject.returnattr(sample.rmlst, 'sequencetype'))
             else:
                 # Create a set of all the genes present in the results (gene name split from allele)
                 rmlst_gene_set = {
                     gene.split('_')[0]
                     for gene in sample.rmlst.results
                 }
                 # If there are a full set of 53 genes, but no profile match, then this is a new profile
                 if len(rmlst_gene_set) == 53:
                     pieces.append('new,')
                 # Otherwise the profile is set to ND
                 else:
                     pieces.append('ND,')
         except KeyError:
             pieces.append('ND,')
         # MLST_Result
         try:
             if sample.mlst.matches == 7:
                 pieces.append(GenObject.returnattr(sample.mlst, 'sequencetype'))
             else:
                 # Create a set of all the genes present in the results (gene name split from allele)
                 mlst_gene_set = {
                     gene.split('_')[0]
                     for gene in sample.mlst.results
                 }
                 # If there are all the genes present, but no perfect match to a reference profile, state that
                 # the profile is new
                 if len(mlst_gene_set) == 7:
                     pieces.append('new,')
                 # Otherwise indicate that the profile is ND
                 else:
                     pieces.append('ND,')
         except KeyError:
             pieces.append('ND,')
         # MLST_gene_X_alleles
         try:
             mlst_results = sample.mlst.results
             # Create a set of all the genes present in the results (gene name split from allele)
             gene_set = {allele.split('_')[0] for allele in mlst_results}
             for gene in sorted(gene_set):
                 # Determine all the alleles that are present for each gene. Compare the exact gene prefix
                 # rather than using a substring test, so that a gene whose name is contained within another
                 # gene name does not pick up the alleles of the other gene
                 allele_list = [allele for allele in mlst_results if allele.split('_')[0] == gene]
                 # Multiple alleles for a gene are separated by a ';'; a single allele is added as is
                 pieces.append('{},'.format(';'.join(allele_list)))
             # If there are fewer than seven matching alleles, add a ND for each missing result
             if len(gene_set) < 7:
                 pieces.append((7 - len(gene_set)) * 'ND,')
         except KeyError:
             pieces.append('ND,ND,ND,ND,ND,ND,ND,')
         # CoreGenesPresent
         pieces.append(GenObject.returnattr(sample.coregenome, 'coreresults'))
         # E_coli_Serotype
         try:
             serotype = '{oset} ({opid}):{hset} ({hpid}),'\
                 .format(oset=';'.join(sample.serosippr.o_set),
                         opid=sample.serosippr.best_o_pid,
                         hset=';'.join(sample.serosippr.h_set),
                         hpid=sample.serosippr.best_h_pid)
             # Make sure that the string was populated with values rather than '-'
             if serotype == '- (-):- (-),':
                 pieces.append('ND,')
             else:
                 pieces.append(serotype)
         except KeyError:
             pieces.append('ND,')
         # SISTR_serovar_antigen
         pieces.append(strip_trailing_semicolons(GenObject.returnattr(sample.sistr, 'serovar_antigen')))
         # SISTR_serovar_cgMLST
         pieces.append(GenObject.returnattr(sample.sistr, 'serovar_cgmlst'))
         # SISTR_serogroup
         pieces.append(GenObject.returnattr(sample.sistr, 'serogroup'))
         # SISTR_h1
         pieces.append(strip_trailing_semicolons(GenObject.returnattr(sample.sistr, 'h1')))
         # SISTR_h2
         pieces.append(strip_trailing_semicolons(GenObject.returnattr(sample.sistr, 'h2')))
         # SISTR_serovar
         pieces.append(GenObject.returnattr(sample.sistr, 'serovar'))
         # GeneSeekr_Profile
         try:
             if sample.genesippr.report_output:
                 pieces.append(';'.join(sample.genesippr.report_output) + ',')
             else:
                 pieces.append('ND,')
         except KeyError:
             pieces.append('ND,')
         # Vtyper_Profile
         try:
             # Since the vtyper attribute can be empty, check first
             profile = sorted(sample.vtyper.profile)
             if profile:
                 pieces.append(';'.join(profile) + ',')
             else:
                 pieces.append('ND,')
         except KeyError:
             pieces.append('ND,')
         # AMR_Profile and resistant/sensitive status
         if sample.resfinder_assembled.pipelineresults:
             # Profile
             pieces.append(';'.join(sorted(sample.resfinder_assembled.pipelineresults)) + ',')
             # Resistant/Sensitive
             pieces.append('Resistant,')
         else:
             # Profile
             pieces.append('ND,')
             # Resistant/Sensitive
             pieces.append('Sensitive,')
         # Plasmid Result
         try:
             plasmid_profile = sorted(sample.plasmidextractor.plasmids)
             if plasmid_profile:
                 pieces.append(';'.join(plasmid_profile) + ',')
             else:
                 pieces.append('ND,')
         except KeyError:
             pieces.append('ND,')
         # TotalPredictedGenes
         pieces.append(GenObject.returnattr(sample.prodigal, 'predictedgenestotal'))
         # PredictedGenesOver3000bp
         pieces.append(GenObject.returnattr(sample.prodigal, 'predictedgenesover3000bp'))
         # PredictedGenesOver1000bp
         pieces.append(GenObject.returnattr(sample.prodigal, 'predictedgenesover1000bp'))
         # PredictedGenesOver500bp
         pieces.append(GenObject.returnattr(sample.prodigal, 'predictedgenesover500bp'))
         # PredictedGenesUnder500bp
         pieces.append(GenObject.returnattr(sample.prodigal, 'predictedgenesunder500bp'))
         # NumClustersPF
         pieces.append(GenObject.returnattr(sample.run, 'NumberofClustersPF'))
         # Percent of reads mapping to PhiX control
         pieces.append(GenObject.returnattr(sample.run, 'phix_aligned'))
         # Error rate calculated from PhiX control
         pieces.append(GenObject.returnattr(sample.run, 'error_rate'))
         # LengthForwardRead
         pieces.append(GenObject.returnattr(sample.run, 'forwardlength'))
         # LengthReverseRead
         pieces.append(GenObject.returnattr(sample.run, 'reverselength'))
         # Real time strain
         pieces.append(GenObject.returnattr(sample.run, 'Description'))
         # Flowcell
         pieces.append(GenObject.returnattr(sample.run, 'flowcell'))
         # MachineName
         pieces.append(GenObject.returnattr(sample.run, 'instrument'))
         # PipelineVersion
         pieces.append(self.commit + ',')
         # AssemblyDate
         pieces.append(datetime.now().strftime('%Y-%m-%d'))
         # Append a new line to the end of the results for this sample
         pieces.append('\n')
     data = ''.join(pieces)
     # Replace any NA values with ND. Only stand-alone 'NA' tokens are replaced: a blanket substring
     # replacement would also corrupt legitimate values that happen to contain the letters 'NA'
     # (e.g. a sample named 'DNA-1' would be written as 'DND-1')
     cleandata = re.sub(r'(?<![\w.\-])NA(?![\w.\-])', 'ND', data)
     with open(os.path.join(self.reportpath, 'combinedMetadata.csv'),
               'w') as metadatareport:
         metadatareport.write(header)
         metadatareport.write(cleandata)
예제 #2
0
    def legacy_reporter(self):
        """
        Creates an output that is compatible with the legacy metadata reports. This method will be removed once
        a new database scheme is implemented.

        A header row (written once, taken from the column names of the first sample) and one comma-separated
        row per sample in self.metadata are written to 'legacy_combinedMetadata.csv' in self.reportpath.
        Placeholder values ('NA' tokens and cells consisting solely of '-') are written as empty cells. If
        there are no samples, an empty file is written.
        """
        import re
        from collections import OrderedDict
        printtime('Creating legacy summary report', self.starttime)
        # Only stand-alone 'NA' tokens are placeholders; a blanket substring removal would also mangle
        # legitimate values that happen to contain the letters 'NA' (e.g. 'DNA-1' would become 'D-1')
        na_token = re.compile(r'(?<![\w.\-])NA(?![\w.\-])')

        def clean(value):
            """
            Blank out the placeholders in a single cell. Cleaning is performed per cell rather than on the
            joined text, because replacing ',-,' in the joined row misses every second cell in a run of
            adjacent '-' cells (str.replace does not consider overlapping matches)
            """
            value = na_token.sub('', value)
            return '' if value == '-' else value

        # Collect the lines of the report, and join them once at the end
        lines = list()
        # Create a dictionary of tuples to be printed in the final report
        for sample in self.metadata:
            data = OrderedDict([
                ('SampleName', sample.name),
                ('N50', str(sample.quality_features_polished.n50)),
                ('NumContigs',
                 str(sample.quality_features_polished.num_contigs)),
                ('TotalLength',
                 str(sample.quality_features_polished.genome_length)),
                ('MeanInsertSize', sample.mapping.MeanInsertSize),
                # Only keep the text before the first 'X' of the coverage value
                ('AverageCoverageDepth',
                 sample.mapping.MeanCoveragedata.split("X")[0]),
                ('ReferenceGenome', sample.mash.closestrefseq),
                ('RefGenomeAlleleMatches', '-'),
                ('16sPhylogeny', sample.sixteens_full.genus),
                ('rMLSTsequenceType', sample.rmlst.sequencetype),
                ('MLSTsequencetype', sample.mlst.sequencetype),
                ('MLSTmatches', str(sample.mlst.matchestosequencetype)),
                # returnattr output is comma-terminated; the separator is added when the row is joined
                ('coreGenome',
                 GenObject.returnattr(sample.coregenome,
                                      'coreresults').rstrip(',')),
                ('SeroType', '{oset}:{hset}'.format(
                    oset=';'.join(sample.serosippr.o_set),
                    hset=';'.join(sample.serosippr.h_set))),
                ('geneSeekrProfile', ';'.join(result for result, _ in sorted(
                    sample.genesippr.results.items()))),
                ('vtyperProfile', ';'.join(sorted(sample.vtyper.profile))),
                ('percentGC', str(sample.quality_features_polished.gc)),
                ('TotalPredictedGenes',
                 str(sample.prodigal.predictedgenestotal)),
                ('predictedgenesover3000bp',
                 str(sample.prodigal.predictedgenesover3000bp)),
                ('predictedgenesover1000bp',
                 str(sample.prodigal.predictedgenesover1000bp)),
                ('predictedgenesover500bp',
                 str(sample.prodigal.predictedgenesover500bp)),
                ('predictedgenesunder500bp',
                 str(sample.prodigal.predictedgenesunder500bp)),
                ('SequencingDate', sample.run.Date),
                ('Investigator', sample.run.InvestigatorName),
                ('TotalClustersinRun', str(sample.run.TotalClustersinRun)),
                ('NumberofClustersPF', str(sample.run.NumberofClustersPF)),
                ('PercentOfClusters', str(sample.run.PercentOfClusters)),
                ('LengthofForwardRead', str(sample.run.forwardlength)),
                ('LengthofReverseRead', str(sample.run.reverselength)),
                ('Project', str(sample.run.SampleProject)),
                ('PipelineVersion', self.commit)
            ])

            # The header is only written once, before the row of the first sample
            if not lines:
                lines.append(','.join(data))
            lines.append(','.join(clean(value) for value in data.values()))
        with open(os.path.join(self.reportpath, 'legacy_combinedMetadata.csv'),
                  'w') as metadatareport:
            # As before, there is no newline after the final row
            metadatareport.write('\n'.join(lines))