Example #1
0
    def testExtentOutOfRangeError(self):
        ''' A window that reaches past the beginning or end of a file should yield a truncated ref_context.
        gc_content is computed from whatever remains of its window as well.'''
        datasource = ReferenceDatasource('testdata/reference_ds', windowSizeRef=6, windowSizeGCContent=5)
        mutation = MutationDataFactory.default_create()
        mutation.chr = "22"
        mutation.start = "4"
        mutation.end = "4"

        # Leading portion of "CCCAAGCTAAACCCAGGCCAC"
        expected_context = "CCCAAGCTAA"
        expected_gc = float(5) / float(9)

        annotated = datasource.annotate_mutation(mutation)

        context_matches = annotated['ref_context'] == expected_context
        self.assertTrue(context_matches, "ref_context was not populated properly: " + str(annotated['ref_context']))

        # gc_content is rounded to 3 decimal places, so compare within .001
        gc_error = fabs(float(annotated['gc_content']) - expected_gc)
        self.assertTrue(gc_error < .001, "gc_content was not populated properly: " + str(annotated['gc_content']))
 def testEmptyAnswer(self):
     ''' A chromosome that cannot be found should give a blank ref_context.
     Note: A log entry should also be written, but this is not tested. '''
     self.logger.info("Please ignore the next logging warning: testdata/reference_ds/chrTHIS_DOES_NOT_EXIST.txt not found.  Please add it.")
     datasource = ReferenceDatasource('testdata/reference_ds')
     mutation = MutationDataFactory.default_create()
     mutation.chr = "THIS_DOES_NOT_EXIST"
     mutation.start = "11"
     mutation.end = "11"

     expected_context = ""
     # The annotated result is indexed directly by annotation name.
     annotated = datasource.annotate_mutation(mutation)
     is_blank = annotated['ref_context'] == expected_context
     self.assertTrue(is_blank, "ref_context was not populated properly -- should be blank: " + str(annotated['ref_context']))
Example #3
0
 def testEmptyAnswer(self):
     ''' Looking up a chromosome with no reference file should leave ref_context blank.
     Note: A log entry should also be written, but this is not tested. '''
     self.logger.info("Please ignore the next logging warning: testdata/reference_ds/chrTHIS_DOES_NOT_EXIST.txt not found.  Please add it.")
     refDatasource = ReferenceDatasource('testdata/reference_ds')
     mut = MutationData()
     mut.chr = "THIS_DOES_NOT_EXIST"
     mut.start = "11"
     mut.end = "11"

     blank = ""
     # The value returned by annotate_mutation is read by annotation name.
     result = refDatasource.annotate_mutation(mut)
     self.assertTrue(
         result['ref_context'] == blank,
         "ref_context was not populated properly -- should be blank: " + str(result['ref_context']))
    def testSimpleGLAnnotate(self):
        ''' Test a simple annotation case on a GL contig.  Make sure that the ref_context and gc_content annotations are correct. '''
        ds = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)
        m = MutationDataFactory.default_create()
        m.chr = "GL000211.1"
        m.start = "11"
        m.end = "11"

        groundTruth = "gaattctttttcaagtaagtc"

        guess = ds.annotate_mutation(m)

        # Failure messages are built eagerly, so they must read from the same object
        # that is being checked (guess), not from the input mutation.
        self.assertTrue(guess['ref_context'] == groundTruth,
                        "ref_context was not populated properly: " + str(guess['ref_context']))

        # gc_content is rounded to 3 decimal places
        self.assertTrue(fabs(float(guess['gc_content']) - (float(3) / float(11))) < .001,
                        "gc_content was not populated properly: " + str(guess['gc_content']))
Example #5
0
    def testSimpleGLAnnotate(self):
        ''' Test a simple annotation case on a GL contig.  Make sure that the ref_context and gc_content annotations are correct. '''
        ds = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)
        m = MutationData()
        m.chr = "GL000211.1"
        m.start = "11"
        m.end = "11"

        groundTruth = "gaattctttttcaagtaagtc"

        guess = ds.annotate_mutation(m)

        # Report the value that was actually compared: the message is evaluated before
        # assertTrue runs, so it should read from guess rather than from the input mutation.
        self.assertTrue(guess['ref_context'] == groundTruth,
                        "ref_context was not populated properly: " + str(guess['ref_context']))

        # gc_content is rounded to 3 decimal places
        self.assertTrue(fabs(float(guess['gc_content']) - (float(3) / float(11))) < .001,
                        "gc_content was not populated properly: " + str(guess['gc_content']))
    def testSimpleAnnotate(self):
        ''' Annotate a position on one of the aligned chromosomes (chr22) and check that
        ref_context and gc_content come back with reasonable values.
        '''
        datasource = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)
        mutation = MutationDataFactory.default_create()
        mutation.chr = "22"
        mutation.start = "11"
        mutation.end = "11"

        expected_context = "CCCAAGCTAAACCCAGGCCAC"
        expected_gc = float(6) / float(11)

        annotated = datasource.annotate_mutation(mutation)

        self.assertTrue(
            annotated['ref_context'] == expected_context,
            "ref_context was not populated properly: " + str(annotated['ref_context']))

        # gc_content is rounded to 3 decimal places, so compare within .001
        gc_error = fabs(float(annotated['gc_content']) - expected_gc)
        self.assertTrue(
            gc_error < .001,
            "gc_content was not populated properly: " + str(annotated['gc_content']))
Example #7
0
    def testSimpleAnnotate(self):
        ''' Simple check against an aligned chromosome (chr22): the annotated ref_context
        and gc_content should match the expected values.
        '''
        refDatasource = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)
        mut = MutationData()
        mut.chr = "22"
        mut.start = "11"
        mut.end = "11"

        wantedContext = "CCCAAGCTAAACCCAGGCCAC"

        result = refDatasource.annotate_mutation(mut)

        contextMatches = result['ref_context'] == wantedContext
        self.assertTrue(contextMatches, "ref_context was not populated properly: " + str(result['ref_context']))

        # gc_content is rounded to 3 decimal places
        wantedGC = float(6) / float(11)
        gcDelta = fabs(float(result['gc_content']) - wantedGC)
        self.assertTrue(gcDelta < .001, "gc_content was not populated properly: " + str(result['gc_content']))
    def testExtentOutOfRangeError(self):
        ''' When the requested window runs past the beginning or end of a file, the ref_context is truncated.
        The gc_content is likewise computed from what is left.'''
        refDatasource = ReferenceDatasource(
            'testdata/reference_ds',
            windowSizeRef=6,
            windowSizeGCContent=5)
        mut = MutationDataFactory.default_create()
        mut.chr = "22"
        mut.start = "4"
        mut.end = "4"

        # Leading portion of "CCCAAGCTAAACCCAGGCCAC"
        wantedContext = "CCCAAGCTAA"

        result = refDatasource.annotate_mutation(mut)

        self.assertTrue(
            result['ref_context'] == wantedContext,
            "ref_context was not populated properly: " +
            str(result['ref_context']))

        # gc_content is rounded to 3 decimal places
        wantedGC = float(5) / float(9)
        gcDelta = fabs(float(result['gc_content']) - wantedGC)
        self.assertTrue(
            gcDelta < .001,
            "gc_content was not populated properly: " +
            str(result['gc_content']))
Example #9
0
 def testFilenameDetermination(self):
     ''' Test that proper conversions are being done for chromosome to flat filename '''
     ds = ReferenceDatasource('testdata/reference_ds')
     # (mutation chr, expected flat filename, failure message prefix)
     cases = [
         ("GL000211.1", 'chrUn_gl000211.txt', "Did not find GL file: "),
         ("X", 'chrX.txt', "Did not find chrX file: "),
         ("GL000209.1", 'chr19_gl000209_random.txt', "Did not find GL chr19 file: "),
     ]
     for chrom, expected, failurePrefix in cases:
         # Convert once so the value reported on failure is exactly the value that was compared.
         actual = ds.convertMutationChrToFilename(chrom)
         self.assertTrue(actual == expected, failurePrefix + str(actual))
 def testFilenameDetermination(self):
     ''' Test that proper conversions are being done for chromosome to flat filename '''
     ds = ReferenceDatasource('testdata/reference_ds')

     # Each conversion is performed once; the same value is both checked and reported on failure.
     glFilename = ds.convertMutationChrToFilename("GL000211.1")
     self.assertTrue(glFilename == 'chrUn_gl000211.txt', "Did not find GL file: " + str(glFilename))

     chrXFilename = ds.convertMutationChrToFilename("X")
     self.assertTrue(chrXFilename == 'chrX.txt', "Did not find chrX file: " + str(chrXFilename))

     glChr19Filename = ds.convertMutationChrToFilename("GL000209.1")
     self.assertTrue(glChr19Filename == 'chr19_gl000209_random.txt', "Did not find GL chr19 file: " + str(glChr19Filename))
Example #11
0
 def createReferenceDatasource(config):
     ''' Build a ReferenceDatasource from the "refDir" option of the "ref_hg" section of config. '''
     return ReferenceDatasource(config.get("ref_hg", "refDir"))
Example #12
0
    def createDatasourceFromConfigParser(configParser, leafDir):
        """
        Instantiate the datasource described by the config file of a leaf directory.

        configParser -- config parser instance from the config file in the leafdir. The "type" option of the
            [general] section selects the datasource class; the remaining options read depend on that type.
            For information on config file format/conventions see (TODO)

        leafDir -- contains the file and necessary files (post indexing and install steps) to instantiate a datasource.

        Returns the new datasource, after setting its hashcode from DatasourceFactory._retrieve_hash_code(leafDir).

        Raises RuntimeError if the type is not recognized, or if the type is 'bigwig' and NGSLIB_INSTALLED is false.
        """
        # Determine the type
        dsType = configParser.get("general", "type")
        filePrefix = leafDir + "/"

        def general(option):
            # Options are looked up lazily, since each datasource type reads a different subset of them.
            return configParser.get("general", option)

        def general_path(option):
            # Filenames in the config are relative to the leaf directory.
            return filePrefix + general(option)

        # TODO: Replace these if statements with something a bit more robust, such as a proper dependency injection framework
        if dsType == "gaf":
            gaf_fname = general_path('gaf_fname')
            gaf_transcript_sequences_fname = general_path('gaf_transcript_seqs_fname')
            result = Gaf(gaf_fname, gaf_transcript_sequences_fname,
                         title=general("title"),
                         version=general("version"),
                         protocol=general("protocol"))
        elif dsType == "dbsnp":
            result = dbSNP(general_path('src_file'),
                           title=general('title'),
                           version=general('version'))
        elif dsType == "ensembl":
            result = EnsemblTranscriptDatasource(general_path('src_file'),
                                                 title=general('title'),
                                                 version=general('version'),
                                                 tx_filter=general('transcript_filter'))
        elif dsType == "cosmic":
            result = Cosmic(src_file=general_path('src_file'),
                            version=general('version'),
                            gpp_tabix_file=general_path('gpp_src_file'))
        elif dsType == 'ref':
            if configParser.has_option('general', 'windowSizeRef'):
                # Config values are read as strings; normalize so the configured value has the
                # same type (int) as the default below.
                window_size = int(general('windowSizeRef'))
            else:
                window_size = 10
            result = ReferenceDatasource(filePrefix,
                                         title=general("title"),
                                         version=general('version'),
                                         windowSizeRef=window_size)
        elif dsType == 'gene_tsv':
            result = GenericGeneDatasource(src_file=general_path('src_file'),
                                           title=general("title"),
                                           version=general('version'),
                                           geneColumnName=general('gene_col'))
        elif dsType == 'transcript_tsv':
            result = GenericTranscriptDatasource(src_file=general_path('src_file'),
                                                 title=general("title"),
                                                 version=general('version'),
                                                 geneColumnName=general('transcript_col'))
        elif dsType == 'vc_tsv':
            result = GenericVariantClassificationDatasource(src_file=general_path('src_file'),
                                                            title=general("title"),
                                                            version=general('version'),
                                                            geneColumnName=general('vc_col'))
        elif dsType == 'gp_tsv':
            result = GenericGenomicPositionDatasource(src_file=general_path('src_file'),
                                                      title=general("title"),
                                                      version=general('version'),
                                                      gpColumnNames=general('genomic_position_cols'))
        elif dsType == 'gm_tsv':
            result = GenericGenomicMutationDatasource(src_file=general_path('src_file'),
                                                      title=general("title"),
                                                      version=general('version'),
                                                      gpColumnNames=general('genomic_position_cols'))
        elif dsType == 'gm_tsv_reverse_complement':
            result = GenericGenomicMutationDatasource(src_file=general_path('src_file'),
                                                      title=general("title"),
                                                      version=general('version'),
                                                      gpColumnNames=general('genomic_position_cols'),
                                                      use_complementary_strand_alleles_for_negative_strand_transcripts=True)
        elif dsType == 'gpp_tsv':
            result = GenericGeneProteinPositionDatasource(src_file=general_path('src_file'),
                                                          title=general("title"),
                                                          version=general('version'),
                                                          gpColumnNames=general('gene_protein_position_cols'))
        elif dsType == "transcript_to_uniprot_aa":
            result = TranscriptToUniProtProteinPositionTransformingDatasource(title=general("title"),
                                                                              version=general('version'),
                                                                              src_file="file://" + general_path('src_file'),  # three slashes for sqlite
                                                                              inputPositionAnnotationName=general('inputPositionAnnotationName'),
                                                                              outputPositionAnnotationName=general('outputPositionAnnotationName'))
        elif dsType == "mock_exception":
            result = MockExceptionThrowingDatasource(title=general("title"),
                                                     version=general('version'))
        elif dsType == "indexed_vcf":
            result = IndexedVcfDatasource(src_file=general_path('src_file'),
                                          title=general("title"),
                                          version=general('version'),
                                          match_mode=general('match_mode'))
        elif dsType == "indexed_tsv":
            # These three options are comma-separated lists of column names.
            columnNames = general("column_names").split(",")
            annotationColumnNames = general("annotation_column_names").split(",")
            indexColumnNames = general("index_column_names").split(",")

            DatasourceFactory._log_missing_column_name_msg(columnNames, annotationColumnNames)

            # Each non-blank annotation column has its data type listed in the [data_types] section.
            columnDataTypes = dict()
            for columnName in annotationColumnNames:
                if columnName.strip() != "":
                    columnDataTypes[columnName] = configParser.get("data_types", columnName)

            result = IndexedTsvDatasource(src_file=general_path("src_file"),
                                          title=general("title"),
                                          version=general("version"),
                                          colNames=columnNames,
                                          annotationColNames=annotationColumnNames,
                                          indexColNames=indexColumnNames,
                                          match_mode=general("match_mode"),
                                          colDataTypes=columnDataTypes)
        elif dsType == 'bigwig':
            if not NGSLIB_INSTALLED:
                raise RuntimeError("Bigwig datasource found in db-dir but ngslib library not installed.")
            result = BigWigDatasource(src_file=general_path('src_file'),
                                      title=general("title"),
                                      version=general('version'))
        else:
            raise RuntimeError('Unknown datasource type: %s' % dsType)

        hashcode = DatasourceFactory._retrieve_hash_code(leafDir)
        result.set_hashcode(hashcode)
        return result