def handle(self, *args, **options):
        """Validate the requested genome build and load one annotation file.

        Stores the stripped ``options['genome']`` in self.GENOME_BUILD, makes
        sure the per-build directory below ANNOTATION_BASE_DIR exists and then
        calls the loader for the first annotation option that has a value.
        Nothing is loaded when none of those options is set.

        Raises CommandError if no genome was given or if the build is not in
        SUPPORTED_GENOMES.
        """
        if not options['genome']:
            raise CommandError(
                'Please specify which genome to build i.e. hg19, dm3')

        self.GENOME_BUILD = options['genome'].strip()
        if self.GENOME_BUILD not in SUPPORTED_GENOMES:
            raise CommandError(
                'Selected genome build currently not supported')

        # temp dir should be located on the same file system as the base dir
        self.ANNOTATION_TEMP_DIR = os.path.join(
            self.ANNOTATION_BASE_DIR, self.GENOME_BUILD)
        # create this directory in case it doesn't exist
        if not os.path.isdir(self.ANNOTATION_TEMP_DIR):
            _mkdir(self.ANNOTATION_TEMP_DIR)

        # (option name, loader method name) in priority order; only the first
        # option carrying a value is processed
        loaders = (
            # Human from GENCODE, fly from FlyBase, worm from WormBase
            ('gene_annotation', 'geneAnnotation'),
            ('map_empirical', 'mappabilityEmpirical'),
            ('map_theoretical', 'mappabilityTheoretical'),
            ('gap_regions', 'gapRegions'),
            ('gc', 'gcContent'),
            ('conservation', 'conservation'),
        )
        for option_name, method_name in loaders:
            annotation_file = options[option_name]
            if annotation_file:
                getattr(self, method_name)(annotation_file)
                break
    def handle_label(self, label, **options):
        """Create the annotation_server data for one genome build.

        Looks up the GenomeBuild named ``label``, makes sure the per-build
        directory below ANNOTATION_BASE_DIR exists and then calls
        getChromInfo(), getCytoBand() and getGenes().

        :param label: genome build name, e.g. dm3, ce10, hg19
        :raises CommandError: if label is empty or not in SUPPORTED_GENOMES
        """
        if not label:
            raise CommandError(
                "Please specify which genome to build i.e. hg19, dm3")
        if label not in SUPPORTED_GENOMES:
            raise CommandError(
                "Selected genome build currently not supported")

        self.GENOME_BUILD = GenomeBuild.objects.get(name=label)
        self.GENOME_BUILD_NAME = self.GENOME_BUILD.name
        # Format the class-level template rather than the instance attribute:
        # the assignment below shadows it on the instance, so formatting
        # self.BASE_DOWNLOAD_URL again for a second label would raise
        # TypeError (no '%s' left in the string).
        # NOTE(review): assumes BASE_DOWNLOAD_URL is declared on the class
        # as a '%s' template - confirm against the class definition.
        self.BASE_DOWNLOAD_URL = self.__class__.BASE_DOWNLOAD_URL % label

        # temp dir should be located on the same file system as the base dir
        self.ANNOTATION_TEMP_DIR = os.path.join(
            self.ANNOTATION_BASE_DIR, label)
        # create this directory in case it doesn't exist
        if not os.path.isdir(self.ANNOTATION_TEMP_DIR):
            _mkdir(self.ANNOTATION_TEMP_DIR)

        self.getChromInfo()
        self.getCytoBand()
        self.getGenes()
 def handle(self, *args, **options):
     """Check the --genome option and run the first requested loader.

     The stripped genome name is kept in self.GENOME_BUILD and a directory
     of that name is created below ANNOTATION_BASE_DIR if it is missing.
     The annotation options are then checked in a fixed order and the
     loader of the first one that has a value is called with that value.

     Raises CommandError when the genome option is empty or names a build
     that is not in SUPPORTED_GENOMES.
     """
     # ...
     if not options['genome']:
         raise CommandError('Please specify which genome to build i.e. hg19, dm3')

     self.GENOME_BUILD = options['genome'].strip()
     if self.GENOME_BUILD not in SUPPORTED_GENOMES:
         raise CommandError('Selected genome build currently not supported')

     # temp dir should be located on the same file system as the base dir
     self.ANNOTATION_TEMP_DIR = os.path.join(self.ANNOTATION_BASE_DIR,
                                             self.GENOME_BUILD)
     # create this directory in case it doesn't exist
     if not os.path.isdir(self.ANNOTATION_TEMP_DIR):
         _mkdir(self.ANNOTATION_TEMP_DIR)

     # option name -> loader method name, highest priority first
     dispatch_order = (
         # Human from GENCODE, fly from FlyBase, worm from WormBase
         ('gene_annotation', 'geneAnnotation'),
         ('map_empirical', 'mappabilityEmpirical'),
         ('map_theoretical', 'mappabilityTheoretical'),
         ('gap_regions', 'gapRegions'),
         ('gc', 'gcContent'),
         ('conservation', 'conservation'),
     )
     for key, loader_name in dispatch_order:
         if options[key]:
             # only one annotation type is handled per invocation
             getattr(self, loader_name)(options[key])
             return
Exemple #4
0
 def handle_label(self, label, **options):
     """Creates an annotation_server for Refinery
     for a specific genome build: dm3, ce10, hg19

     Looks up the GenomeBuild named ``label``, makes sure the per-build
     directory below ANNOTATION_BASE_DIR exists and then calls
     getChromInfo(), getCytoBand() and getGenes().

     :param label: genome build name
     :raises CommandError: if label is empty or not a supported genome
     """
     if not label:
         raise CommandError(
             'Please specify which genome to build i.e. hg19, dm3')
     # fixed: the attribute was misspelled as utils.UPPORTED_GENOMES, which
     # raises AttributeError before any label can be checked
     if label not in utils.SUPPORTED_GENOMES:
         raise CommandError(
             'Selected genome build currently not supported')

     self.GENOME_BUILD = models.GenomeBuild.objects.get(name=label)
     self.GENOME_BUILD_NAME = self.GENOME_BUILD.name
     # Format the class-level template rather than the instance attribute:
     # the assignment below shadows it on the instance, so formatting
     # self.BASE_DOWNLOAD_URL again for a second label would raise
     # TypeError (no '%s' left in the string).
     # NOTE(review): assumes BASE_DOWNLOAD_URL is declared on the class as
     # a '%s' template - confirm against the class definition.
     self.BASE_DOWNLOAD_URL = self.__class__.BASE_DOWNLOAD_URL % label
     # temp dir should be located on the same file system as the
     # base dir
     self.ANNOTATION_TEMP_DIR = os.path.join(
         self.ANNOTATION_BASE_DIR, label)
     # create this directory in case it doesn't exist
     if not os.path.isdir(self.ANNOTATION_TEMP_DIR):
         _mkdir(self.ANNOTATION_TEMP_DIR)
     self.getChromInfo()
     self.getCytoBand()
     self.getGenes()
class Command(BaseCommand):
    """Management command that adds annotation tracks for a genome build.

    The build is selected with --genome.  Each of the remaining options
    carries the annotation file for one kind of track; handle() passes the
    value of the first option that is set to the matching loader method.
    """
    # command line options; the dest names are the keys read in handle()
    option_list = BaseCommand.option_list + (
        make_option('--genome',
                    dest='genome',
                    help='Genome build i.e. dm3, ce10, hg19'),
        make_option('--gene_annotation',
                    dest='gene_annotation',
                    help='Additional genome information'),
        make_option(
            '--map_theoretical',
            dest='map_theoretical',
            help='Adding Annotation for Theoretical Mappability information'),
        make_option(
            '--map_empirical',
            dest='map_empirical',
            help='Adding Annotation for Empirical Mappability information'),
        make_option('--gap_regions',
                    dest='gap_regions',
                    help='Adding Annotation for Gap Regions information'),
        make_option('--gc', dest='gc',
                    help='Adding Annotation for GC content'),
        make_option('--conservation',
                    dest='conservation',
                    help='Adding Annotation for Conservation scores'),
    )
    can_import_settings = True
    # name of the genome build being processed; set in handle()
    GENOME_BUILD = None
    ANNOTATION_DIR = 'annotation_server'  # relative to MEDIA_ROOT

    # absolute path to the file store root dir
    ANNOTATION_BASE_DIR = os.path.join(settings.MEDIA_ROOT, ANNOTATION_DIR)
    # create this directory in case it doesn't exist
    # NOTE: this statement is part of the class body, so it runs once when
    # the class is defined (i.e. when the module is imported), not per call
    if not os.path.isdir(ANNOTATION_BASE_DIR):
        _mkdir(ANNOTATION_BASE_DIR)

    def handle(self, *args, **options):
        """Validate --genome and run the loader of the first option set.

        Stores the stripped genome name in self.GENOME_BUILD, creates the
        per-build directory below ANNOTATION_BASE_DIR if needed and calls
        the loader that belongs to the first annotation option that has a
        value.  When no annotation option is set nothing is loaded.

        Raises CommandError if --genome is missing or names a build that is
        not in SUPPORTED_GENOMES.
        """
        if not options['genome']:
            raise CommandError(
                'Please specify which genome to build i.e. hg19, dm3')

        self.GENOME_BUILD = options['genome'].strip()
        if self.GENOME_BUILD not in SUPPORTED_GENOMES:
            raise CommandError(
                'Selected genome build currently not supported')

        # temp dir should be located on the same file system as the
        # base dir
        self.ANNOTATION_TEMP_DIR = os.path.join(
            self.ANNOTATION_BASE_DIR, self.GENOME_BUILD)
        # create this directory in case it doesn't exist
        if not os.path.isdir(self.ANNOTATION_TEMP_DIR):
            _mkdir(self.ANNOTATION_TEMP_DIR)

        # option dest -> loader method name, checked in this order
        loaders = (
            # Human from GENCODE, fly from FlyBase, worm from WormBase
            ('gene_annotation', 'geneAnnotation'),
            ('map_empirical', 'mappabilityEmpirical'),
            ('map_theoretical', 'mappabilityTheoretical'),
            ('gap_regions', 'gapRegions'),
            ('gc', 'gcContent'),
            ('conservation', 'conservation'),
        )
        for dest, method_name in loaders:
            annotation_file = options[dest]
            if annotation_file:
                getattr(self, method_name)(annotation_file)
                break

    def gcContent(self, wig_file):
        """Load a GC content wig file for the current genome build.

        The target model name is the genome build followed by "_GC".
        """
        logger.debug("annotation_server.gcContent called for genome: "
                     "%s, file: %s",
                     self.GENOME_BUILD, wig_file)
        model_name = self.GENOME_BUILD + "_GC"
        self.addWigAnnotation(wig_file, model_name, "GC")

    def conservation(self, wig_file):
        """Load a conservation wig file for the current genome build.

        The target model name is the genome build followed by
        "_Conservation".
        """
        logger.debug("annotation_server.conservation called for genome: "
                     "%s, file: %s",
                     self.GENOME_BUILD, wig_file)
        model_name = self.GENOME_BUILD + "_Conservation"
        self.addWigAnnotation(wig_file, model_name, "conservation")

    def addWigAnnotation(self, wig_file, db_model, annot_type):
        """Load a fixedStep wig file into the annotation_server tables.

        Used for the Conservation and GC content tracks.  All rows of the
        target table are deleted before the file is read.  A 'track' line
        replaces the WigDescription of this genome build / annotation type,
        a 'fixedStep' line sets chromosome, start and step, and every other
        non-comment line is stored as one value.

        :param wig_file: wig file, passed to getFileHandle()
        :param db_model: name of the model class that stores the values
        :param annot_type: annotation type stored on the WigDescription
        :raises CommandError: if a data line appears before a 'track' line
            and a 'fixedStep' line have been seen
        """
        # db_model is resolved by name; the callers in this class build it
        # from the genome build plus a fixed suffix
        current_table = eval(db_model)
        current_table.objects.all().delete()
        handle = getFileHandle(wig_file)

        table_vals = [
            'name', 'altColor', 'color', 'visibility', 'priority',
            'type', 'description'
        ]
        # parser state; stays None until the matching declaration was read
        item = None
        chrom = None
        step = None
        curr_pos = None

        for line in handle:
            # TODO: what to do with additional description fields
            stripped = line.strip()
            # skip blank lines (previously stored as empty values) and
            # comment lines
            if not stripped or line[0] == '#':
                continue
            t1 = stripped.split(' ')

            if t1[0] == 'track':
                # overwrite if already entered into db; filter().delete()
                # copes with zero, one or several existing descriptions
                WigDescription.objects.filter(
                    genome_build=self.GENOME_BUILD,
                    annotation_type=annot_type).delete()

                ret_attr = parse_wig_attribute(line)
                db_string = "WigDescription(genome_build='%s', " \
                            "annotation_type='%s', %s)"
                db_string = db_string % (self.GENOME_BUILD, annot_type,
                                         parse_db_string(
                                             ret_attr, table_vals))
                # SECURITY(review): db_string contains text taken from the
                # wig file and is executed with eval(); only load trusted
                # files, or replace this with an explicit constructor call
                # once the output format of parse_db_string is confirmed.
                # saving to wigDescription model
                item = eval(db_string)
                item.save()

            elif t1[0] == 'fixedStep':
                attr = parse_wig_attribute(line)
                chrom = attr['chrom']
                step = int(attr['step'])
                curr_pos = int(attr['start'])

            else:
                if item is None or chrom is None:
                    raise CommandError(
                        "Wig file %s: data line found before 'track' and "
                        "'fixedStep' declarations" % wig_file)
                # the first value after a fixedStep line belongs to `start`
                # itself, so store first and advance afterwards
                wigItem = current_table(annot=item,
                                        chrom=chrom,
                                        position=curr_pos,
                                        value=t1[0])
                wigItem.save()
                curr_pos += step

    def mappabilityTheoretical(self, bed_file):
        """Load a Theoretical Mappability BED file (hg19 only).

        :param bed_file: annotation file handed on to addBedAnnotation()
        :raises CommandError: if the current genome build is not hg19
        """
        logger.debug("annotation_server.mappabilityTheoretical called for "
                     "genome: %s, file: %s",
                     self.GENOME_BUILD, bed_file)
        if self.GENOME_BUILD != 'hg19':
            raise CommandError(
                'Theoretical Mappability Annotation track only supported for '
                'hg19 currently')
        model_name = self.GENOME_BUILD + "_MappabilityTheoretical"
        self.addBedAnnotation(bed_file, model_name)

    def mappabilityEmpirical(self, bed_file):
        """Load an Empirical Mappability BED file.

        Supported for the hg19, dm3 and ce10 genome builds.

        :param bed_file: annotation file handed on to addBedAnnotation()
        :raises CommandError: if the current genome build is not one of
            hg19, dm3, ce10
        """
        logger.debug(
            "annotation_server.mappabilityEmpirical called for "
            "genome: %s, file: %s", self.GENOME_BUILD, bed_file)
        # NOTE(review): "Empirial" looks like a misspelling, but the name
        # must match the model class it is resolved to - confirm against
        # the models module before changing it.
        db_table = self.GENOME_BUILD + "_MappabilityEmpirial"
        if self.GENOME_BUILD not in ('hg19', 'dm3', 'ce10'):
            # message now lists every build the check above accepts
            raise CommandError(
                'Empirical Mappability Annotation track only supported for '
                'hg19, dm3 and ce10 currently')
        self.addBedAnnotation(bed_file, db_table)

    def gapRegions(self, bed_file):
        """Load a GapRegions annotation file (hg19 and dm3 only).

        :param bed_file: annotation file handed on to addGapRegions()
        :raises CommandError: if the current genome build is neither hg19
            nor dm3
        """
        logger.debug("annotation_server.gapRegions called for "
                     "genome: %s, file: %s",
                     self.GENOME_BUILD, bed_file)
        model_name = self.GENOME_BUILD + "_GapRegions"

        if self.GENOME_BUILD not in ('hg19', 'dm3'):
            raise CommandError(
                'GapRegions Annotation track only supported for hg19 and dm3')
        self.addGapRegions(bed_file, model_name)

    def geneAnnotation(self, gff_file):
        """Load a GFF gene annotation file for the current genome build.

        hg19 is stored in the "_GenCode" model, ce10 in "_WormBase" and dm3
        in "_FlyBase"; each comes with its own list of attribute names that
        is handed to addGeneAnnotation().  Other builds are ignored.

        :param gff_file: annotation file
        """
        logger.debug("annotation_server.addGeneAnnotation called for "
                     "genome: %s, file: %s",
                     self.GENOME_BUILD, gff_file)

        # genome build -> (model name suffix, attributes of the model)
        track_specs = {
            'hg19': ("_GenCode", [
                'gene_id', 'transcript_id', 'gene_type', 'gene_status',
                'gene_name', 'transcript_type', 'transcript_status',
                'transcript_name'
            ]),
            'ce10': ("_WormBase", ['cds', 'clone', 'gene', 'Name']),
            'dm3': ("_FlyBase",
                    ['Alias', 'description', 'fullname', 'symbol']),
        }
        spec = track_specs.get(self.GENOME_BUILD)
        if spec is None:
            return

        suffix, db_vars = spec
        self.addGeneAnnotation(gff_file, self.GENOME_BUILD + suffix, db_vars)

    def addGeneAnnotation(self, gff_file, db_model, table_vals):
        """Load a GFF gene annotation file into the model named db_model.

        Used for hg19 gencode, dm3 flybase and ce10 wormbase.  All rows of
        the target table are deleted before the file is read.

        :param gff_file: GFF file, passed to getFileHandle()
        :param db_model: name of the model class to populate
        :param table_vals: attribute names handed to parse_db_string()
            together with the parsed ninth GFF column
        """
        # db_model is resolved by name; geneAnnotation() builds it from the
        # genome build plus a fixed suffix
        current_table = eval(db_model)
        current_table.objects.all().delete()

        # Loop-invariant part of the constructor call.  The expression is
        # evaluated with eval() below and refers to the local names
        # `current_table` and `t1`, so those names must not be renamed.
        db_template = (
            'current_table(chrom=t1[0], source=t1[1], feature=t1[2], '
            'start=t1[3], end=t1[4], score=t1[5], strand=t1[6], '
            'frame=t1[7], attribute=t1[8], %s)')

        handle = getFileHandle(gff_file)
        for line in handle:
            # TODO: what to do with additional description fields
            # skip blank lines (they have no ninth column) and comments
            if not line.strip() or line[0] == '#':
                continue
            t1 = line.strip().split('\t')
            attrib = parse_gff_attribute(t1[8])
            # SECURITY(review): the text returned by parse_db_string comes
            # from the GFF file and is executed with eval(); only load
            # trusted files.
            db_string = db_template % (parse_db_string(attrib, table_vals))
            item = eval(db_string)
            item.save()

    def addGapRegions(self, bed_file, db_model):
        """Load a tab separated gap regions file into the model db_model.

        All rows of the target table are deleted first.  Lines starting
        with '#' are skipped; the first nine columns of every other line
        are stored as one row.

        :param bed_file: annotation file, passed to getFileHandle()
        :param db_model: name of the model class to populate
        """
        logger.debug("annotation_server.addGapRegions called for "
                     "genome: %s, file: %s table: %s",
                     self.GENOME_BUILD, bed_file, db_model)

        current_table = eval(db_model)
        current_table.objects.all().delete()

        # model field for each column, in file order
        column_names = ('bin', 'chrom', 'chromStart', 'chromEnd', 'ix', 'n',
                        'size', 'type', 'bridge')
        for line in getFileHandle(bed_file):
            if line[0] == '#':
                continue
            fields = line.strip().split('\t')
            row = {name: fields[idx]
                   for idx, name in enumerate(column_names)}
            current_table(**row).save()

    def addBedAnnotation(self, bed_file, db_model):
        """Load a BED annotation file into the model named db_model.

        Used for empirical and theoretical mappability.  All rows of the
        target table are deleted first.  Lines starting with '#' are
        skipped; the first three columns of every other line are stored as
        chrom, chromStart and chromEnd.

        :param bed_file: annotation file, passed to getFileHandle()
        :param db_model: name of the model class to populate
        """
        logger.debug("annotation_server.addBedAnnotation called for "
                     "genome: %s, file: %s table: %s",
                     self.GENOME_BUILD, bed_file, db_model)

        current_table = eval(db_model)
        current_table.objects.all().delete()

        # model field for each column, in file order
        column_names = ('chrom', 'chromStart', 'chromEnd')
        for line in getFileHandle(bed_file):
            if line[0] == '#':
                continue
            fields = line.strip().split('\t')
            row = {name: fields[idx]
                   for idx, name in enumerate(column_names)}
            current_table(**row).save()
Exemple #6
0
class Command(LabelCommand):
    """Management command for creating basic annotation server
    for a specific species and genome build i.e. dm3, ce10, hg19

    Each label given on the command line is a genome build name and is
    processed by handle_label().
    """
    # allows Django settings to be used, define scratch space for download
    can_import_settings = True
    # download URL template; '%s' is replaced by the genome build label in
    # handle_label()
    BASE_DOWNLOAD_URL = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/'
    # sub-directories appended to BASE_DOWNLOAD_URL by getUrlFile()
    SEQUENCE_FILES = 'chromosomes/'
    OTHER_FILES = 'database/'
    # GenomeBuild object and its name; both are set in handle_label()
    GENOME_BUILD = None
    GENOME_BUILD_NAME = None
    ANNOTATION_DIR = 'annotation_server'  # relative to MEDIA_ROOT

    # absolute path to the file store root dir
    ANNOTATION_BASE_DIR = os.path.join(settings.MEDIA_ROOT, ANNOTATION_DIR)
    # create this directory in case it doesn't exist
    # NOTE: this statement is part of the class body, so it runs once when
    # the class is defined (i.e. when the module is imported), not per call
    if not os.path.isdir(ANNOTATION_BASE_DIR):
        _mkdir(ANNOTATION_BASE_DIR)
    """
    Name: handle
    Description:
    main program; run the command
    """
    def handle_label(self, label, **options):
        """Creates an annotation_server for Refinery
        for a specific genome build: dm3, ce10, hg19

        Looks up the GenomeBuild named ``label``, makes sure the per-build
        directory below ANNOTATION_BASE_DIR exists and then calls
        getChromInfo(), getCytoBand() and getGenes().

        :param label: genome build name
        :raises CommandError: if label is empty or not a supported genome
        """
        if not label:
            raise CommandError(
                'Please specify which genome to build i.e. hg19, dm3')
        # fixed: the attribute was misspelled as utils.UPPORTED_GENOMES,
        # which raises AttributeError before any label can be checked
        if label not in utils.SUPPORTED_GENOMES:
            raise CommandError(
                'Selected genome build currently not supported')

        self.GENOME_BUILD = models.GenomeBuild.objects.get(name=label)
        self.GENOME_BUILD_NAME = self.GENOME_BUILD.name
        # Format the class-level template rather than the instance
        # attribute: the assignment below shadows the template on the
        # instance, so formatting self.BASE_DOWNLOAD_URL again for a second
        # label would raise TypeError (no '%s' left in the string).
        self.BASE_DOWNLOAD_URL = self.__class__.BASE_DOWNLOAD_URL % label
        # temp dir should be located on the same file system as the
        # base dir
        self.ANNOTATION_TEMP_DIR = os.path.join(
            self.ANNOTATION_BASE_DIR, label)
        # create this directory in case it doesn't exist
        if not os.path.isdir(self.ANNOTATION_TEMP_DIR):
            _mkdir(self.ANNOTATION_TEMP_DIR)
        self.getChromInfo()
        self.getCytoBand()
        self.getGenes()

    def getChromInfo(self):
        """Download chromInfo.txt.gz and reload ChromInfo for the build.

        The ChromInfo rows of the current genome build are deleted before
        the downloaded file is read.  Sequences whose name contains an
        underscore (i.e. chr6_ssto_hap7, chr6_random) are not stored.
        """
        logger.debug("annotation_server.getChromInfo called for genome: %s",
                     self.GENOME_BUILD_NAME)

        url, file_name = self.getUrlFile('chromInfo.txt.gz')
        download_http_file(url, '', self.ANNOTATION_TEMP_DIR, as_task=False)
        # deletes objects from ChromInfo table for that genome build
        models.ChromInfo.objects.filter(
            genomebuild__name__exact=self.GENOME_BUILD_NAME).delete()
        # reading gz file
        handle = gzip.open(file_name)
        try:
            # http://stackoverflow.com/q/3548495
            for line in handle:
                t1 = line.strip().split('\t')
                # Not including extraneous sequences i.e. chr6_ssto_hap7,
                # chr6_random
                if '_' not in t1[0]:
                    item = models.ChromInfo(genomebuild=self.GENOME_BUILD,
                                            chrom=t1[0],
                                            size=t1[1],
                                            fileName=t1[2])
                    item.save()
        finally:
            # release the gzip file even if a row fails to parse or save
            handle.close()

    def getCytoBand(self):
        """Download cytoBand.txt.gz and reload CytoBand for the build.

        The CytoBand rows of the current genome build are deleted and then
        re-created from the downloaded file, one row per line.
        """
        logger.debug("annotation_server.getCytoBand called for genome: %s",
                     self.GENOME_BUILD_NAME)

        url, file_name = self.getUrlFile('cytoBand.txt.gz')
        download_http_file(url, '', self.ANNOTATION_TEMP_DIR, as_task=False)
        # deletes all objects of that genome build from table
        models.CytoBand.objects.filter(
            genomebuild__name__exact=self.GENOME_BUILD_NAME).delete()
        # reading gz file; without this step the table stays empty after
        # the delete above
        handle = gzip.open(file_name)
        try:
            for line in handle:
                t1 = line.strip().split('\t')
                # NOTE(review): field names follow the column order chrom,
                # chromStart, chromEnd, name, gieStain - confirm that they
                # match the fields of models.CytoBand
                item = models.CytoBand(genomebuild=self.GENOME_BUILD,
                                       chrom=t1[0],
                                       chromStart=t1[1],
                                       chromEnd=t1[2],
                                       name=t1[3],
                                       gieStain=t1[4])
                item.save()
        finally:
            # release the gzip file even if a row fails to parse or save
            handle.close()

    def getGenes(self):
        """Download ensGene.txt.gz and reload Gene rows for the build.

        The Gene rows of the current genome build are deleted before the
        downloaded file is read; the first sixteen tab separated columns
        of every line are stored as one row.
        """
        logger.debug("annotation_server.getGenes called for genome: %s",
                     self.GENOME_BUILD_NAME)

        url, file_name = self.getUrlFile('ensGene.txt.gz')
        download_http_file(url, '', self.ANNOTATION_TEMP_DIR, as_task=False)
        # deletes all objects of that genome build from table
        models.Gene.objects.filter(
            genomebuild__name__exact=self.GENOME_BUILD_NAME).delete()

        # model field for each column, in file order
        column_names = (
            'bin', 'name', 'chrom', 'strand', 'txStart', 'txEnd',
            'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds',
            'score', 'name2', 'cdsStartStat', 'cdsEndStat', 'exonFrames',
        )
        # reading gz file
        handle = gzip.open(file_name)
        try:
            for line in handle:
                t1 = line.strip().split('\t')
                row = {name: t1[idx]
                       for idx, name in enumerate(column_names)}
                item = models.Gene(genomebuild=self.GENOME_BUILD, **row)
                item.save()
        finally:
            # release the gzip file even if a row fails to parse or save
            handle.close()

    def getUrlFile(self, file_to_download, sequence=False):
        """Helper function to return UCSC url to download file and current
        path for file to download

        :param file_to_download: file name appended to the download URL
        :param sequence: if true the file is looked up below SEQUENCE_FILES,
            otherwise below OTHER_FILES
        :returns: tuple (url, file_name) where file_name is the last path
            component of url joined onto ANNOTATION_TEMP_DIR
        """
        # fixed: the two literals used to be joined without a space, which
        # logged "getUrlFilecalled build"
        logger.debug(
            "annotation_server.create_genome_annotation getUrlFile "
            "called build: %s file: %s", self.GENOME_BUILD_NAME,
            file_to_download)
        sub_dir = self.SEQUENCE_FILES if sequence else self.OTHER_FILES
        url = self.BASE_DOWNLOAD_URL + sub_dir + file_to_download

        file_name = os.path.join(self.ANNOTATION_TEMP_DIR, url.split('/')[-1])

        return url, file_name