Example #1
import os
import pickle

import pybiomart


def change_accessions(ids, input_format, output_format, species,
                      tmp):  # refseq -> ensembl -> entrez
    if input_format != output_format:
        mart_file = 'biomart_%s_%s_%s.ipage.pickle' % (species, input_format,
                                                       output_format)
        mart_file = os.path.join(tmp, mart_file)
        if os.path.isfile(mart_file) and os.stat(mart_file).st_size != 0:
            with open(mart_file, 'rb') as f:
                input_to_output = pickle.load(f)

        else:
            if species == 'mouse':
                dataset = pybiomart.Dataset(name='mmusculus_gene_ensembl',
                                            host='http://www.ensembl.org')
            elif species == 'human':
                dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl',
                                            host='http://www.ensembl.org')
            # print(*dataset.attributes.keys(), sep='\n')
            mart_attributes = {
                'enst': ['ensembl_transcript_id'],
                'ensg': ['ensembl_gene_id'],
                'refseq': [
                    'refseq_mrna', 'refseq_mrna_predicted', 'refseq_ncrna',
                    'refseq_ncrna_predicted'
                ],
                'entrez': ['entrezgene_id'],
                'gs': ['entrezgene_accession'],
                'ext': ['external_gene_name']
            }
            input_to_output = {}
            output_attributes = mart_attributes[output_format]
            if output_format == 'refseq':
                output_attributes = [output_attributes[0]]
            for mart in mart_attributes[input_format]:
                df1 = dataset.query(attributes=[mart] + output_attributes)
                df1 = df1[df1.iloc[:, 0].notna()]
                df1 = df1[df1.iloc[:, 1].notna()]
                if input_format == 'entrez' or output_format == 'entrez':
                    df1['NCBI gene ID'] = df1['NCBI gene ID'].apply(
                        lambda x: '%.f' % x)
                if input_format == 'gene_symbol' or output_format == 'gene_symbol':
                    upper = lambda x: x.upper() if isinstance(x, str) else x
                    df1['NCBI gene accession'] = df1[
                        'NCBI gene accession'].apply(upper)
                input_to_output = {
                    **input_to_output,
                    **dict(zip(df1.iloc[:, 0], df1.iloc[:, 1]))
                }
            with open(mart_file, 'wb') as f:
                pickle.dump(input_to_output, f, pickle.HIGHEST_PROTOCOL)
        # IDs without a mapping are returned as '-'.
        new_ids = [input_to_output.get(id_, '-') for id_ in ids]
        return new_ids
    else:
        return ids
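A minimal usage sketch for change_accessions, assuming the imports above, network access to Ensembl BioMart and a writable cache directory; the RefSeq accessions are only illustrative:

import tempfile

tmp_dir = tempfile.mkdtemp()
refseq_ids = ['NM_000546', 'NM_007294']  # illustrative RefSeq mRNA accessions
ensembl_ids = change_accessions(refseq_ids, 'refseq', 'ensg', 'human', tmp_dir)
print(ensembl_ids)  # IDs with no BioMart mapping come back as '-'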
Example #2
import pybiomart


def save_as_bed(ensembl_ids, output_location, output_name, header):
    '''
    Take in a one-column pandas DataFrame containing a list of Ensembl
    IDs, use the pybiomart package to query each Ensembl ID one by one
    and return a BED file row for each one. Save the output as a .bed
    file with a header containing the panel information.

    NOTE: Loops through the IDs one by one because BioMart can't handle
    large numbers of inputs (the maximum is unclear; this could be sped
    up by querying in batches, as sketched below).
    '''

    # Load in pybiomart dataset - GRCh37
    dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl',
                                host='http://grch37.ensembl.org')

    # Query each ID one at a time and add to list
    bed_list = []
    for gene in ensembl_ids:
        query = dataset.query(attributes=[
            'chromosome_name', 'start_position', 'end_position',
            'external_gene_name'
        ],
                              filters={'link_ensembl_gene_id': gene})
        bed_list += [query.to_csv(header=False, index=False, sep="\t")]

    # Save BED file
    out = str(output_location) + str(output_name) + '.bed'
    with open(out, 'w') as csv_out:
        csv_out.write(str(header) + '\n')
        for item in sorted(bed_list):
            csv_out.write(item)
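The docstring above suggests the per-gene loop could be batched. A hedged sketch of that idea, reusing the list-valued 'link_ensembl_gene_id' filter that Example #4 also passes to dataset.query(); the chunk size is an assumption, not a documented BioMart limit:

import pybiomart


def save_as_bed_batched(ensembl_ids, output_location, output_name, header,
                        chunk_size=200):
    dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl',
                                host='http://grch37.ensembl.org')

    # Query the IDs in chunks instead of one at a time.
    ids = list(ensembl_ids)
    bed_list = []
    for i in range(0, len(ids), chunk_size):
        query = dataset.query(
            attributes=['chromosome_name', 'start_position',
                        'end_position', 'external_gene_name'],
            filters={'link_ensembl_gene_id': ids[i:i + chunk_size]})
        bed_list.extend(
            query.to_csv(header=False, index=False, sep='\t').splitlines(keepends=True))

    out = str(output_location) + str(output_name) + '.bed'
    with open(out, 'w') as csv_out:
        csv_out.write(str(header) + '\n')
        for item in sorted(bed_list):
            csv_out.write(item)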
Example #3
def fetch_ensembl_exons(build='37'):
    """Fetch the ensembl exons

    Args:
        build(str): ['37', '38']
    """
    LOG.info("Fetching ensembl exons build %s ...", build)
    if build == '37':
        url = 'http://grch37.ensembl.org'
    else:
        url = 'http://www.ensembl.org'

    dataset_name = 'hsapiens_gene_ensembl'

    dataset = pybiomart.Dataset(name=dataset_name, host=url)

    attributes = [
        'chromosome_name', 'ensembl_gene_id', 'ensembl_transcript_id',
        'ensembl_exon_id', 'exon_chrom_start', 'exon_chrom_end', '5_utr_start',
        '5_utr_end', '3_utr_start', '3_utr_end', 'strand', 'rank'
    ]

    filters = {
        'chromosome_name': CHROMOSOMES,
    }

    result = dataset.query(attributes=attributes, filters=filters)

    return result
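The function above expects LOG and CHROMOSOMES to exist at module level; they are not part of this snippet. A minimal sketch with assumed stand-ins (the original constants may differ), followed by a call that queries Ensembl over the network:

import logging

import pybiomart

LOG = logging.getLogger(__name__)
# Assumed human chromosome list; the project's own constant may differ.
CHROMOSOMES = [str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']

exons_37 = fetch_ensembl_exons(build='37')
print(exons_37.head())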
Example #4
 def _get_attributes(self, attributes=None,
                    dataset_name='mmusculus_gene_ensembl'):
     """
     Get gene attributes and find principal transcripts.
     Called after ensembl_gene_id query. Dependent on pybiomart package.
     """
     # Set the dataset. Default to mouse genes.
     dataset = bm.Dataset(name=dataset_name,
                          host='http://www.ensembl.org')
     
     # Set the attributes and filters for query.
     # Some temporary hard-coding here.
     if attributes is None:
         attributes = ['mgi_symbol', 'ensembl_gene_id', 'ensembl_gene_id_version',
                       'ensembl_transcript_id', 'ensembl_transcript_id_version',
                       'transcript_appris', 'transcript_length',
                       'gene_biotype', 'transcript_count']
     filters = {'link_ensembl_gene_id': self.gene_list['ensembl_gene_id'].tolist()}
     
     # Retrieve information
     query_result = dataset.query(attributes=attributes, filters=filters)
     
     #####################################################
     ##### Find the transcript to use for each gene. #####
     #####################################################
     
     # Create a new column for the chosen transcript
     self.gene_list['ensembl_transcript_id_version'] = ''
     
     # For lncRNA, choose the longest transcript.
     lnc_qr = query_result[query_result['Gene type'] == 'lncRNA']
     lnc_ind = lnc_qr.groupby(['Gene stable ID']) \
             ['Transcript length (including UTRs and CDS)'].idxmax()
     lnc_qr = query_result.loc[lnc_ind]
     for ind, row in lnc_qr.iterrows():
         mask = self.gene_list['ensembl_gene_id'] == row['Gene stable ID']
         self.gene_list.loc[mask, 'ensembl_transcript_id_version'] = \
             row.loc['Transcript stable ID version']
         if self.verbose:
             print('\n%s' % self.gene_list[mask])
     
     # For protein coding genes, select the one with smallest APPRIS annotation
     prot_qr = query_result[query_result['Gene type'] == 'protein_coding'].copy()
     prot_qr['appris_rank'] = prot_qr['APPRIS annotation'].apply(self._appris2rank)
     # lowest APPRIS rank
     prot_ind = prot_qr.groupby(['Gene stable ID'])['appris_rank'].idxmin()
     prot_qr = query_result.loc[prot_ind]
     # longest
     prot_ind = prot_qr.groupby(['Gene stable ID'])['Transcript length (including UTRs and CDS)'].idxmax()
     prot_qr = query_result.loc[prot_ind]
     # Write the selected transcript
     for ind, row in prot_qr.iterrows():
         mask = self.gene_list['ensembl_gene_id'] == row['Gene stable ID']
         self.gene_list.loc[mask, 'ensembl_transcript_id_version'] = \
             row.loc['Transcript stable ID version']
         self.gene_list.loc[mask, 'appris_rank'] = row.loc['APPRIS annotation']
         self.gene_list.loc[mask, 'transcript_length'] = \
             row.loc['Transcript length (including UTRs and CDS)']
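The method above applies self._appris2rank, which is not shown. A hedged sketch of what such a helper might look like, assuming the usual APPRIS labels (principal1..principal5, alternative1, alternative2) with a lower rank meaning a better annotation; the real implementation may differ:

 def _appris2rank(self, annotation):
     """Map an APPRIS annotation string to a sortable rank (lower is better).

     The label set and ordering here are assumptions, not the original code.
     """
     order = ['principal1', 'principal2', 'principal3', 'principal4',
              'principal5', 'alternative1', 'alternative2']
     if isinstance(annotation, str) and annotation in order:
         return order.index(annotation)
     return len(order)  # missing or unrecognised annotations sort last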
Example #5
def _homology_map(from_org, to_org, host, cache=True):
    # Determine column names for version.
    from_column = 'ensembl_gene_id'
    to_column = to_org + '_homolog_ensembl_gene'

    # Get map_frame from Ensembl.
    dataset = pybiomart.Dataset(host=host,
                                name=from_org + '_gene_ensembl',
                                use_cache=cache)
    map_frame = dataset.query(attributes=[from_column, to_column])

    # Override map names to reflect requested types.
    map_frame.columns = [
        _format_name(from_org, 'ensembl'),
        _format_name(to_org, 'ensembl')
    ]

    return _convert_to_str(map_frame)
Example #6
def _id_map(from_type, to_type, host, organism='hsapiens', cache=True):
    # Try to lookup column as alias.
    from_column = ID_ALIASES.get(from_type, from_type)
    to_column = ID_ALIASES.get(to_type, to_type)

    # Get map_frame from Ensembl.
    dataset = pybiomart.Dataset(host=host,
                                name=organism + '_gene_ensembl',
                                use_cache=cache)

    map_frame = dataset.query(attributes=[from_column, to_column])

    # Override map names to reflect requested types.
    map_frame.columns = [
        _format_name(organism, from_type),
        _format_name(organism, to_type)
    ]

    return _convert_to_str(map_frame)
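Examples #5 and #6 depend on module-level helpers (ID_ALIASES, _format_name, _convert_to_str) that are not shown. The stand-ins below are assumptions intended only to make the snippets self-contained; the real helpers may behave differently:

import pybiomart

# Assumed alias table mapping short ID type names to BioMart attribute names.
ID_ALIASES = {
    'ensembl': 'ensembl_gene_id',
    'entrez': 'entrezgene_id',
    'symbol': 'external_gene_name',
}


def _format_name(organism, id_type):
    # Assumed naming scheme for the output columns.
    return '{}_{}'.format(organism, id_type)


def _convert_to_str(map_frame):
    # Assumed behaviour: coerce all columns to strings.
    return map_frame.astype(str)


# Example call once the helpers are in place:
# mapping = _id_map('ensembl', 'symbol', host='http://www.ensembl.org')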
Example #7
def fetch_ensembl_transcripts(build='37', chromosomes=None):
    """Fetch the ensembl transcripts

    Args:
        build(str): ['37', '38']
        chromosomes(iterable(str))

    Returns:
        result(DataFrame)
    """
    chromosomes = chromosomes or CHROMOSOMES
    LOG.info("Fetching ensembl transcripts build %s ...", build)
    if build == '37':
        url = 'http://grch37.ensembl.org'
    else:
        url = 'http://www.ensembl.org'
    
    dataset_name = 'hsapiens_gene_ensembl'
    
    dataset = pybiomart.Dataset(name=dataset_name, host=url)
    
    attributes = [
        'chromosome_name',
        'ensembl_gene_id',
        'ensembl_transcript_id',
        'transcript_start',
        'transcript_end',
        'refseq_mrna',
        'refseq_mrna_predicted',
        'refseq_ncrna',
    ]

    filters = {
        'chromosome_name': chromosomes,
    }

    result = dataset.query(
        attributes=attributes,
        filters=filters,
        use_attr_names=True,
    )
    
    return result
Example #8
def fetch_ensembl_genes(build='37'):
    """Fetch the ensembl genes
    
    Args:
        build(str): ['37', '38']
    """
    if build == '37':
        url = 'http://grch37.ensembl.org'
    else:
        url = 'http://www.ensembl.org'
    
    LOG.info("Fetching ensembl genes from %s", url)
    dataset_name = 'hsapiens_gene_ensembl'
    
    dataset = pybiomart.Dataset(name=dataset_name, host=url)
    
    attributes = [
        'chromosome_name',
        'start_position',
        'end_position',
        'ensembl_gene_id',
        'hgnc_symbol',
        'hgnc_id',
    ]
    
    filters = {
        'chromosome_name': CHROMOSOMES,
    }
    
    result = dataset.query(
        attributes=attributes,
        filters=filters,
        use_attr_names=True,
    )
    
    return result
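Because both fetchers pass use_attr_names=True, their columns keep the BioMart attribute names and the two frames can be joined on ensembl_gene_id. A small sketch, assuming pybiomart, LOG and CHROMOSOMES are available as in the sketch after Example #3:

genes = fetch_ensembl_genes(build='37')
transcripts = fetch_ensembl_transcripts(build='37')

# One row per transcript, annotated with the HGNC symbol of its gene.
tx_with_symbols = transcripts.merge(
    genes[['ensembl_gene_id', 'hgnc_symbol']],
    on='ensembl_gene_id',
    how='left',
)
print(tx_with_symbols.head())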
Example #9
    filters = {
        # 'chromosome_name': ['1', '2'],  # or the full 1-22, X, Y list
        'transcript_biotype': 'protein_coding',
        # 'link_ensembl_gene_id': 'ENSG00000139618'
    }

    use_filters = filters

    # server = pybiomart.Server(host=server_url)
    # marts = server.list_marts()
    # print(marts)

    #server = pybiomart.Server(host='http://www.ensembl.org')
    #print(server.list_marts())
    #sys.exit()

    dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl', host=server_url)

    if int(release) <= 78:
        use_attributes = []
        for attribute in sequence_attributes:
            if attribute in dataset.attributes:
                use_attributes.append(attribute)
            elif attribute == 'external_gene_name':
                use_attributes.append('external_gene_id')
                print('Replacing "external_gene_name" with "external_gene_id"')
            else:
                print('Removing Attribute: %s' % attribute)

        use_filters = {'biotype': filters['transcript_biotype']}

    #print(dataset.list_filters())
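The commented-out lines in this fragment hint at how to inspect what a dataset exposes. A short sketch using pybiomart's listing helpers; the public Ensembl host is an assumption here, since server_url is defined elsewhere in the original script:

import pybiomart

dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl',
                            host='http://www.ensembl.org')

print(dataset.list_attributes().head())  # DataFrame of available attributes
print(dataset.list_filters().head())     # DataFrame of available filters

# dataset.attributes is a dict keyed by attribute name, which is what the
# availability check above relies on.
print('external_gene_name' in dataset.attributes)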
Example #10
    type=int,
    action="store",
    dest="seed",
    default=0,
    help="Use this integer seed for reproducibility."
)

args = parser.parse_args()


# Interactive session launched inside the pycisTopic container, e.g.:
# singularity run -B /ddn1 /ddn1/vol1/site_scratch/leuven/325/vsc32528/sif/vibsinglecellnf-pycistopic-0.1.img ipython

args.sampleId = 'sample_test'
args.fragments = '/ddn1/vol1/staging/leuven/stg_00002/lcb/cflerin/analysis/pbmc_atac/analysis/nextflow/test/out/fragments/VIB_1.sinto.fragments.tsv.gz'

import pybiomart as pbm

dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site',
                                  'strand', 'external_gene_name',
                                  'transcript_biotype'])
# Drop scaffold/patch contigs and the mitochondrial genome.
filter = annot['Chromosome/scaffold name'].str.contains('CHR|GL|JH|MT')
annot = annot[~filter]
# Prefix chromosome names with 'chr' (regex=True keeps this working on newer pandas).
annot['Chromosome/scaffold name'] = annot['Chromosome/scaffold name'].str.replace(
    r'(\b\S)', r'chr\1', regex=True)
annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
annot = annot[annot.Transcript_type == 'protein_coding']

##################################################

fragments_dict = {
    args.sampleId: args.fragments
}
path_to_regions = {
    'Run_1': '/staging/leuven/stg_00002/lcb/dwmax/documents/aertslab/MLV/10x/exp/ih/20190425_NextSeq500_10x_scATAC/MLV__4aa2e0__Mouse_liver_ctrl/outs/peaks.bed',
    'Run_2': '/staging/leuven/stg_00002/lcb/lcb_projects/MLV/cellranger_atac/NovaSeq6000_20200730/MLV__0d3236__liver_fresh_07_07_2020/outs/peaks.bed',
}