Ejemplo n.º 1
0
def test_mhc_predictor_error():
    """A predictor that raises on every call must not crash epitope prediction."""
    genome = EnsemblRelease(species="mouse")
    transcript = genome.transcripts_by_name("Wdr13-001")[0]

    fragment = MutantProteinFragment(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[transcript])

    # every prediction raises; vaxrank should swallow the failures
    class FakeMHCPredictor:
        def predict_subsequences(self, x):
            raise ValueError('I throw an error in your general direction')

    predictions = predict_epitopes(
        mhc_predictor=FakeMHCPredictor(),
        protein_fragment=fragment,
        genome=genome)

    eq_(0, len(predictions))
Ejemplo n.º 2
0
def main(opts):
    """Write protein IDs and sequences for both partners of each fusion.

    Reads a tab-separated fusion table from opts['input'] and writes one
    deduplicated row per (gene, transcript, protein) to opts['output'].
    """
    # load ensembl db
    ensembl = EnsemblRelease(95)

    # read in fusion file
    fusions = pd.read_csv(opts['input'], sep='\t')

    rows = []
    for _, fusion in fusions.iterrows():
        # handle the 5' and 3' partner symmetrically
        for prefix in ("5'", "3'"):
            gene = fusion[prefix + '_gene']
            tx_id = fusion[prefix + '_transcript']

            # fetch protein ID and sequence from the transcript record
            tx = ensembl.transcript_by_id(tx_id)
            rows.append([gene, tx_id, tx.protein_id, tx.protein_sequence])

    # save output, keeping one row per unique (gene, transcript, protein)
    out = pd.DataFrame(rows, columns=['gene', 'transcript_id', 'protein_id', 'protein_sequence'])
    out.drop_duplicates(subset=['gene', 'transcript_id', 'protein_id']).to_csv(opts['output'], sep='\t', index=False)
Ejemplo n.º 3
0
 def __init__(self, VCFFileObject, cytoband, mg, es):
     """Initialize annotation state for a parsed VCF file.

     VCFFileObject -- parsed VCF wrapper (supplies GenomeBuild, header,
                      vcf records, info/format dicts)
     cytoband      -- cytoband lookup dict
     mg            -- gene-query client (presumably mygene -- confirm)
     es            -- Elasticsearch client; annotations accumulate in
                      self.bulk_action for bulk indexing
     """
     self.VCFFile = VCFFileObject
     self.cytobandDict = cytoband
     self.mg = mg
     self.es = es
     self.GeneToProteinDict = {}
     self.GenomeBuild = self.VCFFile.GenomeBuild
     # pre-load the pyensembl release for each supported genome build
     self.datahg19 = EnsemblRelease(75)
     self.datahg38 = EnsemblRelease(87)
     self.datahg18 = EnsemblRelease(54)
     self.header = self.VCFFile.header
     self.vcf = self.VCFFile.vcf
     self.info_dict = self.VCFFile.info_dict
     self.format_dict = self.VCFFile.format_dict
     # BUGFIX: the organism string was garbled ("H**o Sapiens", an artifact
     # of a word filter); restore the intended species name.
     self.organism = "Homo Sapiens"
     self.snpLink = "None"
     self.referenceBuild = 75
     self.server = 'http://rest.ensembl.org'
     self.geneName = "None"
     self.ENSG = "None"
     self.geneLink = "None"
     self.cytoband = "None"
     self.mapping_url = 'http://www.uniprot.org/mapping/'
     self.uniprot_url = 'http://www.uniprot.org/uniprot/'
     self.proteinName = "None"
     self.UniProtLink = "None"
     self.genotype_format = {}
     self.vcf_info = {}
     self.bulk_action = []
Ejemplo n.º 4
0
def ENSEMBLID_to_geneSymbol(ENSEMBL, Ensembl_Release=75):
    """Map Ensembl gene ID(s) to gene symbol(s).

    ENSEMBL may be a single ID string (returns one symbol) or a list of IDs
    (returns a list of symbols in the same order). Ensembl_Release selects
    the pyensembl annotation release (default 75 / GRCh37).
    """
    data = EnsemblRelease(Ensembl_Release)
    # isinstance is the idiomatic type test (also accepts list subclasses)
    if isinstance(ENSEMBL, list):
        return [data.gene_name_of_gene_id(eid) for eid in ENSEMBL]
    return data.gene_name_of_gene_id(ENSEMBL)
Ejemplo n.º 5
0
def random_variants(count,
                    ensembl_release=MAX_ENSEMBL_RELEASE,
                    deletions=True,
                    insertions=True,
                    random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.
    ensembl_release : int
        Ensembl release used to pick transcripts.
    deletions, insertions : bool
        Whether deletion / two-base insertion alts may be drawn in
        addition to substitutions.
    random_seed : int or None
        Seed for reproducible draws.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    # listing all transcript IDs is expensive; memoize per release
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    variants = []

    # NOTE(review): loops until enough usable transcripts are drawn; a
    # sparse annotation could in principle spin for a long time.
    while len(variants) < count:
        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)

        try:
            seq = transcript.sequence
        except ValueError as e:
            # BUGFIX: logging.warn is a deprecated alias of logging.warning
            logging.warning(e)
            # can't get sequence for non-coding transcripts
            continue

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]

        if insertions:
            nucleotide_pairs = [
                x + y for x in STANDARD_NUCLEOTIDES
                for y in STANDARD_NUCLEOTIDES
            ]
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            # empty alt encodes a single-base deletion
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)
        variant = Variant(transcript.contig,
                          base1_genomic_position,
                          ref=ref,
                          alt=alt,
                          ensembl=ensembl)
        variants.append(variant)
    return VariantCollection(variants)
    def __init__(self,
                 release,
                 species,
                 output,
                 best_file=None,
                 alias_file=None,
                 custom_cache=None):
        """Build a transcript-annotation JSON for an Ensembl release/species.

        release      -- Ensembl release number passed to pyensembl
        species      -- species name passed to pyensembl
        output       -- destination for the generated JSON
        best_file    -- optional file naming preferred transcripts; when
                        absent, best transcripts are chosen automatically
        alias_file   -- optional gene-alias file; when absent, aliases
                        default to empty sets
        custom_cache -- optional alternative cache location (used by the
                        domain-cache helpers elsewhere in this class)
        """

        # accumulated annotation keyed later by build_json()
        self.annotation = {}

        self.custom_cache = custom_cache
        self.cache_prefix = None
        # record generation time for provenance in the output
        self.gen_time = get_date()
        self.release = release
        self.species = species
        self.output = output

        self.best_file = best_file
        self.alias_file = alias_file

        if self.alias_file:
            self.alias = parse_alias_file(self.alias_file)
        else:
            # defaultdict(set) so unknown genes yield an empty alias set
            self.alias = defaultdict(set)

        # download/index the Ensembl annotation, then the domain cache
        self.data = EnsemblRelease(release, species)
        self.download_pyensembl_cache()
        self.get_domain_cache()

        if self.best_file:
            self.best = parse_best_file(self.best_file)
        else:
            self.best = self.choose_best_transcripts()

        self.build_json()
Ejemplo n.º 7
0
def main():
    """Annotate each locus row of the input file with overlapping gene names.

    argv[1]: input file (carriage-return separated lines; first line is a
             header and is passed through untouched)
    argv[2]: output filename
    """
    # fetch before running program using command:
    # pyensembl install --release <list of Ensembl release numbers> --species <species-name>
    ensembl_num = 81
    gen_ref = EnsemblRelease(ensembl_num)

    lines = open(sys.argv[1]).read().split("\r")
    # 'with' guarantees the output file is closed even on error
    with open(sys.argv[2], "w") as output_file:
        firstline = True
        for line in lines:
            if firstline:
                # pass the header row through untouched
                output_file.write(line + "\r")
                firstline = False
                continue
            parameters = strip_values(line)
            if parameters == "nothing":
                # unparseable row: echo it unchanged
                output_file.write(line + "\r")
            else:
                gene_names = gen_ref.gene_names_at_locus(
                    contig=parameters[0], position=parameters[1], end=parameters[2])
                # append every overlapping gene name as an extra column
                for gene in gene_names:
                    line = line + "," + gene
                output_file.write(line + "\r")

        # BUGFIX: corrected typos in the user-facing footer
        # ("Ensemble" -> "Ensembl", "locus_serach" -> "locus_search")
        output_file.write("Generated on " + time.strftime("%m/%d/%Y") +
                          " with Ensembl Release " + str(ensembl_num) +
                          " and locus_search.py v1.")
Ejemplo n.º 8
0
def test_mhc_predictor_error():
    """predict_epitopes must yield zero results, not crash, when the
    underlying MHC predictor raises on every call."""
    mouse_genome = EnsemblRelease(species="mouse")
    supporting_tx = mouse_genome.transcripts_by_name("Wdr13-001")

    mutant_fragment = MutantProteinFragment(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[supporting_tx[0]])

    # throws an error for each prediction, make sure vaxrank doesn't fall down
    class FakeMHCPredictor:
        def predict_subsequences(self, x):
            raise ValueError('I throw an error in your general direction')

    result = predict_epitopes(mhc_predictor=FakeMHCPredictor(),
                              protein_fragment=mutant_fragment,
                              genome=mouse_genome)

    eq_(0, len(result))
Ejemplo n.º 9
0
 def __init__(self, genome='hg19'):
     """Select the Ensembl release and REST endpoint for a genome build.

     'hg19' maps to release 75 on the GRCh37 REST mirror; anything else
     uses release 77 on the main REST server.
     """
     if genome != 'hg19':
         self.version = 77
         self.rest_url = "http://rest.ensembl.org"
     else:
         self.version = 75
         self.rest_url = "http://grch37.rest.ensembl.org"
     self.db = EnsemblRelease(self.version)
Ejemplo n.º 10
0
class ScrapeEnsembl():
    ''' Query a local pyensembl database for gene information at a genomic
    position and for a gene's canonical (longest protein-coding) transcript.
    '''
    def __init__(self, query, hg_version):
        # queries arrive as e.g. "chr1:12345"; pyensembl expects bare contigs
        self.query = query.replace("chr","")
        self.hg_version = ScrapeEnsembl.genome.get(hg_version) # convert to ensembl release
        self.hg = EnsemblRelease(self.hg_version) # convert to ensembl release object

    
    # human genome build -> Ensembl release number
    genome = {"hg19": 75, "hg38": 83}
    
    def get_gene_info(self):
        ''' Get the gene information at a given genomic position

        Returns a (name, id, biotype, location) tuple, an error message
        string when no gene overlaps the position, or None implicitly when
        the query does not look like "chrom:pos".
        '''
         
        # check if the input is a genomic position or genomic range
        if re.search(r"[-:]", self.query) and self.query.replace(":","").isdigit():

            chrom = int(self.query.split(":")[0])
            pos = int(self.query.split(":")[1])
            gene_name = self.hg.gene_names_at_locus(contig=chrom, position=pos)
            if not gene_name:
                msg = " ".join(("No gene found at",self.query,"for genome version",
                                str(self.hg_version)))
                return msg 
            
            gene_info = self.hg.genes_by_name(gene_name[0])
            # gene_info[0].loaction doesn't work, hence the mess below
            # (parses "...=chrom:start-stop)" out of the Gene object's repr;
            # fragile -- depends on pyensembl's repr format)
            gene_location = str(gene_info[0]).split(",")[-1][:-1].split("=")[1]

            gene_info = (gene_info[0].name, gene_info[0].id, 
                         gene_info[0].biotype, gene_location)
            
            return(gene_info)
    
    
    def get_canonical_transcript(self, gene_name):
        ''' Determine and return the canonical transcript of the given gene

        "Canonical" here means the largest protein-coding transcript by
        genomic span; returns None implicitly when the gene has no
        protein-coding transcripts.
        '''
        all_transcripts = self.hg.transcript_ids_of_gene_name(gene_name)
        all_transcript_details = [self.hg.transcript_by_id(x) for x in all_transcripts]
        protein_coding_transcripts = []
        for x in all_transcript_details:
            # fields are scraped out of the Transcript object's repr string;
            # fragile -- field positions depend on pyensembl's repr format
            split_transcript_info = re.split(r"[=,]",str(x))
            transcript = split_transcript_info[1]
            transcript_type = split_transcript_info[9]
            location = split_transcript_info[-1][:-1]
            start = re.split(r"[:-]", location)[1]
            stop = re.split(r"[:-]", location)[2]
            size = int(stop) - int(start)
            if transcript_type == "protein_coding":
                protein_coding_transcripts.append((size,transcript,transcript_type)) 
        
        # sort by size and return the largest protein coding transcript
        if protein_coding_transcripts:    
            canonical_transcript = sorted(protein_coding_transcripts)[-1][1]
            return canonical_transcript
Ejemplo n.º 11
0
    def mapping_id(self):
        """Add Ensembl gene-ID columns for both protein partners of each
        interaction in the filtered table."""
        esb = EnsemblRelease(77)

        def to_gene_ids(proteins):
            # one gene ID per protein ID, preserving row order
            return [esb.gene_id_of_protein_id(protein) for protein in proteins]

        self.data_filtered['ensembl_id_1'] = to_gene_ids(self.data_filtered.protein1)
        self.data_filtered['ensembl_id_2'] = to_gene_ids(self.data_filtered.protein2)
Ejemplo n.º 12
0
def _test_db_index(mock_index, db_exists):
    """
    Return True if the GTF database gets created, which should
    be different depending on whether the database already existed.

    Note: we need to mock the reference transcript indexing, as we're
    testing GTF indexing.
    """
    genome = EnsemblRelease(54)
    # pretend the database does/doesn't exist and stub out creation
    genome.db._connect_if_exists = Mock(return_value=db_exists)
    genome.db._create_database = Mock()
    genome.index(force=False)
    return genome.db._create_database.called
Ejemplo n.º 13
0
def main():
    """Index nanopolish eventalign output and build per-position JSON data.

    Step (1) combines multi-event alignments per position; step (2) stores
    read info per position, either per-gene (with an Ensembl or customised
    genome annotation) or per-transcript.
    """
    args = get_args()
    #
    n_processes = args.n_processes
    eventalign_filepath = args.eventalign
    summary_filepath = args.summary
    chunk_size = args.chunk_size
    out_dir = args.out_dir
    ensembl_version = args.ensembl
    ensembl_species = args.species
    readcount_min = args.readcount_min
    readcount_max = args.readcount_max
    resume = args.resume
    genome = args.genome

    customised_genome = args.customised_genome
    if customised_genome and (None in [
            args.reference_name, args.annotation_name, args.gtf_path_or_url,
            args.transcript_fasta_paths_or_urls
    ]):
        print(
            'If you have your own customised genome not in Ensembl, please provide the following'
        )
        print('- reference_name')
        print('- annotation_name')
        print('- gtf_path_or_url')
        print('- transcript_fasta_paths_or_urls')
        # BUGFIX: abort on missing arguments -- previously execution fell
        # through and later raised NameError (reference_name etc. were
        # never bound) when building the customised Genome below.
        return
    else:
        reference_name = args.reference_name
        annotation_name = args.annotation_name
        gtf_path_or_url = args.gtf_path_or_url
        transcript_fasta_paths_or_urls = args.transcript_fasta_paths_or_urls

    misc.makedirs(out_dir)  #todo: check every level.

    # (1) For each read, combine multiple events aligned to the same positions, the results from nanopolish eventalign, into a single event per position.
    if not args.skip_eventalign_indexing:
        parallel_index(eventalign_filepath, summary_filepath, chunk_size,
                       out_dir, n_processes, resume)

    # (2) Create a .json file, where the info of all reads are stored per position, for modelling.
    if genome:
        if customised_genome:
            db = Genome(
                reference_name=reference_name,
                annotation_name=annotation_name,
                gtf_path_or_url=gtf_path_or_url,
                transcript_fasta_paths_or_urls=transcript_fasta_paths_or_urls)
            # parse GTF and construct database of genomic features
            db.index()
        else:
            db = EnsemblRelease(
                ensembl_version, ensembl_species
            )  # Default: human reference genome GRCh38 release 91 used in the ont mapping.
        parallel_preprocess_gene(eventalign_filepath, db, out_dir, n_processes,
                                 readcount_min, readcount_max, resume)

    else:
        parallel_preprocess_tx(eventalign_filepath, out_dir, n_processes,
                               readcount_min, readcount_max, resume)
Ejemplo n.º 14
0
def main():
    """Combine eventalign events per position, then build the per-position
    JSON used for modelling (per-gene or per-transcript)."""
    args = get_args()

    n_processes = args.n_processes
    out_dir = args.out_dir
    resume = args.resume
    readcount_min = args.readcount_min
    readcount_max = args.readcount_max

    misc.makedirs(out_dir) #todo: check every level.

    # (1) For each read, combine multiple events aligned to the same positions, the results from nanopolish eventalign, into a single event per position.
    eventalign_log_filepath = os.path.join(out_dir,'eventalign.log')
    if not helper.is_successful(eventalign_log_filepath):
        parallel_combine(args.eventalign, args.summary, out_dir,
                         n_processes, resume)

    # (2) Create a .json file, where the info of all reads are stored per position, for modelling.
    if args.genome:
        # Default: human reference genome GRCh38 release 91 used in the ont mapping.
        ensembl = EnsemblRelease(args.ensembl, args.species)
        parallel_preprocess_gene(ensembl, out_dir, n_processes,
                                 readcount_min, readcount_max, resume)
    else:
        parallel_preprocess_tx(out_dir, n_processes,
                               readcount_min, readcount_max, resume)
Ejemplo n.º 15
0
def test_drop_duplicates():
    """Identical variants must collapse to one entry in a VariantCollection."""
    genome = EnsemblRelease(78)
    first = Variant("1", 3000, "A", "G", ensembl=genome)
    duplicate = Variant("1", 3000, "A", "G", ensembl=genome)
    other = Variant("2", 10, "G", "T", ensembl=genome)
    # repeats and equal copies should all be deduplicated
    collection = VariantCollection(variants=[first, first, duplicate, other])
    assert len(collection) == 2
Ejemplo n.º 16
0
def _get_annotation(adata, retries=3):
    """Insert meta data into adata.obs."""
    from pyensembl import EnsemblRelease

    # annotation release/species are read from the AnnData's uns slot
    data = EnsemblRelease(
        adata.uns["release"],
        adata.uns["species"],
    )
    # download/index the annotation, retrying on network timeouts
    # NOTE(review): if every attempt times out we fall through silently and
    # the gene lookups below will likely fail -- confirm this is intended
    for _ in range(retries):
        try:
            with patch_datacache():
                data.download(overwrite=False)
                data.index(overwrite=False)
            break
        except TimeoutError:
            pass

    # get ensemble gene coordinate
    genes = []
    # var index entries look like "ENSG....<version>"; strip the version
    for i in adata.var.index.map(lambda x: x.split(".")[0]):
        try:
            gene = data.gene_by_id(i)
            genes.append([
                "chr%s" % gene.contig,
                gene.start,
                gene.end,
                gene.strand,
            ])
        except ValueError:
            # unknown gene ID: keep row alignment with NaN placeholders
            genes.append([np.nan, np.nan, np.nan, np.nan])
    old_col = adata.var.columns.values
    # append chr/start/end/strand columns while preserving existing ones
    adata.var = pd.concat(
        [adata.var, pd.DataFrame(genes, index=adata.var_names)], axis=1)
    adata.var.columns = np.hstack(
        [old_col, np.array(["chr", "start", "end", "strand"])])
def main(opts):
    """Map each fusion partner's gene symbol to a transcript ID (from a
    reference transcript table, optionally overridden by a custom table),
    validate via pick_transcript, and write breakpoint rows."""
    # read in data
    df = pd.read_csv(opts['input'], sep='\t', header=None, names=['Gene1', 'Break1', 'Gene2', 'Break2'])
    tx = pd.read_csv(opts['transcript'], sep='\t')

    # merge in transcript (one left-join per fusion partner)
    rename_dict = {'symbol': 'Gene1', 'Ensembl_nuc': 'Transcript1'}
    df = pd.merge(df, tx.rename(columns=rename_dict)[['Gene1', 'Transcript1']],
                  on='Gene1', how='left')
    rename_dict = {'symbol': 'Gene2', 'Ensembl_nuc': 'Transcript2'}
    df = pd.merge(df, tx.rename(columns=rename_dict)[['Gene2', 'Transcript2']],
                  on='Gene2', how='left')

    # remove the transcript version number ("ENST....N" -> "ENST...")
    tmp1 = df['Transcript1'].str.split('.', expand=True)[0]
    tmp2 = df['Transcript2'].str.split('.', expand=True)[0]
    df['Transcript1'] = tmp1
    df['Transcript2'] = tmp2

    # merge in custom transcript
    custom_tx = pd.read_csv(opts['custom'], sep='\t')
    if len(custom_tx):
        # merge custom tx
        rename_dict = {'gene': 'Gene1', 'transcript_id': 'custom_transcript1'}
        df = pd.merge(df, custom_tx.rename(columns=rename_dict)[['Gene1', 'custom_transcript1']],
                      on='Gene1', how='left')
        rename_dict = {'gene': 'Gene2', 'transcript_id': 'custom_transcript2'}
        df = pd.merge(df, custom_tx.rename(columns=rename_dict)[['Gene2', 'custom_transcript2']],
                      on='Gene2', how='left')

        # replace mane tx with custom tx wherever a custom entry exists
        is_not_null = ~df['custom_transcript1'].isnull()
        df.loc[is_not_null, 'Transcript1'] = df.loc[is_not_null, 'custom_transcript1']
        is_not_null = ~df['custom_transcript2'].isnull()
        df.loc[is_not_null, 'Transcript2'] = df.loc[is_not_null, 'custom_transcript2']

    # Check transcripts
    data = EnsemblRelease(95)

    # figure out the appropriate canonical tx
    output_list = []
    for ix, row in df.iterrows():
        # get transcript (pick_transcript is defined elsewhere; presumably
        # validates the transcript against the breakpoint -- confirm)
        mytx1 = pick_transcript(data, row['Gene1'], row['Break1'], row['Transcript1'])
        mytx2 = pick_transcript(data, row['Gene2'], row['Break2'], row['Transcript2'])

        # fill in whether to replace gene ID with transcript ID
        mygene1 = mytx1 if mytx1 else row['Gene1']
        mygene2 = mytx2 if mytx2 else row['Gene2']

        # append output
        output_list.append([mygene1, row['Break1'], mygene2, row['Break2']])

    # save output file
    with open(opts['output'], 'w') as whandle:
        mywriter = csv.writer(whandle, delimiter='\t', lineterminator='\n')
        mywriter.writerows(output_list)
def ensembl_to_sym2(df):
    """Re-index *df* from Ensembl gene IDs to unique gene symbols.

    Rows whose ID is unknown to Ensembl release 87 are dropped (in place),
    symbols are deduplicated keeping the first row seen per symbol, and the
    result is coerced to numeric values where possible.
    """
    import pandas as pd
    from pyensembl import EnsemblRelease
    data = EnsemblRelease(87)
    ensembl_to_sym_dict = {}
    seen_symbols = set()
    no_ensembl_list = []
    for e in df.index.tolist():
        try:
            g = data.gene_by_id(e)
        except ValueError:
            no_ensembl_list.append(e)
            # BUGFIX: originally fell through after the except and reused the
            # previous iteration's gene object (NameError on the very first
            # lookup failure); skip unknown IDs instead.
            continue
        symbol = g.gene_name.strip()
        if symbol not in seen_symbols:
            ensembl_to_sym_dict[e] = symbol
            seen_symbols.add(symbol)
    df.drop(no_ensembl_list, inplace=True)
    renamed_df = df.rename(index=ensembl_to_sym_dict)
    renamed_df_grouped = renamed_df.groupby(renamed_df.index).first()
    # BUGFIX: DataFrame.convert_objects was removed from pandas; coerce each
    # column to numeric instead (non-convertible values become NaN, matching
    # convert_objects(convert_numeric=True)).
    numeric_df = renamed_df_grouped.apply(pd.to_numeric, errors='coerce')
    return(numeric_df)
Ejemplo n.º 19
0
    def __init__(self,
                 bounds,
                 orients,
                 res=10000,
                 default_chrom='chrN',
                 species='human',
                 release=97,
                 filter_=None):
        """Build a gene track for an assembled region.

        bounds/orients -- block boundaries and orientations consumed by
                          get_blocks (defined elsewhere)
        res            -- bin resolution used when assembling blocks
        default_chrom  -- chromosome label given to the assembled region
        species/release -- pyensembl annotation to download/index
        filter_        -- optional iterable of gene symbols restricting
                          output (e.g. cancer-related genes)
        """

        # download/index the Ensembl annotation on first use
        ref = EnsemblRelease(release, species=species)
        ref.download()
        ref.index()
        self.ref = ref
        if not filter_ is None:
            # a pre-defined gene list, such as cancer-related genes
            # (upper-cased for case-insensitive matching)
            self.filter_list = set([g.upper() for g in filter_])
        else:
            self.filter_list = None

        self.blocks = get_blocks(bounds, orients, res)

        self.chrom_name = default_chrom

        # map_genes populates self.genes (defined elsewhere in this class)
        self.map_genes()
        # expose the mapped genes as a tab-separated in-memory text file
        self.file_handler = io.StringIO('\n'.join(
            ['\t'.join(list(map(str, g))) for g in self.genes]))
Ejemplo n.º 20
0
 def get_ensembl_db( sp, annotation_version ):
     """Download and index the Ensembl database for a species and release.

     sp                 -- species name passed to pyensembl
     annotation_version -- Ensembl release number
     Returns the indexed EnsemblRelease object; wraps pyensembl failures in
     DenCellORFException.
     NOTE(review): no ``self`` parameter -- presumably decorated as a
     @staticmethod outside this view; confirm.
     """
     
     Logger.get_instance().debug( 'EnsemblUtil.get_ensembl_db(): Downloading and indexing the Ensembl' +
                                  ' database release ' + str( annotation_version ) + 
                                  ' for ' + sp + '.' )
     
     ensembl_db = EnsemblRelease( release = annotation_version,
                                  species = sp )
     
     # Download and index the database if not yet in the temporary folder
     Logger.get_instance().debug( 'EnsemblUtil.get_ensembl_db(): Downloading the Ensembl' +
                                  ' database release ' + str( annotation_version) + 
                                  ' for ' + sp + '.' )
     try:
         ensembl_db.download()
     except Exception as e:
         raise DenCellORFException( 'EnsemblUtil.get_ensembl_db(): An error occurred trying to' +
                                    ' download the Ensembl database using pyensembl.', e )
         
     
     Logger.get_instance().debug( 'EnsemblUtil.get_ensembl_db(): Indexing the Ensembl' +
                                  ' database release ' + str( annotation_version) + 
                                  ' for ' + sp + '.' )
     try:
         ensembl_db.index()
     except Exception as e:
         raise DenCellORFException( 'EnsemblUtil.get_ensembl_db(): An error occurred trying to' +
                                    ' index the Ensembl database using pyensembl.', e )
     
     return ensembl_db
Ejemplo n.º 21
0
def main(opts):
    """Collect per-fusion protein/transcript results from a directory of
    fusion-caller output folders into a single annotated table.

    Each subdirectory of opts['input_dir'] is expected to be named
    "...-<break1>_...-<break2>..." and to contain a '*_protein.fa' fasta
    plus a '*fusion_transcripts.csv' info file.
    """
    # load ensembl db
    data = EnsemblRelease(95)

    output_list = []
    column_list = None
    pattern = os.path.join(opts['input_dir'], '*')
    for d in glob.glob(pattern):
        # figure out break points (parsed from the directory name; fragile
        # -- depends on the "-" / "_" naming convention described above)
        mybase = os.path.basename(d)
        break1 = mybase.split('_')[0].split('-')[-1]
        break2 = mybase.split('_')[1].split('-')[-1]

        # read fasta (may be absent when no protein was produced)
        prot_fa_paths = glob.glob(os.path.join(d, '*_protein.fa'))
        if prot_fa_paths:
            prot_fa_path = prot_fa_paths[0]
            p_id, tx_id, seq = read_fasta(prot_fa_path)
        else:
            p_id, tx_id, seq = '', '', ''

        # read in fusion info (header row + first data row only)
        fus_info_path = glob.glob(os.path.join(d, '*fusion_transcripts.csv'))[0]
        with open(fus_info_path) as handle:
            myreader = csv.reader(handle, delimiter=',')
            tmp_columns = next(myreader)
            if column_list is None:
                column_list = tmp_columns
            tmp_list = next(myreader)

        # figure out the pos of the break (get_cds_pos defined elsewhere)
        codon_pos1, relative_pos1, prot_len1 = get_cds_pos(data, tmp_list[2], int(break1))
        codon_pos2, relative_pos2, prot_len2 = get_cds_pos(data, tmp_list[3], int(break2))
        # figure out the chromosome
        chrom1 = get_chrom(data, tmp_list[2])
        chrom2 = get_chrom(data, tmp_list[3])

        # append results
        output_list.append(tmp_list + [p_id+':'+break1+"-"+break2, tx_id, seq,
                                       chrom1, break1, chrom2, break2, codon_pos1, codon_pos2,
                                       relative_pos1, relative_pos2, prot_len1, prot_len2])

    # merge results
    mycols = column_list+['ID', 'TX_ID', 'protein_sequence', 'chrom1', 'Break1', 'chrom2', 'Break2', 'CodonPos1',
                          'CodonPos2', 'RelativePos1', 'RelativePos2', 'ProtLen1', 'ProtLen2']
    output_df = pd.DataFrame(output_list, columns=mycols)

    # add gene ID
    output_df['GENE_ID'] = output_df["5'_gene"] + '--' + output_df["3'_gene"]

    # save results
    output_df.to_csv(opts['output'], sep='\t', index=False)
Ejemplo n.º 22
0
def test_reference_peptide_logic():
    """Epitopes overlapping the reference proteome score as wildtype;
    novel epitopes score as mutant."""
    genome = EnsemblRelease(species="mouse")
    transcript = genome.transcripts_by_name("Wdr13-001")[0]

    fragment = MutantProteinFragment(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[transcript])

    predictions = predict_epitopes(
        mhc_predictor=RandomBindingPredictor(["H-2-Kb"]),
        protein_fragment=fragment,
        genome=genome)

    # 'NCDESLLAS' occurs in protein ENSMUSP00000033506; 'LDVIVNCDE' does not
    in_reference = predictions[('NCDESLLAS', 'H-2-Kb')]
    novel = predictions[('LDVIVNCDE', 'H-2-Kb')]
    ok_(in_reference.occurs_in_reference)
    ok_(not novel.occurs_in_reference)

    # a two-epitope vaccine peptide makes it easy to check mutant/WT scores
    # from single contributors
    peptide = VaccinePeptide(fragment, [in_reference, novel])

    eq_(in_reference.logistic_epitope_score(),
        peptide.wildtype_epitope_score)
    eq_(novel.logistic_epitope_score(),
        peptide.mutant_epitope_score)
Ejemplo n.º 23
0
def fetch_ensembl_release(path=None, release='75'):
    """Download and index pyensembl genome files for *release*.

    path: optional cache directory. pyensembl reads PYENSEMBL_CACHE_DIR
    when the EnsemblRelease object is constructed, so it must be set first.
    """
    import os
    from pyensembl import Genome, EnsemblRelease
    # BUGFIX: assigning genome.cache_directory_path after download/index had
    # no effect on where files were cached; point pyensembl at the cache
    # directory *before* constructing the release (matching the sibling
    # implementation of this helper).
    if path is not None:
        os.environ['PYENSEMBL_CACHE_DIR'] = path
    #this call should download the files
    genome = EnsemblRelease(release, species='human')
    genome.download(overwrite=False)
    genome.index(overwrite=False)
    print('pyensembl genome files cached in %s' % genome.cache_directory_path)
    #run_pyensembl_install()
    return
Ejemplo n.º 24
0
class Ensembl(object):
    """Thin wrapper around pyensembl release 75 for gene-at-locus lookups."""

    def __init__(self):
        self.db = EnsemblRelease(75)

    def annotate_one_gene(self, location):
        """Return gene names overlapping a 'chrom:start-stop' location string."""
        contig, begin, end = self.parse_location(location)
        return self.db.gene_names_at_locus(contig, begin, end)

    @staticmethod
    def parse_location(loc):
        """Split 'chrom:start-stop' into (chrom, int(start), int(stop))."""
        chrom_and_start, stop = loc.split('-')
        chrom, start = chrom_and_start.split(':')
        return chrom, int(start), int(stop)
Ejemplo n.º 25
0
def test_reference_peptide_logic():
    """Reference-overlap flags drive the wildtype vs mutant epitope scores
    of a VaccinePeptide."""
    mouse_genome = EnsemblRelease(species="mouse")
    wdr13_tx = mouse_genome.transcripts_by_name("Wdr13-001")

    mutant_fragment = MutantProteinFragment(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[wdr13_tx[0]])

    results = predict_epitopes(
        mhc_predictor=RandomBindingPredictor(["H-2-Kb"]),
        protein_fragment=mutant_fragment,
        genome=mouse_genome)

    # occurs in protein ENSMUSP00000033506
    reference_hit = results[('NCDESLLAS', 'H-2-Kb')]
    reference_miss = results[('LDVIVNCDE', 'H-2-Kb')]
    ok_(reference_hit.occurs_in_reference)
    ok_(not reference_miss.occurs_in_reference)

    # construct a simple vaccine peptide having these two predictions, which makes it easy to check
    # for mutant/WT scores from single contributors
    vaccine_peptide = VaccinePeptide(
        mutant_fragment, [reference_hit, reference_miss])

    eq_(reference_hit.logistic_epitope_score(),
        vaccine_peptide.wildtype_epitope_score)
    eq_(reference_miss.logistic_epitope_score(),
        vaccine_peptide.mutant_epitope_score)
Ejemplo n.º 26
0
 def parse_ref_exons(self):
     """ Return fasta reference with only the sequences needed

     Builds a 0/1 mask over [self.start, self.stop) marking exonic bases,
     and returns it with the exon-number table restricted to the histogram
     window (self.hist). Assumes self.chrom/start/stop/hist are set --
     attributes come from outside this view; confirm.
     """
     ens_db = EnsemblRelease(75)
     try:
         exons = ens_db.exons_at_locus(self.chrom, self.start, self.stop)
     except ValueError as e:
         # Load pyensembl db
         raise e
     # one cell per base in the query window; 1 marks an exonic position
     exon_array = np.zeros(self.stop - self.start)
     exon_numbers = self.get_exon_numbers(ens_db, exons[0].gene_name)
     for exobj in exons:
         # convert genomic exon coordinates to window-relative offsets
         start = exobj.start - self.start
         stop = exobj.end - self.start
         i = start
         while i < stop:
             exon_array[i] = 1
             i += 1
     # 2:29,448,326-29,448,432 exon 19
     # exon 22 start: 29445210
     # exon 18 end: 29449940
     # intron 19: 29446395-29448326
     # ATI initiation 29446768-29448326
     return exon_array, exon_numbers[(exon_numbers['start'] > self.hist[0][0]) &
                                     (exon_numbers['start'] < self.hist[0][-1])]
Ejemplo n.º 27
0
def gene_Length(list_of_genes, ensembl_id=True, Ensembl_Release=75):
    """Return a DataFrame of gene lengths (in kb) for the given genes.

    When ensembl_id is True the inputs are Ensembl gene IDs; otherwise
    they are gene names (first locus is used for each name).
    """
    gn = EnsemblRelease(Ensembl_Release)
    if ensembl_id:
        loci = [gn.locus_of_gene_id(gene) for gene in list_of_genes]
        starts = [locus.start for locus in loci]
        ends = [locus.end for locus in loci]
    else:
        loci = [gn.loci_of_gene_names(gene) for gene in list_of_genes]
        starts = [locus[0].start for locus in loci]
        ends = [locus[0].end for locus in loci]

    lengths = np.array(ends) - np.array(starts)
    return pa.DataFrame({
        'gene_names': list_of_genes,
        "gene_length": lengths / 1000
    })
Ejemplo n.º 28
0
def fetch_ensembl_release(path=None, release='75'):
    """Download and index the pyensembl genome files for *release*."""
    from pyensembl import Genome, EnsemblRelease
    # point pyensembl at a custom cache dir before constructing the release
    if path is not None:
        os.environ['PYENSEMBL_CACHE_DIR'] = path
    # constructing the release then downloading/indexing fetches the files
    genome = EnsemblRelease(release, species='human')
    genome.download()
    genome.index()
    return
Ejemplo n.º 29
0
def test_version_too_old_47():
    # Release 47 predates the range pyensembl supports, so construction
    # should fail. NOTE(review): the expected-exception decorator (e.g.
    # nose's @raises) presumably sits above this function outside this
    # view -- confirm.
    EnsemblRelease(47)
Ejemplo n.º 30
0
def random_variants(count,
                    ensembl_release=MAX_ENSEMBL_RELEASE,
                    deletions=True,
                    insertions=True,
                    random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.
    ensembl_release : int
        Ensembl release whose transcripts are sampled.
    deletions : bool
        Include single-base deletions (empty alt) among candidates.
    insertions : bool
        Include two-base insertions among candidates.
    random_seed : int, optional
        Seed for reproducible sampling.

    Raises
    ------
    ValueError
        If `count` variants could not be generated within the
        bounded number of attempts.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    # transcript ID lists are expensive to fetch, so cache them per release
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    variants = []

    # we should finish way before this loop is over but just in case
    # something is wrong with PyEnsembl we want to avoid an infinite loop
    for _ in range(count * 100):
        if len(variants) < count:
            transcript_id = rng.choice(transcript_ids)
            transcript = ensembl.transcript_by_id(transcript_id)

            if not transcript.complete:
                continue

            exon = rng.choice(transcript.exons)
            base1_genomic_position = rng.randint(exon.start, exon.end)
            transcript_offset = transcript.spliced_offset(
                base1_genomic_position)

            # sequence access can raise ValueError for transcripts without
            # a usable cDNA sequence; skip them instead of crashing
            # (matches the guarded sibling implementation in this file)
            try:
                seq = transcript.sequence
            except ValueError:
                continue

            ref = str(seq[transcript_offset])
            if transcript.on_backward_strand:
                ref = reverse_complement(ref)

            alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]

            if insertions:
                nucleotide_pairs = [
                    x + y for x in STANDARD_NUCLEOTIDES
                    for y in STANDARD_NUCLEOTIDES
                ]
                alt_nucleotides.extend(nucleotide_pairs)
            if deletions:
                alt_nucleotides.append("")
            alt = rng.choice(alt_nucleotides)
            variant = Variant(transcript.contig,
                              base1_genomic_position,
                              ref=ref,
                              alt=alt,
                              ensembl=ensembl)
            variants.append(variant)
        else:
            return VariantCollection(variants)
    raise ValueError(("Unable to generate %d random variants, "
                      "there may be a problem with PyEnsembl") % count)
Ejemplo n.º 31
0
def random_variants(
        count,
        ensembl_release=MAX_ENSEMBL_RELEASE,
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.
    ensembl_release : int
        Ensembl release whose transcripts are sampled.
    deletions : bool
        Include single-base deletions (empty alt) among candidates.
    insertions : bool
        Include two-base insertions among candidates.
    random_seed : int, optional
        Seed for reproducible sampling.

    Raises
    ------
    ValueError
        If `count` variants could not be generated within the
        bounded number of attempts.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    # transcript ID lists are expensive to fetch, so cache them per release
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    variants = []

    # we should finish way before this loop is over but just in case
    # something is wrong with PyEnsembl we want to avoid an infinite loop
    for _ in range(count * 100):
        if len(variants) < count:
            transcript_id = rng.choice(transcript_ids)
            transcript = ensembl.transcript_by_id(transcript_id)

            if not transcript.complete:
                continue

            exon = rng.choice(transcript.exons)
            base1_genomic_position = rng.randint(exon.start, exon.end)
            transcript_offset = transcript.spliced_offset(base1_genomic_position)

            # sequence access can raise ValueError for transcripts without
            # a usable cDNA sequence; skip them instead of crashing
            # (matches the guarded sibling implementation in this file)
            try:
                seq = transcript.sequence
            except ValueError:
                continue

            ref = str(seq[transcript_offset])
            if transcript.on_backward_strand:
                ref = reverse_complement(ref)

            alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]

            if insertions:
                nucleotide_pairs = [
                    x + y
                    for x in STANDARD_NUCLEOTIDES
                    for y in STANDARD_NUCLEOTIDES
                ]
                alt_nucleotides.extend(nucleotide_pairs)
            if deletions:
                alt_nucleotides.append("")
            alt = rng.choice(alt_nucleotides)
            variant = Variant(
                transcript.contig,
                base1_genomic_position,
                ref=ref,
                alt=alt,
                ensembl=ensembl)
            variants.append(variant)
        else:
            return VariantCollection(variants)
    raise ValueError(
        ("Unable to generate %d random variants, "
         "there may be a problem with PyEnsembl") % count)
Ejemplo n.º 32
0
def test_str_version():
    # String release numbers should be accepted just like ints.
    # Off-by-one fix: range() excludes its upper bound, so the original
    # never exercised MAX_ENSEMBL_RELEASE itself; include it explicitly.
    for version in range(54, MAX_ENSEMBL_RELEASE + 1):
        EnsemblRelease(str(version))
Ejemplo n.º 33
0
def random_variants(
        count,
        ensembl_release=MAX_ENSEMBL_RELEASE,
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.
    ensembl_release : int
        Ensembl release whose transcripts are sampled.
    deletions : bool
        Include single-base deletions (empty alt) among candidates.
    insertions : bool
        Include two-base insertions among candidates.
    random_seed : int, optional
        Seed for reproducible sampling.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    # transcript ID lists are expensive to fetch, so cache them per release
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    variants = []

    # NOTE(review): unlike the bounded sibling implementations, this loop
    # spins forever if PyEnsembl never yields a usable transcript
    while len(variants) < count:
        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)

        try:
            seq = transcript.sequence
        except ValueError as e:
            # logging.warn is a deprecated alias; use logging.warning
            logging.warning(e)
            # can't get sequence for non-coding transcripts
            continue

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]

        if insertions:
            nucleotide_pairs = [
                x + y
                for x in STANDARD_NUCLEOTIDES
                for y in STANDARD_NUCLEOTIDES
            ]
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)
        variant = Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl)
        variants.append(variant)
    return VariantCollection(variants)
Ejemplo n.º 34
0
        genes_20_saliva.union(mygenes)

len(mygenes)
print mygenes
print len(genes_20_blood.intersection(genes_20_saliva))
print len(genes_20_blood.union(genes_20_saliva))
print list(genes_20_blood) + list(genes_20_saliva)
from matplotlib_venn import venn2
venn2([genes_20_blood, genes_20_saliva], set_labels=('genes_20_blood', 'genes_20_saliva') )
plt.savefig("/mnt/xfs1/home/asalomatov/Blood_vs_Saliva_genes_lt_20.png")
plt.close()


### annotate with genes
from pyensembl import EnsemblRelease
data = EnsemblRelease(75)
data.gene_names_at_locus(contig=1, position=100000)
df_exome['gene_name'] = df_exome.apply(lambda x: data.gene_names_at_locus(contig=x['chr'], position=x['start']))
x = df_exome.apply(lambda x: data.gene_names_at_locus(contig=x['chr'], position=(x['start'] + x['end'])/2),
        axis=1)
x

df_exome.head()
df_exome.tail()
df_exome.shape
df_exome.bin.value_counts()
df_exome.isnull().sum()

my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")
Ejemplo n.º 35
0
# Python 2 notebook-style script: load the Blood-vs-Saliva sample sheet,
# derive family/protocol columns, then read per-sample exome coverage files.
genes_20_blood = set()
genes_20_saliva = set()
import pandas as pd
import numpy as np
import os
from pyensembl import EnsemblRelease
data = EnsemblRelease(75)

df_descr = pd.read_csv("/mnt/xfs1/scratch/asalomatov/BloodVsSaliva/BloodVsSaliva_Descr.csv") 
df_descr.head()
# strip stray whitespace from every column
# NOTE(review): str.strip on a non-string cell would raise -- presumably
# all columns in this sheet are strings; confirm
for c in df_descr.columns:
    df_descr[c] = df_descr[c].map(str.strip)

# fam_id looks like "<family>-<protocol>"; split it into two columns
df_descr['fam'] = df_descr['fam_id'].apply(lambda x: x.split("-")[0])
df_descr['protocol'] = df_descr['fam_id'].apply(lambda x: x.split("-")[1])

# read each sample's coverage file, skipping samples with no file on disk
for smpl in df_descr['smpl_id']:
    print smpl
   # fam = df_descr['fam_id'][df_descr.smpl_id.isin([smpl])].iloc[0]
   # rel = df_descr['relationship'][df_descr.smpl_id.isin([smpl])].iloc[0]
    prot = df_descr['protocol'][df_descr.smpl_id.isin([smpl])].iloc[0]
    print prot
   # desc = '_'.join([fam, rel])
   # print desc
    fname='/mnt/xfs1/scratch/asalomatov/BloodVsSaliva/output/'+smpl+'-D1.final-exome-cvrg.txt'
    if not os.path.isfile(fname):
        print fname
        continue
    df_exome = pd.read_csv(fname, sep="\t", header=None) 
    # drop the summary 'all' rows, keep per-interval records
    df_exome = df_exome[df_exome[0] != 'all']
    df_exome.columns = ['chr', 'start', 'end', 'bin', 'length', 'ex_length', 'perc']
Ejemplo n.º 36
0
def test_version_is_not_numeric():
    # A non-numeric release string should be rejected by EnsemblRelease.
    # NOTE(review): presumably this was decorated with @raises(...) in the
    # original suite (lost in extraction); without it nothing is asserted.
    EnsemblRelease("wuzzle")
Ejemplo n.º 37
0
# Preamble for a cluster experiment runner: imports, a module-level
# Ensembl database handle, and command-line argument parsing.
import os
import sys
import pickle
import matplotlib.pyplot as plt
import numpy as np
import h5py
import argparse
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from matplotlib.colors import Normalize
# make the project package importable when run from the repo root
sys.path.insert(0, os.getcwd())
# NOTE(review): wildcard import pulls unknown names into scope; prefer
# explicit imports if this file is ever cleaned up
from src.utils.helpers import *
import statsmodels.stats.multitest as smm
from gprofiler import GProfiler
from pyensembl import EnsemblRelease
# shared Ensembl release-77 handle used by the experiments below
data = EnsemblRelease(77)
import multiprocess as mp
from tqdm import tqdm
import time
from pebble import ProcessPool, ProcessExpired
from concurrent.futures import TimeoutError

GTEx_directory = '/hps/nobackup/research/stegle/users/willj/GTEx'

parser = argparse.ArgumentParser(description='Collection of experiments. Runs on the cluster.')
parser.add_argument('-g', '--group', help='Experiment group', required=True)
parser.add_argument('-n', '--name', help='Experiment name', required=True)
parser.add_argument('-p', '--params', help='Parameters')
args = vars(parser.parse_args())
group = args['group']
name = args['name']
Ejemplo n.º 38
0
def test_version_is_none():
    # Passing None as the release should be rejected by EnsemblRelease.
    # NOTE(review): presumably decorated with @raises(...) originally
    # (lost in extraction); without it nothing is asserted.
    EnsemblRelease(None)
Ejemplo n.º 39
0
	else:
		return int(chr_str) - 1

def find_chromosome(bp):
	"""Return the chromosome label containing absolute position `bp`.

	Walks the cumulative sums of CHROMOSOME_SIZES and maps the first
	chromosome whose cumulative end reaches `bp` through
	idx_to_chromosome.  Returns None when `bp` lies beyond the last
	chromosome (same as the original's implicit fall-through).
	"""
	cumulative = 0
	for idx, size in enumerate(CHROMOSOME_SIZES):
		cumulative += size
		if cumulative >= bp:
			# early return replaces the break + `idx != None` check
			return idx_to_chromosome(idx)
	return None

# release 76 uses human reference genome GRCh38
# module-level singleton so one database handle is shared by all handlers
ENSEMBL_DATA = EnsemblRelease(76)

def getMockAnnotations():
	"""Load and return the mock annotation payload from mockAnnotations.json."""
	# with-statement guarantees the handle is closed (original leaked it)
	with open("mockAnnotations.json", "r") as fh:
		return json.loads(fh.read())

def getMockData():
	"""Load and return the mock data payload from mockData.json."""
	# with-statement guarantees the handle is closed (original leaked it)
	with open("mockData.json", "r") as fh:
		return json.loads(fh.read())

# Flask app with permissive CORS so a frontend served from another
# origin can call these JSON endpoints
app = Flask(__name__)
CORS(app)

@app.route("/graphs")
def graphs():
	"""Return the available graph types as a JSON array."""
	available_graphs = ["ld_score"]
	return json.dumps(available_graphs)

@app.route("/track_info")
Ejemplo n.º 40
0
def test_version_too_old_1():
    # Release 1 is far below the supported range and should be rejected.
    # NOTE(review): presumably decorated with @raises(ValueError) in the
    # original suite (lost in extraction); without it nothing is asserted.
    EnsemblRelease(1)
Ejemplo n.º 41
0
 def __init__(self, query, hg_version):
     """Store a cleaned genomic query and resolve a pyensembl release for it.

     query: genomic position/region string; any "chr" prefix is stripped.
     hg_version: genome-build key looked up in ScrapeEnsembl.genome to get
         an Ensembl release number.
     """
     self.query = query.replace("chr","")
     # NOTE(review): dict.get returns None for an unknown build, which would
     # make EnsemblRelease(None) fail below -- confirm inputs are validated
     self.hg_version = ScrapeEnsembl.genome.get(hg_version) # build name -> release number
     self.hg = EnsemblRelease(self.hg_version) # release number -> pyensembl database object
Ejemplo n.º 42
0
#         http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pyensembl import EnsemblRelease
from varcode import (ExonicSpliceSite, Substitution, Variant,
                     TranscriptMutationEffect)
import pandas as pd

from . import data_path

ensembl = EnsemblRelease(75)


def validate_transcript_mutation(ensembl_transcript_id, chrom, dna_position,
                                 dna_ref, dna_alt, aa_pos, aa_alt):
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)
    effects = variant.effects()
    transcript_id_dict = {
        effect.transcript.id: effect
        for effect in effects if isinstance(effect, TranscriptMutationEffect)
    }
    assert ensembl_transcript_id in transcript_id_dict, \
        "%s not found in %s" % (ensembl_transcript_id, transcript_id_dict)
    effect = transcript_id_dict[ensembl_transcript_id]

    if isinstance(effect, ExonicSpliceSite):
Ejemplo n.º 43
0
 def __init__(self):
     """Open a pyensembl database handle for Ensembl release 75 (GRCh37)."""
     self.db = EnsemblRelease(75)