def create_align_db():
    """Align mouse proteins to their human homologs and pickle the result.

    Reads the mouse->human gene homology table from ``mart_export.txt`` and
    the somatic mutation table from a hard-coded CSV path. For every mutated
    mouse transcript whose gene has a human homolog, the human transcript
    with the longest protein sequence is aligned against the mouse protein
    with ``align``. Alignments are keyed by
    ``"<human transcript id>-<mouse transcript id>"`` and written to
    ``alignments.p`` in the current working directory.

    Returns:
        None. The only result is the pickle file.
    """
    alignments = dict()

    human_db = pyensembl.EnsemblRelease(86, 'human')
    mouse_db = pyensembl.EnsemblRelease(86, 'mouse')

    homology = pandas.read_csv("mart_export.txt")
    homology = homology[~pandas.isnull(homology['Mouse gene stable ID'])]
    homologs = dict(
        zip(homology['Mouse gene stable ID'], homology['Gene stable ID']))

    mutations = pandas.read_csv(
        "/Users/charlesmurphy/Desktop/Research/0914_hui/results/Mutations/Varscan-paired/somatic.nodbsnp.varscan.filtered.primaryControl.csv"
    )

    for i in mutations.index:
        # get the canonical mouse transcript (drop the ".<version>" suffix)
        mouse_transcript_id = mutations.loc[i, 'TRANSCRIPT'].split('.')[0]
        mouse_transcript = mouse_db.transcript_by_id(mouse_transcript_id)

        # determine if there is a human homolog
        if mouse_transcript.gene_id not in homologs:
            continue
        human_gene = human_db.gene_by_id(homologs[mouse_transcript.gene_id])

        # get longest CDS transcript
        max_length = 0
        human_canonical_transcript = None
        for human_transcript in human_gene.transcripts:
            protein = human_transcript.protein_sequence
            if protein is not None and len(protein) > max_length:
                human_canonical_transcript = human_transcript
                max_length = len(protein)

        # just skip ENST00000589042 because it is really long, and I know
        # none of mouse tumors have mutations in it
        if (human_canonical_transcript is None
                or human_canonical_transcript.id == 'ENST00000589042'):
            continue

        key = human_canonical_transcript.id + '-' + mouse_transcript.id
        if key in alignments:
            continue

        mouse_protein = mouse_transcript.protein_sequence
        if mouse_protein is None:
            # A mouse transcript without a protein cannot be aligned;
            # previously this crashed on len(None).
            continue

        # align the protein sequences
        human_protein = human_canonical_transcript.protein_sequence
        # A single pre-formatted string prints identically on Python 2 and 3.
        print("{} {} {}".format(key, len(human_protein), len(mouse_protein)))
        alignments[key] = align(human_canonical_transcript.id, human_protein,
                                mouse_transcript.id, mouse_protein)

    # Close the handle deterministically so the pickle is fully flushed.
    with open('alignments.p', 'wb') as handle:
        pickle.dump(alignments, handle)
def set_genome(params):
    """Pick the Ensembl release for ``params['genome']`` and grey out its colour.

    ``"GRCh38"`` maps to human release 84 and ``"GRCh37"`` to human release
    75. Any other value is treated as mouse release 84, in which case the
    ``gene5prime`` and ``gene3prime`` entries are also capitalized.

    Args:
        params: Mutable mapping with a ``'genome'`` key (and ``'gene5prime'``
            / ``'gene3prime'`` for the mouse branch). It is modified in place.

    Returns:
        Tuple ``(params, pyensembl_data)``: the same mapping and the
        ``pyensembl.EnsemblRelease`` for the chosen genome.
    """
    genome = params['genome']
    if genome == "GRCh38":
        release, species, color_key = 84, 'human', 'grch38_color'
    elif genome == "GRCh37":
        release, species, color_key = 75, 'human', 'grch37_color'
    else:
        release, species, color_key = 84, 'mouse', 'grcm38_color'

    pyensembl_data = pyensembl.EnsemblRelease(release, species)

    if species == 'mouse':
        for partner in ('gene5prime', 'gene3prime'):
            params[partner] = params[partner].capitalize()
    params[color_key] = "lightgrey"

    return params, pyensembl_data
def process_data(species, release, genome, agfusion):
    """Run the database-population steps for one species/release.

    Only the FASTA upload is currently active; the gene-synonym and
    gene-data steps are commented out.

    Args:
        species: Species name passed to ``pyensembl.EnsemblRelease``.
        release: Ensembl release number.
        genome: Genome identifier forwarded to ``upload_fasta``.
        agfusion: Path of the SQLite database file to open.
    """
    # pyens_db and the cursor are only consumed by the disabled steps below;
    # they are kept so those steps can be re-enabled without other changes.
    pyens_db = pyensembl.EnsemblRelease(release, species)
    db = sqlite3.Connection(agfusion)
    try:
        c = db.cursor()

        # process_gene_synonym(species, release, pyens_db, c)
        # process_gene_data(species, release, pyens_db, c)
        upload_fasta(species, genome, release)
    finally:
        # The connection used to be left open until garbage collection.
        db.close()
def isExonic(ensemblRelease, chrom, pos):
    """Report whether a genomic position falls inside any annotated exon.

    Args:
        ensemblRelease: Ensembl release number to query.
        chrom: Contig as an int or string. Numeric values are passed on as
            ints. Non-numeric names such as ``'X'``, ``'MT'`` or ``'chr7'``
            are passed as strings with a leading ``chr`` removed.
        pos: Position on the contig; converted with ``int``.

    Returns:
        ``True`` if at least one exon overlaps the position, ``False`` if
        none does, or ``None`` if the lookup raised (the error is logged).
    """
    ensembl = pyensembl.EnsemblRelease(release=ensemblRelease)
    try:
        try:
            contig = int(chrom)
        except ValueError:
            # int() used to reject sex/mitochondrial and 'chr'-prefixed
            # contigs, so those loci could never be reported as exonic.
            contig = chrom[3:] if chrom.lower().startswith('chr') else chrom
        exons = ensembl.exons_at_locus(contig=contig, position=int(pos))
    except Exception as e:
        logger.error('exception: ' + str(e))
        return None
    return len(exons) > 0
def gene_to_interval(gene, release=75):
    """Look up a gene by name and return its genomic interval as strings.

    Args:
        gene: Gene name to search for.
        release: Ensembl release number (default 75).

    Returns:
        Tuple ``(contig, start, end, name)`` of strings for the first gene
        returned by ``genes_by_name``, or ``None`` if the lookup failed. A
        failure is reported on stderr.
    """
    data = pyensembl.EnsemblRelease(release)
    try:
        ret = data.genes_by_name(gene)[0]
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate. format() keeps the message text unchanged for string
        # names and no longer raises when `gene` is not a string.
        sys.stderr.write("Gene not found:  {} \n".format(gene))
        return None
    return (str(ret.contig), str(ret.start), str(ret.end), str(ret.name))
def load_ensembl_data(release, species):
    """Load the ensembl data so we can convert gene_ids into gene_names
    for more intuitive output.

    Args:
        release: Ensembl release number.
        species: Species name understood by pyensembl.

    Returns:
        The ``pyensembl.EnsemblRelease`` for the requested release/species.

    Exits the process with status 1, after printing installation hints, if
    constructing the release raises ``ValueError``.
    """
    try:
        ensembl_data = pyensembl.EnsemblRelease(release=release,
                                                species=species)
    except ValueError as error_details:
        # print() with one argument behaves the same on Python 2 and 3.
        print(error_details)
        # Adjacent literals replace the old backslash continuation, which
        # embedded the source indentation in the printed message.
        print("Make sure you have the pyensembl library and annotation "
              "release you want to use downloaded and properly installed. "
              "e.g.:")
        print("$ pip install pyensembl")
        print("$ pyensembl install --release 85 --species mouse")
        sys.exit(1)
    return ensembl_data
def getGenesForVariant(variant, ensemblRelease, geneOfInterest):
    """Check whether the gene of interest is annotated at a variant's locus.

    Args:
        variant: Indexable whose item 0 is the contig and item 1 the
            position. If the contig is a ``str`` containing ``'chr'``, the
            text after the first ``'chr'`` is used.
        ensemblRelease: Ensembl release number to query.
        geneOfInterest: Gene name to look for.

    Returns:
        A 1-tuple holding the set ``{geneOfInterest}`` when that gene is
        among the gene names at the locus, otherwise ``None``. ``None`` is
        also returned, after logging, if the lookup raises.
    """
    release = pyensembl.EnsemblRelease(release=ensemblRelease)

    contig = variant[0]
    if type(contig) is str and 'chr' in contig:
        contig = contig.split('chr')[1]
    position = variant[1]

    try:
        names_at_locus = release.gene_names_at_locus(contig=contig,
                                                     position=int(position))
        matches = {geneOfInterest} & set(names_at_locus)
        # Note: the set itself is wrapped in a tuple; it is not unpacked.
        return (matches,) if len(matches) == 1 else None
    except Exception as err:
        logger.error('exception: ' + str(err))
        return None
def _get_condition_filenames(config, name, is_single_sample, is_unique,
                             note_str, fraction, reweighting_iterations):
    """Return (bayes_factors, predicted_orfs) file paths for one condition.

    Raises FileNotFoundError if either file does not exist.
    """
    lengths = None
    offsets = None
    if is_single_sample:
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, name, is_unique=is_unique)

    bayes_factors = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'], name, length=lengths, offset=offsets,
        is_unique=is_unique, note=note_str, fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(name, bayes_factors))
        raise FileNotFoundError(msg)

    predicted_orfs = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'], name, length=lengths, offset=offsets,
        is_unique=is_unique, note=note_str, fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True, is_chisq=False)

    if not os.path.exists(predicted_orfs):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(name, predicted_orfs))
        raise FileNotFoundError(msg)

    return bayes_factors, predicted_orfs


def main():
    """Extract the differential micropeptides between two conditions.

    Parses the command line, locates the Bayes factor and predicted-ORF
    files of conditions A and B from the YAML config, and builds a table of
    micropeptides that are unique to one condition or overlap ORFs in the
    other. The table is annotated with read counts, Ensembl gene/biotype
    information, mygene.info annotations and optional --id-matches /
    --overlaps columns, optionally filtered, and written to ``out``.

    Raises:
        ValueError: If the numbers of files and names for --id-matches or
            --overlaps differ.
        FileNotFoundError: If a required input file is missing.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species."
    )

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument(
        '-a', '--append-sheet', help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)",
        action='store_true')

    parser.add_argument(
        '-f', '--filter', help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument(
        '--read-filter-percent', help="If the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.",
        type=float, default=default_read_filter_percent)

    parser.add_argument(
        '--kl-filter-percent', help="If the --filter flag "
        "is given, then only the top --kl-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.",
        type=float, default=default_kl_filter_percent)

    parser.add_argument(
        '--id-matches', help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.",
        nargs='*', default=default_id_matches)

    parser.add_argument(
        '--id-match-names', help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.",
        nargs='*', default=default_id_match_names)

    parser.add_argument(
        '--overlaps', help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.",
        nargs='*', default=default_overlaps)

    parser.add_argument(
        '--overlap-names', help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.",
        nargs='*', default=default_overlap_names)

    parser.add_argument(
        '-r', '--ensembl-release', help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int, default=default_ensembl_release)

    parser.add_argument(
        '-s', '--ensembl-species', help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument(
        '--a-is-single-sample', help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument(
        '--b-is-single-sample', help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument('--fields-to-keep', help="The fields to keep from the "
                        "Bayes factor file for each condition",
                        nargs='*', default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len', help="The maximum (inclusive) "
                        "length of ORFs considered as micropeptides",
                        type=int, default=default_max_micropeptide_len)

    parser.add_argument(
        '--do-not-fix-tcons', help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this script "
        "update the identifiers so that will parse correctly unless instructed not "
        "to. The script is likely to crash if the identifiers are not fixed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    # Access the database attribute up front; presumably this surfaces a
    # missing annotation install before the expensive work starts.
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
               "match. {} files and {} names".format(
                   len(args.id_matches), len(args.id_match_names)))
        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlap-names do not "
               "match. {} files and {} names".format(
                   len(args.overlaps), len(args.overlap_names)))
        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent, 0, 1,
                               variable_name="--read-filter-percent")
        math_utils.check_range(args.kl_filter_percent, 0, 1,
                               variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    # safe_load needs no explicit Loader and cannot construct arbitrary
    # Python objects; the with-block closes the config file.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    bayes_factors_a, predicted_orfs_a = _get_condition_filenames(
        config, args.name_a, args.a_is_single_sample, is_unique, note_str,
        fraction, reweighting_iterations)

    bayes_factors_b, predicted_orfs_b = _get_condition_filenames(
        config, args.name_b, args.b_is_single_sample, is_unique, note_str,
        fraction, reweighting_iterations)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(
            exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)
    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)
    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)
    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
                                                        bed_df_b, exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
                                                        bed_df_a, exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(
        micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
                                               'micro_a_long_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
                                               'long_a_micro_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
                                              left_on='A', right_on='id',
                                              how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep], left_on='B', right_on='id',
                    how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    # The release object created at start-up is reused here; it used to be
    # constructed a second time with identical arguments.
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype,
                                          'A', ensembl,
                                          ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype,
                                          'B', ensembl,
                                          ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a, left_on='gene_id_A', right_on='gene_id',
                    how='left')
    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b, left_on='gene_id_B', right_on='gene_id',
                    how='left')
    # The B-side rename must use the B-side columns; it used gene_info_a's.
    to_rename = {f: "{}_B".format(f) for f in gene_info_b.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min', na_option='top',
                                            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense', na_option='top',
                                  ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    # --append-sheet is a store_true flag, so it is False (never None) when
    # absent. The old "is None" test made the plain-write branch unreachable.
    if not args.append_sheet:
        utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
def main():
    """
    Main function for processing command line options

    Defines the ``annotate``, ``batch``, ``download`` and ``build``
    sub-commands. ``build`` and ``download`` are dispatched immediately and
    the process exits. For ``annotate`` and ``batch`` the output directory is
    created if missing, the species and release are parsed from the database
    file name (second and third dot-separated fields), the pyensembl data is
    loaded, the --rename / --recolor pairs are parsed, and the fusion(s) are
    annotated.

    The process exits with a non-zero status when the pyensembl data is
    missing or the image type is not one of png, pdf or jpeg.
    """
    import sys  # local import: exit codes below need sys.exit

    parser = argparse.ArgumentParser(
        description='Annotate Gene Fusion (AGFusion)')
    subparsers = parser.add_subparsers(help='AGFusion programs.',
                                       dest="subparser_name")

    annotate_parser = subparsers.add_parser(
        'annotate', help='Annotate and visualize a single fusion.')
    annotate_parser.add_argument('-g5', '--gene5prime', type=str,
                                 required=True, help='5\' gene partner')
    annotate_parser.add_argument('-g3', '--gene3prime', type=str,
                                 required=True, help='3\' gene partner')
    annotate_parser.add_argument(
        '-j5', '--junction5prime', type=int, required=True,
        help='Genomic location of predicted fusion for the 5\' gene partner. ' +
        'The 1-based position that is the last nucleotide included in ' +
        'the fusion before the junction.')
    annotate_parser.add_argument(
        '-j3', '--junction3prime', type=int, required=True,
        help='Genomic location of predicted fusion for the 3\' gene partner. ' +
        'The 1-based position that is the first nucleotide included in ' +
        'the fusion after the junction.')
    add_common_flags(annotate_parser)
    annotate_parser.add_argument(
        '--scale', type=int, required=False, default=-1,
        help='(Optional) Set maximum width (in amino acids) of the ' +
        'figure to rescale the fusion (default: max length of ' +
        'fusion product)')

    # batch file parser
    batch_parser = subparsers.add_parser(
        'batch',
        help='Annotate fusions from an output file from a fusion ' +
        'finding algorithm.')
    batch_parser.add_argument(
        '-f', '--file', type=str, required=True,
        help='Output file from fusion-finding algorithm.')
    batch_parser.add_argument(
        '-a', '--algorithm', type=str, required=True,
        help='The fusion-finding algorithm. Can be one of the following: ' +
        ', '.join(agfusion.parsers.keys()) + '.')
    add_common_flags(batch_parser)

    # download database
    database_parser = subparsers.add_parser(
        'download', help='Download database for a reference genome.')
    database_parser.add_argument(
        '-d', '--dir', type=str, default='',
        help='(Optional) Directory to the database will be downloaded ' +
        'to (defaults to current working directory).')
    database_parser.add_argument(
        '-g', '--genome', type=str, default=None,
        help='Specify the genome shortcut (e.g. hg19). To see all ' +
        'available shortcuts run \'agfusion download -a\'. Either ' +
        'specify this or --species and --release.')
    database_parser.add_argument('-s', '--species', type=str, default=None,
                                 help='The species (e.g. homo_sapiens).')
    database_parser.add_argument('-r', '--release', type=int, default=None,
                                 help='The ensembl release (e.g. 87).')
    database_parser.add_argument(
        '-a', '--available', action='store_true', required=False,
        help='List available species and ensembl releases.')

    # build database parser
    build_database_parser = subparsers.add_parser(
        'build', help='Build database for a reference genome.')
    build_database_parser.add_argument(
        '-d', '--dir', type=str, required=True,
        help='Directory to write database file to.')
    build_database_parser.add_argument('-s', '--species', type=str,
                                       required=True,
                                       help='The species (e.g. homo_sapiens).')
    build_database_parser.add_argument('-r', '--release', type=int,
                                       required=True,
                                       help='The ensembl release (e.g. 87).')
    build_database_parser.add_argument(
        '--pfam', type=str, required=True,
        help='File containing PFAM ID mappings.')
    build_database_parser.add_argument(
        '--server', type=str, required=False,
        default='ensembldb.ensembl.org',
        help='(optional) Ensembl server (default ensembldb.ensembl.org)')

    # agfusion version number
    parser.add_argument('-v', '--version', action='version',
                        version=agfusion.__version__)

    args = parser.parse_args()

    if args.subparser_name is None:
        # Python 3 does not require a sub-command; without this guard the
        # code below failed with AttributeError on args.out.
        parser.error('no AGFusion program specified')

    if args.subparser_name == 'build':
        builddb(args)
        sys.exit()
    elif args.subparser_name == 'download':
        if args.available:
            list_available_databases()
        else:
            downloaddb(args)
        sys.exit()

    # single or batch mode

    if not exists(args.out):
        mkdir(args.out)

    # The species and release are encoded in the database file name as its
    # second and third dot-separated fields.
    db_file = split(args.database)[1]
    species = db_file.split('.')[1]
    release = db_file.split('.')[2]

    assert species in AVAILABLE_ENSEMBL_SPECIES, 'unsupported species!'

    agfusion_db = agfusion.AGFusionDB(args.database, debug=args.debug)
    agfusion_db.build = species + '_' + str(release)

    # get the pyensembl data
    pyensembl_data = pyensembl.EnsemblRelease(release, species)

    try:
        pyensembl_data.db
    except ValueError:
        agfusion_db.logger.error(
            "Missing pyensembl data. Run pyensembl install --release " +
            "{} --species {}".format(release, species))
        sys.exit(1)  # was exit(): reported success to the shell on failure

    # parse the re-coloring and re-naming
    colors = {}
    rename = {}

    if args.rename is not None:
        for i in args.rename:
            pair = i.split(';')
            assert len(pair) == 2, " did not properly specify --rename"
            if pair[0] in rename:
                # Logger.warn is a deprecated alias of Logger.warning.
                agfusion_db.logger.warning(
                    "WARNING - you rename {} twice.".format(pair[0]))
            rename[pair[0]] = pair[1]

    if args.recolor is not None:
        for i in args.recolor:
            pair = i.split(';')
            assert len(pair) == 2, " did not properly specify --colors"
            if pair[0] in colors:
                agfusion_db.logger.warning(
                    "You specified colors for {} twice.".format(pair[0]))
            # Colours are stored under the renamed label when one exists.
            if pair[0] in rename:
                colors[rename[pair[0]]] = pair[1]
            else:
                colors[pair[0]] = pair[1]

    # check image file type is valid
    if args.type not in ['png', 'pdf', 'jpeg']:
        agfusion_db.logger.error(
            "ERROR - provided an incorrect image file type: {}.".format(
                args.type))
        sys.exit(1)  # was exit(): reported success to the shell on failure

    if args.subparser_name == 'annotate':
        annotate(gene5prime=args.gene5prime,
                 junction5prime=args.junction5prime,
                 gene3prime=args.gene3prime,
                 junction3prime=args.junction3prime,
                 agfusion_db=agfusion_db,
                 pyensembl_data=pyensembl_data,
                 args=args,
                 outdir=args.out,
                 colors=colors,
                 rename=rename,
                 scale=args.scale)
    elif args.subparser_name == 'batch':
        batch_mode(args, agfusion_db, pyensembl_data, rename, colors)
def get_seq_aminoacid(cfg, din):
    """
    Fetches sequences if mutation format is amino acid

    For every row of ``din`` the transcript is looked up in Ensembl and the
    genomic positions of the codon at ``'aminoacid: position'`` are found.
    A BED record spanning the codon plus flanks is written, the flanking
    sequence is extracted with ``bedtools getfasta``, and the joined table is
    written to ``cfg['dsequencesp']``. Transcripts that could not be used are
    recorded in ``<bed path>.err.json``.

    :param cfg: configuration dict
    :param din: input data; its index is reset to 0..n-1 in place
    :returns: None; results are written to disk. If no usable transcript is
        found an empty table is saved instead.
    """
    import pyensembl
    #import ensembl object that would fetch genes
    # ensembl = pyensembl.EnsemblRelease(release=cfg['genomerelease'])
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(
            latin_name=cfg['host'],
            synonyms=[cfg['host']],
            reference_assemblies={
                cfg['genomeassembly']: (cfg['genomerelease'],
                                        cfg['genomerelease']),
            }),
        release=cfg['genomerelease'])

    din.index = range(len(din))
    dbedp = '{}/dbedflank.bed'.format(cfg['datad'])
    dbed = pd.DataFrame(columns=bed_colns)
    terrpositions = []
    terrnotfound = []
    terrnoncoding = []
    bedrowi = 0

    # Build the transcript-id set once; the loop used to rebuild the id list
    # and scan it linearly for every input row.
    known_transcript_ids = set(ensembl.transcript_ids())

    # for i in trange(len(din)-1,desc='get positions for bedtools'):
    for i in din.index:
        if din.loc[i, 'transcript: id'] in known_transcript_ids:
            t = ensembl.transcript_by_id(din.loc[i, 'transcript: id'])
            if t.is_protein_coding and t.contains_start_codon and t.contains_stop_codon:
                coding_sequence_positions = tboundaries2positions(t)
                if len(coding_sequence_positions) == len(t.coding_sequence):
                    #TODO need to check if the seq made from coding_sequence_positions is same as t.coding_seqeunce
                    dcoding = t2pmapper(t, coding_sequence_positions)
                    dcodingmutpos = dcoding.loc[(
                        dcoding['protein index'] == din.loc[
                            i, 'aminoacid: position']), :]
                    codon_positions = dcodingmutpos[
                        'coding sequence positions'].tolist()
                    if len(codon_positions) != 0:
                        dbed.loc[bedrowi, 'chromosome'] = t.contig
                        if cfg['test']:
                            print(din.loc[i, 'transcript: id'],
                                  codon_positions)
                        # On the minus strand the codon's positions are
                        # swapped so that start <= end.
                        if t.strand == '+':
                            dbed.loc[bedrowi, 'codon start'] = codon_positions[0]
                            dbed.loc[bedrowi, 'codon end'] = codon_positions[2]
                        elif t.strand == '-':
                            dbed.loc[bedrowi, 'codon start'] = codon_positions[2]
                            dbed.loc[bedrowi, 'codon end'] = codon_positions[0]
                        dbed.loc[bedrowi, 'start'] = dbed.loc[
                            bedrowi, 'codon start'] - 22  #FIXME put flank in the yml
                        dbed.loc[bedrowi, 'end'] = dbed.loc[
                            bedrowi, 'codon end'] + 21  #FIXME put flank in the yml
                        dbed.loc[bedrowi, 'reference residue'] = dcodingmutpos[
                            'protein sequence'].tolist()[0]
                        dbed.loc[bedrowi, 'reference codon'] = ''.join(
                            dcodingmutpos['coding sequence'].tolist())
                        dbed.loc[bedrowi, 'strand'] = t.strand
                        dbed.loc[bedrowi, 'id'] = '{}|{}|{}|{}|{}'.format(
                            din.loc[i, 'transcript: id'],
                            dbed.loc[bedrowi, 'chromosome'],
                            dbed.loc[bedrowi, 'strand'],
                            int(dbed.loc[bedrowi, 'start']),
                            int(dbed.loc[bedrowi, 'end']))
                        dbed.loc[bedrowi, 'gene: id'] = t.gene_id
                        dbed.loc[bedrowi, 'gene: name'] = t.gene.name
                        dbed.loc[bedrowi, 'protein: id'] = t.protein_id
                        dbed.loc[bedrowi, 'aminoacid: position'] = din.loc[
                            i, 'aminoacid: position']
                        bedrowi += 1
                    else:
                        terrpositions.append(t.id)
                else:
                    terrpositions.append(t.id)
            else:
                terrnoncoding.append(t.id)
        else:
            terrnotfound.append(din.loc[i, 'transcript: id'])
            if cfg['test']:
                logging.error('not found: {}'.format(
                    din.loc[i, 'transcript: id']))

    if len(dbed) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no valid seqeunces found; saving an empty table.')
        saveemptytable(cfg, cfg['dsequencesp'])
        return None

    # Keep only records whose window is exactly 45 bases wide.
    dbed = dbed.loc[(dbed.apply(lambda x: x['end'] - x['start'] == 45,
                                axis=1)), :]  #FIXME put flank in the yml
    dbed.loc[:, 'start'] = dbed.loc[:, 'start'].astype(int)
    dbed.loc[:, 'end'] = dbed.loc[:, 'end'].astype(int)

    dbed = dbed.drop_duplicates(subset=bed_colns)
    dbed.loc[:, bed_colns].to_csv(dbedp, sep='\t', header=False, index=False)

    err2tids = {
        'terrpositions': terrpositions,
        'terrnotfound': terrnotfound,
        'terrnoncoding': terrnoncoding,
    }
    if cfg['test']:
        print(err2tids)
    with open(dbedp + '.err.json', 'w') as outfile:
        json.dump(err2tids, outfile)

    # dbedp is the BED file written above; it was previously rebuilt here
    # from the same components under a second name.
    fastap = f"{cfg['datad']}/dbedflank.fa"
    cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {dbedp} -fo {fastap}"
    runbashcmd(cmd)

    dflankfa = fa2df(fastap, ids2cols=True)
    dflankfa.loc[:, 'sequence'] = dflankfa.loc[:, 'sequence'].apply(
        lambda x: x.upper())
    dflankfa.loc[:, 'sequence: length'] = [
        len(s) for s in dflankfa['sequence']
    ]
    # Drop everything from the first '(' in each FASTA id so it can be
    # joined against the BED ids.
    dflankfa.index = [idx.split('(')[0] for idx in dflankfa.index]
    dflankfa.index.name = 'id'
    dseq = set_index(dbed, 'id').join(set_index(dflankfa, 'id'),
                                      rsuffix='.1')

    # output column name -> column name in the joined table
    dseq2compatible = {
        'aminoacid: position': 'aminoacid: position',
        'gene: id': 'gene: id',
        'gene: name': 'gene: name',
        'protein: id': 'protein: id',
        'transcript: id': 'seqid',
        'transcript: sequence': 'sequence',
        'aminoacid: wild-type': 'reference residue',
        'codon: wild-type': 'reference codon',
        'contig': 'contig',
        'strand': 'strand',
        'start': 'start',
        'end': 'end',
        'codon start': 'codon start',
        'codon end': 'codon end',
    }
    if 'amino acid mutation' in dseq:
        dseq2compatible['amino acid mutation'] = 'amino acid mutation'
    dseq.to_csv(cfg['dseqtmpp'], sep='\t')

    dseq = dseq[list(dseq2compatible.values())]
    dseq.columns = list(dseq2compatible.keys())
    # dseq.to_csv('data/dseq.csv')

    logging.info(dseq.columns.tolist())
    logging.info(din.columns.tolist())
    dseq = pd.merge(dseq.reset_index(), din,
                    on=['transcript: id', 'aminoacid: position'])
    logging.info(dseq.columns.tolist())
    # NOTE(review): the return value is discarded, as in the original, so the
    # written table keeps the default index - confirm whether
    # `dseq = set_index(dseq, 'id')` was intended.
    set_index(dseq, 'id')

    if 'reverse_mutations' in cfg:
        if cfg['reverse_mutations']:
            from beditor.lib.io_dfs import dfswapcols
            dseq = dfswapcols(dseq,
                              ['aminoacid: wild-type', 'amino acid mutation'])
            # NOTE(review): assumed to belong to the reverse-mutation branch;
            # the flattened source does not show its nesting - confirm.
            dseq['codon: mutation'] = dseq['codon: wild-type'].copy()

    dseq.to_csv(f"{cfg['dsequencesp']}", sep='\t')
def _load_cna_regions(db, outdir, directory, direction, cna_label):
    """Load one direction ('up'/'down') of results, or None if its gene list is unreadable."""
    prefix = './' + outdir + '/' + directory
    try:
        genes = pandas.read_table(prefix + '_' + direction + '.txt',
                                  header=None)
    except Exception:
        # A missing or empty gene list means this comparison is skipped.
        return None

    symbols = []
    for ii in genes[0].tolist():
        try:
            symbols.append(db.gene_by_id(ii).gene_name)
        except ValueError:
            # Fall back to the raw identifier when the lookup fails.
            symbols.append(ii)
    genes['symbols'] = symbols
    genes.index = genes[0]

    mapping, raw, bed = parse_result_file(prefix + '_results_' + direction +
                                          '.txt')
    raw = get_overlapping_genes(raw, mapping, genes)
    # .copy() so the new column is set on an independent frame, not on a
    # view of `raw` (avoids pandas' SettingWithCopyWarning).
    regions = raw[raw['total_genes_in_region'].astype(int) > 3].copy()
    regions['cna'] = cna_label
    return regions


def process_results(comparisons, comparisons_params):
    """Combine up/down region results of each comparison into one Excel file.

    For every directory in ``comparisons`` the up-regulated
    (``'amplification'``) and down-regulated (``'deletion'``) result files
    are read from
    ``./<directory>/pval<pvalue>_padj<padj>_log2FC<log2>/``. Regions with
    more than three genes are kept, and the combined table, sorted by
    ``padj``, is written to ``<directory>_results.xlsx`` in that folder. A
    comparison is skipped if either of its gene lists cannot be read.

    Args:
        comparisons: Iterable of comparison directory names.
        comparisons_params: Mapping from directory name to a mapping with
            ``'pvalue'``, ``'padj'`` and ``'log2'`` entries.
    """
    db = pyensembl.EnsemblRelease('75', 'human')

    for directory in comparisons:
        params = comparisons_params[directory]
        outdir = (directory + '/pval' + str(params['pvalue']) + '_padj' +
                  str(params['padj']) + '_log2FC' + str(params['log2']))
        print(outdir)

        raw_up = _load_cna_regions(db, outdir, directory, 'up',
                                   'amplification')
        if raw_up is None:
            continue

        raw_down = _load_cna_regions(db, outdir, directory, 'down',
                                     'deletion')
        if raw_down is None:
            continue

        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        raw = pandas.concat([raw_down, raw_up])
        raw = raw[[
            'chrom', 'start', 'end', 'pvalue', 'padj', 'cna',
            'num_differentially_expressed_genes', 'total_genes_in_region',
            'differentially_expressed_genes'
        ]]
        raw = raw.sort_values('padj')
        raw.to_excel('./' + outdir + '/' + directory + '_results.xlsx',
                     index=False)
def main():
    """Read association results and draw the QQ and Manhattan plots.

    Input rows come from ``split_line`` over the files in ``args.input``
    and are grouped by the chromosome column. Rows whose minor allele
    frequency is below ``args.threshold_maf`` are skipped. For every
    remaining row the -log10 p-value, a genome-wide x coordinate and a
    colour are collected, and one annotation (the lowest p-value with an
    rsID) is kept per cluster of SNPs on the same chromosome lying within
    1,000,000 bp of the cluster's current annotation.

    ``args.dbSNP`` is either the path of a dbSNP file (rsIDs are then
    resolved with ``parse_dbSNP``) or the index of the input column that
    holds the rsID.
    """
    args = argparser()
    os.environ['PYENSEMBL_CACHE_DIR'] = args.pyensembl
    ensembl_object = pyensembl.EnsemblRelease(release=75)

    ## If dbSNP argument is not a file, then it's a column of rsIDs.
    ## Decide this once, while args.dbSNP is still a string: calling
    ## os.path.isfile() on the converted int would test an open file
    ## descriptor of that number instead of a path.
    dbsnp_is_file = os.path.isfile(args.dbSNP)
    if not dbsnp_is_file:
        args.dbSNP = int(args.dbSNP)

    ## http://colorbrewer2.org/#type=qualitative&scheme=Paired&n=4
    cycle_colors = itertools.cycle(
        (('#a6cee3', '#1f78b4'), ('#b2df8a', '#33a02c')))

    pos_init_chrom = 0
    pos_prev = 0

    l_x = []
    l_y = []
    l_c = []
    l_prob = []
    x_ticks = []
    ## Gene annotations at local maxima.
    annotations = []
    y_max = 0
    d_pos_init_chrom = {}
    ## Last -log10(p) seen; initialised so that the p == 0 fallback below
    ## is defined even when the very first row has p == 0.
    y = 0.0

    fi = fileinput.FileInput(
        files=args.input, openhook=fileinput.hook_compressed)
    try:
        for chrom, split_lines in itertools.groupby(
                split_line(fi), operator.itemgetter(args.chrom)):
            print('Looping over chromosome', chrom, file=sys.stderr)
            ## NOTE(review): pos_prev holds the position of the row before
            ## the last accepted one, so the offset is slightly shorter
            ## than the previous chromosome - confirm this is intended.
            pos_init_chrom += pos_prev + 1500000  # todo: make argument
            d_pos_init_chrom[chrom] = pos_init_chrom
            colors = next(cycle_colors)
            x_ticks.append((None, None))
            pos = 0
            for fields in split_lines:
                af = float(fields[args.af])
                if min(af, 1 - af) < args.threshold_maf:
                    continue
                pos, pos_prev = int(fields[args.pos]), pos
                ## Assert that input is sorted.
                assert pos >= pos_prev, (pos, pos_prev)
                prob = float(fields[args.prob])
                ref = fields[args.ref]
                alt = fields[args.alt]
                x = pos + pos_init_chrom
                try:
                    y = -math.log10(prob)
                except ValueError:
                    ## GEMMA seems to print a p_lrt of 0.000000e+00, when
                    ## the numbers get too small.
                    ## e.g. bilirubin 2:234664586
                    assert fields[args.prob] == '0.000000e+00'
                    ## Plot it just above the previously plotted point.
                    y = y + 0.01
                y_max = max(y, y_max)
                l_x.append(x)
                l_y.append(y)
                l_prob.append(prob)
                ## Place a chromosome tick halfway through the chromosome
                ## basepair range.
                x_ticks[-1] = (pos_init_chrom + pos / 2, chrom)
                if prob > args.threshold_p:
                    l_c.append(colors[0])
                else:
                    l_c.append(colors[1])

                gene_names = ensembl_object.gene_names_at_locus(chrom, pos)
                gene_ids = ensembl_object.gene_ids_at_locus(chrom, pos)

                if dbsnp_is_file:
                    rsID = parse_dbSNP(args, chrom, pos, ref, alt)
                else:
                    rsID = fields[args.dbSNP]

                annotation = {
                    'chrom': chrom,
                    'x': x,
                    'y': y,
                    'prob': prob,
                    'pos': pos,
                    'ref': ref,
                    'alt': alt,
                    'rsID': rsID,
                    'gene_ids': gene_ids,
                    'gene_names': gene_names,
                    'af': af,
                }

                ## Don't append to the cluster, if it doesn't have an rsID.
                ## Pick something with a lower probability instead then.
                if not rsID:
                    pass
                ## No clusters yet.
                ## Create first cluster.
                elif not annotations:
                    annotations.append(annotation)
                ## Not in the vicinity of previous cluster.
                ## Append new cluster.
                elif (
                        chrom != annotations[-1]['chrom'] or
                        pos - annotations[-1]['pos'] > 1000000):  # todo: make arg!
                    annotations.append(annotation)
                ## In the vicinity of previous cluster.
                ## Probability lower than current local minimum.
                ## Overwrite previous cluster.
                elif prob < annotations[-1]['prob']:
                    annotations[-1] = annotation
    finally:
        ## Release the input file handle even if parsing fails.
        fi.close()

    print('annotations', annotations, file=sys.stderr)

    plot_qq(args, l_y, l_prob)
    plt.clf()
    plot_manhattan(
        args, annotations, l_x, l_y, l_c, x_ticks, y_max, d_pos_init_chrom)

    return
### # This script reads json files from http://amigo.geneontology.org/amigo and format data into tables for publication ### import os, json # run if first time: pyensembl install --release 99 --species homo_sapiens import pyensembl release = pyensembl.EnsemblRelease() def formatter(enrichment, g): print(enrichment) print() term = enrichment['term']['label'] level = 0 if term != 'UNCLASSIFIED': level = enrichment['term']['level'] background_rank = enrichment['number_in_reference'] found_rank = enrichment['input_list']['number_in_list'] expected_rank = enrichment['input_list']['expected'] fold_enrichment = enrichment['input_list']['fold_enrichment'] sign = enrichment['input_list']['plus_minus'] pvalue = enrichment['input_list']['pValue'] ensembl_ids = enrichment['input_list']['mapped_id_list']['mapped_id'] if isinstance(ensembl_ids, list) == False: ensembl_ids = [ensembl_ids]
#AML #1/6/19 #PyEnsembl tutorial - https://www.hammerlab.org/2015/02/04/exploring-the-genome-with-ensembl-and-python/ #already set up local copy of release 96 import pyensembl import pandas as pd from transcriptGraphing import SpliceVariantPASDiagram, MultiGeneVariantPASDiagram from Bio import SeqIO import numpy as np from scipy.signal import find_peaks #pulls current release version for the human genome ensembl = pyensembl.EnsemblRelease(release='96') #opens human cluster data for given chromosome. Can optionally filter by PAS type def openPASClustersForChromosome(name, pasType='All'): #opening all the true values from PolyASite2.0 colnames = [ "seqName", "start", "end", "clusterID", "avgTPM", "strand", "percentSupporting", "protocolsSupporting", "avgTPM2", "type", "upstreamClusters" ] pas_stuff = pd.read_csv('atlas.clusters.hg38.2-0.bed', delimiter='\t', names=colnames, dtype={"seqName": str}) trueValBoolMask = pas_stuff['seqName'] == name currentTrueVals = pas_stuff[trueValBoolMask] #filtered true vals if pasType == "All":
import pyensembl import sys, os import pandas as pd import utils as bb event_file = sys.argv[1] ensemble_release = int(sys.argv[2]) events_table = pd.read_csv(event_file) ensembl = pyensembl.EnsemblRelease(ensemble_release) gtf = pd.DataFrame( columns=["seqname", "feature", "start", "end", "strand", "attribute"]) for index, row in events_table.iterrows(): try: tid, sep, event_jid = row["effectId"].partition('_') except: continue event_coords = bb.jid_to_coords(event_jid) transcript = ensembl.transcript_by_id(tid) isoform = bb.get_matching_isoform(transcript, event_coords) attribute = 'gene_id ' + transcript.gene_id + '; transcript_id ' + row[ "effectId"] exon_coord = pd.DataFrame(transcript.exon_intervals, columns=["start", "end"]) new_coord = bb.get_new_coord(isoform, event_coords, exon_coord) gtf = gtf.append( { "seqname": transcript.contig, "feature": "transcript", "start": new_coord.start.min(),
from os.path import join, expanduser, curdir, abspath import unittest import agfusion from agfusion import utils import pyensembl from Bio import SeqIO, Seq, Alphabet data = pyensembl.EnsemblRelease(84,'mouse') db = agfusion.AGFusionDB(abspath(join(curdir,'agfusion.mus_musculus.84.db'))) db.build = 'mus_musculus_84' data_human = pyensembl.EnsemblRelease(75,'human') db_human = agfusion.AGFusionDB(abspath(join(curdir,'agfusion.homo_sapiens.75.db'))) db_human.build = 'homo_sapiens_75' class TestSequencePrediction_human(unittest.TestCase): def test_1(self): """ test CDS and prortein correct for junction that is on exon boundaries and produces an out-of-frame protein. """ #test the dna and protein coding sequences are correct by comparing #with manually generally sequences fusion = agfusion.Fusion( gene5prime="TMEM87B", gene5primejunction=112843681, gene3prime="MERTK", gene3primejunction=112722768,
] PPI_triples.gene_symbol_2 = [ str(symbol) for symbol in PPI_triples.gene_symbol_2 ] PPI_edges = df_to_edgelist(PPI_triples.iloc[:, 2:]) # PPI PP = nx.Graph() PP.add_edges_from(PPI_edges) for u, v in PP.edges: if u == v: PP.remove_edge(u, v) # ensembl release 77 esb = pyensembl.EnsemblRelease(77) esb.gene_by_id('ENSG00000148143') # Selected PPI y2h_ht14 = pd.read_csv('./data/PPI/HI-II-14_trim.csv', sep='\t', header=None) y2h_ht14.columns = [ 'uniprot_id_1', 'uniprot_id_2', 'ensembl_anno_1', 'ensembl_anno_2' ] y2h_ht14['ensembl_gene_1'] = [ anno.split('|')[-1].split('.')[0].replace('ensembl:', '') for anno in y2h_ht14.ensembl_anno_1 ] y2h_ht14['ensembl_gene_2'] = [ anno.split('|')[-1].split('.')[0].replace('ensembl:', '') for anno in y2h_ht14.ensembl_anno_2 ]
def convert_mutations():
    """Map mouse somatic mutations onto human genes and flag hotspots.

    Loads the pickled protein alignments from 'alignments.p' (keys are
    '<human transcript id>-<mouse transcript id>'), the mutation table and
    the hotspot table, then for each mutation row:

    * looks up the human gene name of the aligned human transcript;
    * classifies rows without an amino-acid change as splice variants or
      5' UTR premature start codon gains (from the 'EFFECT' column);
    * classifies amino-acid changes containing '*' or 'fs', or equal to
      'p.Met1?', as truncating;
    * otherwise translates the mouse amino-acid position to the aligned
      human position and checks whether that position is a known hotspot
      residue of the human gene.

    :returns: dict keyed by sample name (mutation table columns from the
        11th onward); each value is a dict with the lists 'truncating',
        'premature', 'splice', 'missense' and 'hotspot'.
    """
    human_db = pyensembl.EnsemblRelease(86, 'human')

    # alignments
    # NOTE(review): pickle.load executes arbitrary code from the file;
    # only load an 'alignments.p' that was produced locally.
    with open('alignments.p', 'rb') as handle:
        alignments = pickle.load(handle)
    # mouse transcript id -> full alignment key
    alignment_maps = {key.split('-')[1]: key for key in alignments}

    mutations = pandas.read_csv(
        "/Users/charlesmurphy/Desktop/Research/0914_hui/results/Mutations/Varscan-paired/somatic.nodbsnp.varscan.filtered.primaryControl.csv"
    )
    hotspots = pandas.read_table(
        '/Users/charlesmurphy/Desktop/Research/data/papers/hotspotMutations/060717.hotspots.txt',
        sep='\t')

    # One entry per mutation row, in row order.
    ishotspot = []
    subsitution = []
    human_genes = []

    for i in mutations.index:
        # get the mouse transcript and determine if it had an alignment
        transcript = mutations.loc[i, 'TRANSCRIPT'].split('.')[0]
        if transcript not in alignment_maps:
            ishotspot.append('')
            subsitution.append('')
            human_genes.append('')
            continue

        alignment_key = alignment_maps[transcript]
        human_gene = human_db.transcript_by_id(
            alignment_key.split('-')[0]).gene.name
        human_genes.append(human_gene)

        AA = mutations.loc[i, 'AMINO ACID CHANGE']
        bp_change = mutations.loc[i, 'BASE PAIR CHANGE']
        effect = mutations.loc[i, 'EFFECT']

        # Handle every missing amino-acid change first: a NaN has no
        # string methods, so it must never reach the truncation test.
        if pandas.isnull(AA):
            ishotspot.append('')
            if effect.find('splice') != -1:
                # splice site variants don't have any entry for AA, but I
                # manually checked the list in OncoKB and saw no splice
                # site variants
                subsitution.append(bp_change + ' splice variant')
            elif effect.find(
                    '5_prime_UTR_premature_start_codon_gain_variant') != -1:
                subsitution.append(
                    bp_change +
                    ' 5_prime_UTR_premature_start_codon_gain_variant')
            else:
                subsitution.append('')
                print('not sure what variant')
            continue

        if AA.find('*') != -1 or AA.find('fs') != -1 or AA == 'p.Met1?':
            ishotspot.append('')
            subsitution.append(AA + ' truncating')
            continue

        # get the amino acid position and letter change
        AA_pos = int(re.findall('[0-9]+', AA)[0])
        AA = AA.replace('p.', '')
        AA = AA.replace(str(AA_pos), '')
        AA = seq1(AA)

        # determine if there was an alignment
        alignment = alignments[alignment_key]
        if alignment is None:
            ishotspot.append('')
            # No alignment means no human coordinate exists; report the
            # mouse position rather than a value left over from an
            # earlier row.
            subsitution.append(AA[0] + str(AA_pos) + AA[1])
            continue

        # if there was an alignment, get the amino acid position in the
        # human gene: first find the alignment column of the mouse residue
        mouse_pos = 0
        mouse_index = 0
        mouse_aa = ''
        for aa in alignment[1]:
            mouse_index += 1
            if aa == 'X':
                continue
            if aa != '-':
                mouse_pos += 1
            if mouse_pos == AA_pos:
                mouse_aa = alignment[1][mouse_index - 1]
                break

        if mouse_aa != AA[0]:
            # Reference residue disagrees with the aligned mouse sequence;
            # report it and carry on (no interactive breakpoint).
            print('boo')

        # then count human residues up to that same alignment column
        human_pos = 0
        human_index = 0
        human_aa = ''
        for aa in alignment[0]:
            human_index += 1
            if aa == 'X':
                continue
            if aa != '-':
                human_pos += 1
            if human_index == mouse_index:
                human_aa = alignment[0][human_index - 1]
                break

        subsitution.append(AA[0] + str(human_pos) + AA[1])

        # determine if it is in a hotspot
        hotspots_tmp = hotspots[hotspots['Gene'] == human_gene]
        hotspot_label = ''
        if hotspots_tmp.shape[0] > 0:
            positions = [
                int(re.findall('[0-9]+', residue)[0]) == human_pos
                for residue in hotspots_tmp['Residue'].tolist()
            ]
            if any(positions):
                residue = hotspots_tmp[positions]['Residue'].tolist()[0]
                hotspot_label = human_gene + ' (' + residue + ')'
        ishotspot.append(hotspot_label)

    samples = mutations.columns.tolist()[10:]
    mutations_per_sample = {
        sample: {
            'truncating': [],
            'premature': [],
            'splice': [],
            'missense': [],
            'hotspot': [],
        }
        for sample in samples
    }

    # The per-row lists above are positional, so pair each position with
    # its index label instead of assuming the labels are 0..n-1.
    for row, label in enumerate(mutations.index):
        for ss in samples:
            if mutations.loc[label, ss] == '-':
                continue
            per_sample = mutations_per_sample[ss]
            if ishotspot[row] != '':
                per_sample['hotspot'].append(ishotspot[row])
            if subsitution[row].find('truncating') != -1:
                per_sample['truncating'].append(human_genes[row] + ' (' +
                                                subsitution[row] + ')')
            elif subsitution[row].find('splice') != -1:
                per_sample['splice'].append(human_genes[row] + ' (' +
                                            subsitution[row] + ')')
            elif subsitution[row].find(
                    '5_prime_UTR_premature_start_codon_gain_variant') != -1:
                per_sample['premature'].append(human_genes[row] + ' (' +
                                               subsitution[row] + ')')
            elif subsitution[row] != '':
                per_sample['missense'].append(human_genes[row] + '-' +
                                              subsitution[row])

    return mutations_per_sample
from os.path import join, expanduser, curdir, abspath import unittest import agfusion from agfusion import utils import pyensembl from Bio import SeqIO data = pyensembl.EnsemblRelease(84, 'mouse') db = agfusion.AGFusionDB(abspath(join(curdir, 'agfusion.mus_musculus.84.db'))) db.build = 'mus_musculus_84' class TestSequencePrediction(unittest.TestCase): def test_1(self): """ test CDS and cDNA correct for junction that is on exon boundaries and produces an in-frame protein. """ #test the dna and protein coding sequences are correct by comparing #with manually generally sequences fusion = agfusion.Fusion(gene5prime="ENSMUSG00000022770", gene5primejunction=31684294, gene3prime="ENSMUSG00000002413", gene3primejunction=39648486, db=db, pyensembl_data=data, protein_databases=['pfam', 'tmhmm'], noncanonical=True)
self._draw_main_body() self.ax.axis('off') self.ax.set_xlim(0, 1) self.ax.set_ylim(0, 1) peptides = [[60, 24], [84, 9], [103, 11], [119, 11], [171, 9], [298, 13], [445, 20], [528, 8], [620, 11], [631, 9], [649, 11], [660, 8], [668, 10], [687, 10], [740, 16], [770, 8], [2330, 8], [2476, 13]] agfusion_db = AGFusionDB( '/Users/charlesmurphy/Desktop/Research/0914_hui/results/Fusions/plots/agfusion.mus_musculus.84.db', debug=False) agfusion_db.build = 'mus_musculus' + '_' + str(84) pyensembl_data = pyensembl.EnsemblRelease(84, 'mus_musculus') fusion = Fusion(gene5prime=['ENSMUSG00000030849'], gene5primejunction=130167703, gene3prime=['ENSMUSG00000055322'], gene3primejunction=74016186, db=agfusion_db, pyensembl_data=pyensembl_data, protein_databases=['pfam', 'tmhmm'], noncanonical=False) pplot = PlotFusionProtein( filename='FGFR2-TNS1.png', width=10, height=3, dpi=90,
def _confirm_download(prompt, cfg):
    """
    Ask whether reference data may be downloaded.

    Returns True without prompting when cfg['cfgp'] contains
    '/test_beditor/'. Otherwise the answer is read from stdin; an empty
    answer counts as yes, matching the capitalised default in '[Y/n]'.

    :param prompt: text shown to the user
    :param cfg: configuration dict
    """
    if '/test_beditor/' in cfg['cfgp']:
        return True
    return input(prompt).strip().lower() in ('', 'y', 'yes')


def get_genomes(cfg):
    """
    Installs genomes

    Runs ``pyensembl install`` for the configured assembly/release/species,
    then makes sure the genome FASTA (with bwa index, samtools index and
    contig sizes) and the GFF3 annotation exist under the directory of this
    module, downloading them from ftp.ensembl.org after confirmation.

    Reads cfg['genomeassembly'], cfg['genomerelease'], cfg['host'],
    cfg['cfgp'], cfg['test'], cfg['bwa'] and cfg['samtools']; sets
    cfg['genomep'] and cfg['genomegffp'], and normalises cfg['host'] when
    it contains 'human' or 'yeast'.

    Exits with status 1 when no contigs are found or a download is
    declined.

    :param cfg: configuration dict
    :returns: the updated configuration dict
    """
    runbashcmd(
        f"pyensembl install --reference-name {cfg['genomeassembly']} --release {cfg['genomerelease']} --species {cfg['host']}"
    )

    import pyensembl
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(
            latin_name=cfg['host'],
            synonyms=[cfg['host']],
            reference_assemblies={
                cfg['genomeassembly']: (cfg['genomerelease'],
                                        cfg['genomerelease']),
            }),
        release=cfg['genomerelease'])

    contig_mito = ['MTDNA', 'MITO', 'MT']
    # Keep short contig names without a '.' and drop mitochondrial contigs.
    contigs = [
        c for c in ensembl.contigs()
        if '.' not in c and len(c) < 5 and c not in contig_mito
    ]
    if len(contigs) == 0:
        logging.error('no contigs identified by pyensembl; aborting')
        # Non-zero status: this is a failure, not a clean exit.
        sys.exit(1)
    logging.info(f"{len(contigs)} contigs/chromosomes in the genome")
    logging.info(contigs)

    # raw genome next
    if 'human' in cfg['host'].lower():
        cfg['host'] = 'homo_sapiens'
    if 'yeast' in cfg['host'].lower():
        cfg['host'] = 'saccharomyces_cerevisiae'

    # wget (-x -nH -P <module dir>) recreates the FTP path below this
    # directory, so downloaded files must be looked for here, not in the
    # current working directory.
    module_dir = dirname(realpath(__file__))

    ensembl_fastad = 'pub/release-{}/fasta/{}/dna/'.format(
        cfg['genomerelease'], cfg['host'])
    genome_fastad = '{}/{}'.format(module_dir, ensembl_fastad)
    cfg['genomep'] = '{}/genome.fa'.format(genome_fastad)
    if not exists(cfg['genomep']):
        logging.error('not found: {}'.format(cfg['genomep']))
        if _confirm_download(
                "Download genome at {}?[Y/n]: ".format(genome_fastad), cfg):
            # #FIXME download contigs and cat and get index, sizes
            for contig in contigs:
                if 'GRCh37' in cfg['genomeassembly']:
                    #Homo_sapiens.GRCh37.75.dna_sm.chromosome.1.fa.gz
                    fn = f"{cfg['host'].capitalize()}.{cfg['genomeassembly']}.{cfg['genomerelease']}.dna_sm.chromosome.{contig}.fa.gz"
                else:
                    fn = f"{cfg['host'].capitalize()}.{cfg['genomeassembly']}.dna_sm.chromosome.{contig}.fa.gz"
                fp = '{}/{}'.format(ensembl_fastad, fn)
                if not exists('{}/{}'.format(module_dir, fp)):
                    cmd = 'wget -q -x -nH ftp://ftp.ensembl.org/{} -P {}'.format(
                        fp, module_dir)
                    runbashcmd(cmd, test=cfg['test'])
            # make the fa ready
            if not exists(cfg['genomep']):
                cmd = 'gunzip {}*.fa.gz;cat {}/*.fa > {}/genome.fa;'.format(
                    genome_fastad, genome_fastad, genome_fastad)
                runbashcmd(cmd, test=cfg['test'])
        else:
            logging.error('abort')
            sys.exit(1)

    if not exists(cfg['genomep'] + '.bwt'):
        cmd = '{} index {}'.format(cfg['bwa'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('bwa index is present')

    if not exists(cfg['genomep'] + '.fai'):
        cmd = '{} faidx {}'.format(cfg['samtools'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('samtools index is present')

    if not exists(cfg['genomep'] + '.sizes'):
        cmd = 'cut -f1,2 {}.fai > {}.sizes'.format(cfg['genomep'],
                                                   cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('sizes of contigs are present')

    ensembl_gff3d = 'pub/release-{}/gff3/{}/'.format(cfg['genomerelease'],
                                                     cfg['host'])
    genome_gff3d = '{}/{}'.format(module_dir, ensembl_gff3d)
    cfg['genomegffp'] = '{}/genome.gff3'.format(genome_gff3d)
    if not exists(cfg['genomegffp']):
        logging.error('not found: {}'.format(cfg['genomegffp']))
        if _confirm_download(
                "Download genome annotations at {}?[Y/n]: ".format(
                    genome_gff3d), cfg):
            # #FIXME download contigs and cat and get index, sizes
            fn = '{}.{}.{}.gff3.gz'.format(cfg['host'].capitalize(),
                                           cfg['genomeassembly'],
                                           cfg['genomerelease'])
            fp = '{}/{}'.format(ensembl_gff3d, fn)
            if not exists('{}/{}'.format(module_dir, fp)):
                cmd = 'wget -x -nH ftp://ftp.ensembl.org/{} -P {}'.format(
                    fp, module_dir)
                runbashcmd(cmd, test=cfg['test'])
            # move to genome.gff3
            # NOTE(review): this copies the .gz archive under a .gff3 name
            # without decompressing it - confirm downstream readers expect
            # gzip content.
            cmd = 'cp {}/{} {}'.format(genome_gff3d, fn, cfg['genomegffp'])
            runbashcmd(cmd, test=cfg['test'])
        else:
            logging.error('abort')
            sys.exit(1)

    logging.info('genomes are installed!')
    return cfg