Code example #1
import pickle

import pandas
import pyensembl


def create_align_db():

    alignments = dict()
    human_db = pyensembl.EnsemblRelease(86, 'human')
    mouse_db = pyensembl.EnsemblRelease(86, 'mouse')

    homology = pandas.read_csv("mart_export.txt")
    homology = homology[~pandas.isnull(homology['Mouse gene stable ID'])]
    homologs = dict(
        zip(homology['Mouse gene stable ID'], homology['Gene stable ID']))

    mutations = pandas.read_csv(
        "/Users/charlesmurphy/Desktop/Research/0914_hui/results/Mutations/Varscan-paired/somatic.nodbsnp.varscan.filtered.primaryControl.csv"
    )
    for i in mutations.index:

        #get the canonical mouse transcript
        mouse_transcript = mutations.loc[i, 'TRANSCRIPT'].split('.')[0]
        mouse_transcript = mouse_db.transcript_by_id(mouse_transcript)

        # determine if there is a human homolog

        if mouse_transcript.gene_id in homologs:
            human_gene = human_db.gene_by_id(
                homologs[mouse_transcript.gene_id])

            #get longest CDS transcript

            max_length = 0
            human_canonical_transcript = None

            for human_transcript in human_gene.transcripts:
                if human_transcript.protein_sequence is not None and len(
                        human_transcript.protein_sequence) > max_length:
                    human_canonical_transcript = human_transcript
                    max_length = len(human_transcript.protein_sequence)

            # skip ENST00000589042 because it is really long, and I know
            # none of the mouse tumors have mutations in it

            if human_canonical_transcript is None or human_canonical_transcript.id == 'ENST00000589042':
                continue

            # align the protein sequences

            key = human_canonical_transcript.id + '-' + mouse_transcript.id
            if key not in alignments:
                print(key,
                      len(human_canonical_transcript.protein_sequence),
                      len(mouse_transcript.protein_sequence))
                alignments[key] = align(
                    human_canonical_transcript.id,
                    human_canonical_transcript.protein_sequence,
                    mouse_transcript.id,
                    mouse_transcript.protein_sequence)

    with open('alignments.p', 'wb') as handle:
        pickle.dump(alignments, handle)
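
Note: the function above relies on an align() helper defined elsewhere in its project. As a rough sketch of what such a helper could look like (an assumption for illustration, not the project's actual implementation), Biopython's pairwise2 can produce the gapped sequence pair that example #18 below indexes as alignment[0] (human) and alignment[1] (mouse):

from Bio import pairwise2

def align(human_id, human_seq, mouse_id, mouse_seq):
    # global alignment with simple match/mismatch and gap penalties;
    # the ids are accepted only to mirror the call site above
    results = pairwise2.align.globalms(
        human_seq, mouse_seq, 2, -1, -10, -0.5, one_alignment_only=True)
    if not results:
        return None
    best = results[0]
    # best[0] and best[1] are the aligned sequences, with '-' for gaps
    return (best[0], best[1])
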
Code example #2
def set_genome(params):

    if params['genome'] == "GRCh38":
        pyensembl_data = pyensembl.EnsemblRelease(84, 'human')
        params['grch38_color'] = "lightgrey"
    elif params['genome'] == "GRCh37":
        pyensembl_data = pyensembl.EnsemblRelease(75, 'human')
        params['grch37_color'] = "lightgrey"
    else:
        pyensembl_data = pyensembl.EnsemblRelease(84, 'mouse')
        params['gene5prime'] = params['gene5prime'].capitalize()
        params['gene3prime'] = params['gene3prime'].capitalize()
        params['grcm38_color'] = "lightgrey"

    return params, pyensembl_data
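
A hypothetical call, assuming the caller supplies a params dict with at least 'genome', 'gene5prime', and 'gene3prime' keys (the values here are illustrative):

params = {'genome': 'GRCm38', 'gene5prime': 'fgfr2', 'gene3prime': 'dnm3'}
params, pyensembl_data = set_genome(params)
# for the mouse branch the gene names come back capitalized ('Fgfr2', 'Dnm3')
# and params['grcm38_color'] is set to "lightgrey"
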
Code example #3
def process_data(species, release, genome, agfusion):
    pyens_db = pyensembl.EnsemblRelease(release, species)
    db = sqlite3.Connection(agfusion)
    c = db.cursor()

    # process_gene_synonym(species, release, pyens_db, c)
    # process_gene_data(species, release, pyens_db, c)
    upload_fasta(species, genome, release)
Code example #4
def isExonic(ensemblRelease, chrom, pos):
    ensembl = pyensembl.EnsemblRelease(release=ensemblRelease)
    try:
        exons = ensembl.exons_at_locus(contig=int(chrom), position=int(pos))
    except Exception as e:
        logger.error('exception: ' + str(e))
        return None
    return len(exons) > 0
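
Hypothetical usage, assuming the GRCh37 annotations (release 75) are installed; note the int(chrom) cast means non-numeric contigs such as 'X' raise inside the try block, so the function returns None for them:

if isExonic(75, '17', 41276113):
    print('position falls within an exon')
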
Code example #5
def gene_to_interval(gene, release=75):
    data = pyensembl.EnsemblRelease(release)
    ret = None
    try:
        ret = data.genes_by_name(gene)[0]
    except Exception:
        sys.stderr.write(" ".join(["Gene not found: ", gene, "\n"]))
    if ret is not None:
        return (str(ret.contig), str(ret.start), str(ret.end), str(ret.name))
    else:
        return ret
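
Hypothetical usage with the default release argument (75, GRCh37):

interval = gene_to_interval('BRCA1')
if interval is not None:
    contig, start, end, name = interval
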
Code example #6
def load_ensembl_data(release, species):
    """Load the ensembl data so we can convert gene_ids into gene_names for more
    intuitive output"""
    try:
        ensembl_data = pyensembl.EnsemblRelease(release=release,
                                                species=species)

    except ValueError as error_details:
        print(error_details)
        print("Make sure you have the pyensembl library and the annotation "
              "release you want to use downloaded and properly installed. "
              "e.g.:")
        print("$ pip install pyensembl")
        print("$ pyensembl install --release 85 --species mouse")
        sys.exit(1)

    return ensembl_data
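
Hypothetical usage, assuming the release 85 mouse annotations have been installed as shown in the error message above:

ensembl_data = load_ensembl_data(85, 'mouse')
gene_name = ensembl_data.gene_name_of_gene_id('ENSMUSG00000022770')
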
Code example #7
def getGenesForVariant(variant, ensemblRelease, geneOfInterest):
    ensembl = pyensembl.EnsemblRelease(release=ensemblRelease)
    chrom = variant[0]
    if isinstance(chrom, str) and chrom.startswith('chr'):
        chrom = chrom[len('chr'):]
    pos = variant[1]
    try:
        #exons = ensembl.exons_at_locus(contig=int(chrom), position=int(pos))
        genes = ensembl.gene_names_at_locus(contig=chrom, position=int(pos))
        g_of_i = set()
        g_of_i.add(geneOfInterest)
        g = set(genes)
        intersectingGenes = g_of_i.intersection(g)
        if len(intersectingGenes) != 1:
            return None
        else:
            # unpack and return the single member of the singleton set
            (gene,) = intersectingGenes
            return gene
    except Exception as e:
        logger.error('exception: ' + str(e))
        return None
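
Hypothetical usage: check whether BRCA2 is among the gene names at a variant's locus (the coordinate is illustrative, and the release data must already be installed):

gene = getGenesForVariant(('chr13', 32315086), 75, 'BRCA2')
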
Code example #8
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts the differential micropeptides from two "
        "conditions. Please see the documentation in redmine for more details.\n\n"
        "Please see the pyensembl (https://github.com/hammerlab/pyensembl) "
        "documentation for more information about the ensembl release and species."
    )

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name_a', help="The name of the first condition")
    parser.add_argument('name_b', help="The name of the second condition")
    parser.add_argument('out', help="The output (.csv.gz or .xlsx) file")

    parser.add_argument(
        '-a',
        '--append-sheet',
        help="If this flag is given, "
        "then a worksheet with the name '<name_a>,<name_b>' will be appended "
        "to the .xlsx file given by out (if it exists)",
        action='store_true')

    parser.add_argument(
        '-f',
        '--filter',
        help="If this flag is present, then "
        "the output will be filtered to include only the differential "
        "micropeptides with the highest KL-divergence and read coverage",
        action='store_true')

    parser.add_argument(
        '--read-filter-percent',
        help="If the --filter flag "
        "is given, then only the top --read-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the KL-"
        "divergence filtering criteria.",
        type=float,
        default=default_read_filter_percent)

    parser.add_argument(
        '--kl-filter-percent',
        help="If the --filter flag "
        "is given, then only the top --kl-filter-percent micropeptides will "
        "be considered for the final output. They still must meet the read "
        "coverage filtering criteria.",
        type=float,
        default=default_kl_filter_percent)

    parser.add_argument(
        '--id-matches',
        help="This is a list of files which "
        "contain ORF identifiers to compare to the differential micropeptides. "
        "For each of the files given, two columns will be added to the output "
        "which indicate if either A or B appear in the respective file. Each "
        "file should have a single ORF identifier on each line and contain "
        "nothing else.",
        nargs='*',
        default=default_id_matches)

    parser.add_argument(
        '--id-match-names',
        help="A name to include in the "
        "output file for each --id-matches file. The number of names must "
        "match the number of files.",
        nargs='*',
        default=default_id_match_names)

    parser.add_argument(
        '--overlaps',
        help="This is a list of bed12+ files "
        "which will be compared to the differential micropeptides. Two columns "
        "(one for A, one for B) will be added to the output which indicate if "
        "the respective micropeptides overlap a feature in each file by at "
        "least 1 bp.",
        nargs='*',
        default=default_overlaps)

    parser.add_argument(
        '--overlap-names',
        help="A name to include in the "
        "output file for each --overlaps file. The number of names must match "
        "the number of files.",
        nargs='*',
        default=default_overlap_names)

    parser.add_argument(
        '-r',
        '--ensembl-release',
        help="The version of Ensembl "
        "to use when mapping transcript identifiers to gene identifiers",
        type=int,
        default=default_ensembl_release)

    parser.add_argument(
        '-s',
        '--ensembl-species',
        help="The Ensembl species "
        "to use when mapping transcript identifiers to gene identifiers",
        default=default_ensembl_species)

    parser.add_argument(
        '--a-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_a is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument(
        '--b-is-single-sample',
        help="By default, this script "
        "assumes the predictions come from merged replicates. If name_b is from "
        "a single sample, this flag should be given. It is necessary to find "
        "the correct filenames.",
        action='store_true')

    parser.add_argument('--fields-to-keep',
                        help="The fields to keep from the "
                        "Bayes factor file for each condition",
                        nargs='*',
                        default=default_fields_to_keep)

    parser.add_argument('--max-micropeptide-len',
                        help="The maximum (inclusive) "
                        "length of ORFs considered as micropeptides",
                        type=int,
                        default=default_max_micropeptide_len)

    parser.add_argument(
        '--do-not-fix-tcons',
        help="By default, the \"TCONS_\" "
        "identifiers from StringTie, etc., do not parse correctly; this "
        "script updates the identifiers so that they will parse correctly, "
        "unless instructed not to. The script is likely to crash if the "
        "identifiers are not fixed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ensembl database"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
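    # touching .db forces pyensembl to load (and fail fast on) the local
    # annotation database before any heavy work begins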
    ensembl.db

    msg = "Checking the id-match and overlaps files"
    logger.info(msg)

    if len(args.id_matches) != len(args.id_match_names):
        msg = ("The number of --id-matches files and --id-match-names do not "
               "match. {} files and {} names".format(len(args.id_matches),
                                                     len(args.id_match_names)))

        raise ValueError(msg)

    if len(args.overlaps) != len(args.overlap_names):
        msg = ("The number of --overlaps files and --overlap-names do not "
               "match. {} files and {} names".format(len(args.overlaps),
                                                     len(args.overlap_names)))

        raise ValueError(msg)

    utils.check_files_exist(args.id_matches)
    utils.check_files_exist(args.overlaps)

    if args.filter:
        msg = "Validating filter percentages"
        logger.info(msg)

        math_utils.check_range(args.read_filter_percent,
                               0,
                               1,
                               variable_name="--read-filter-percent")

        math_utils.check_range(args.kl_filter_percent,
                               0,
                               1,
                               variable_name="--kl-filter-percent")

    msg = "Extracting file names"
    logger.info(msg)

    config = yaml.safe_load(open(args.config))

    note_str = config.get('note', None)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    lengths_a = None
    offsets_a = None

    if args.a_is_single_sample:
        lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_a, is_unique=is_unique)

    bayes_factors_a = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_a):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_a, bayes_factors_a))
        raise FileNotFoundError(msg)

    predicted_orfs_a = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_a,
        length=lengths_a,
        offset=offsets_a,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_a):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_a, predicted_orfs_a))
        raise FileNotFoundError(msg)

    lengths_b = None
    offsets_b = None
    if args.b_is_single_sample:
        lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name_b, is_unique=is_unique)

    bayes_factors_b = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors_b):
        msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.".
               format(args.name_b, bayes_factors_b))
        raise FileNotFoundError(msg)

    predicted_orfs_b = filenames.get_riboseq_predicted_orfs(
        config['riboseq_data'],
        args.name_b,
        length=lengths_b,
        offset=offsets_b,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations,
        is_filtered=True,
        is_chisq=False)

    if not os.path.exists(predicted_orfs_b):
        msg = (
            "Could not find the predictions bed file for {}. ({}). Quitting.".
            format(args.name_b, predicted_orfs_b))
        raise FileNotFoundError(msg)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    if not os.path.exists(exons_file):
        msg = "Could not find the exons file ({}). Quitting.".format(
            exons_file)
        raise FileNotFoundError(msg)

    msg = "Reading the exons"
    logger.info(msg)

    exons = bed_utils.read_bed(exons_file)

    msg = "Reading the BF files"
    logger.info(msg)

    bf_df_a = bed_utils.read_bed(bayes_factors_a)
    bf_df_b = bed_utils.read_bed(bayes_factors_b)

    msg = "Reading the predictions files"
    logger.info(msg)

    bed_df_a = bed_utils.read_bed(predicted_orfs_a)
    bed_df_b = bed_utils.read_bed(predicted_orfs_b)

    differential_micropeptide_dfs = []

    # extract micropeptides
    msg = "Extracting micropeptides"
    logger.info(msg)

    m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len
    m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len

    micropeptides_a = bed_df_a[m_micropeptides_a]
    micropeptides_b = bed_df_b[m_micropeptides_b]

    long_orfs_a = bed_df_a[~m_micropeptides_a]
    long_orfs_b = bed_df_b[~m_micropeptides_b]

    msg = "Finding micropeptides in A with no overlap in B"
    logger.info(msg)

    micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a,
                                                        bed_df_b,
                                                        exons=exons)

    micropeptides_a_no_match_b_df = pd.DataFrame()
    micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b)
    micropeptides_a_no_match_b_df['B'] = None
    micropeptides_a_no_match_b_df['kl'] = np.inf
    micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only'

    differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df)

    msg = "Finding micropeptides in B with no overlap in A"
    logger.info(msg)

    micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b,
                                                        bed_df_a,
                                                        exons=exons)

    micropeptides_b_no_match_a_df = pd.DataFrame()
    micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a)
    micropeptides_b_no_match_a_df['A'] = None
    micropeptides_b_no_match_a_df['kl'] = np.inf
    micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only'

    differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df)

    msg = "Finding overlapping micropeptides"
    logger.info(msg)

    micropeptides_a_micropeptides_b_df = get_overlap_df(
        micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df)

    micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b,
                                               'micro_a_long_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_a_long_b_df)

    micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b,
                                               'long_a_micro_b', bf_df_a,
                                               bf_df_b)
    differential_micropeptide_dfs.append(micropeptides_b_long_a_df)

    differential_micropeptides_df = pd.concat(differential_micropeptide_dfs)

    msg = "Adding read count information"
    logger.info(msg)

    res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep],
                                              left_on='A',
                                              right_on='id',
                                              how='left')
    to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_A', axis=1)

    res = res.merge(bf_df_b[args.fields_to_keep],
                    left_on='B',
                    right_on='id',
                    how='left')
    to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep}
    res = res.rename(columns=to_rename)
    res = res.drop('id_B', axis=1)

    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    if not args.do_not_fix_tcons:
        # replace TCONS_ with TCONS
        res['A'] = res['A'].str.replace("TCONS_", "TCONS")
        res['B'] = res['B'].str.replace("TCONS_", "TCONS")

    msg = "Extracting the genes and their biotypes using pyensembl"
    logger.info(msg)

    ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release,
                                       species=args.ensembl_species)
    ensembl_transcript_ids = set(ensembl.transcript_ids())

    biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A',
                                          ensembl, ensembl_transcript_ids)
    biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B',
                                          ensembl, ensembl_transcript_ids)

    biotypes_a = utils.remove_nones(biotypes_a)
    biotypes_b = utils.remove_nones(biotypes_b)

    biotypes_a = pd.DataFrame(biotypes_a)
    biotypes_b = pd.DataFrame(biotypes_b)

    res = res.merge(biotypes_a, on='A', how='left')
    res = res.merge(biotypes_b, on='B', how='left')

    msg = "Pulling annotations from mygene.info"
    logger.info(msg)

    # pull annotations from mygene
    gene_info_a = mygene_utils.query_mygene(res['gene_id_A'])
    gene_info_b = mygene_utils.query_mygene(res['gene_id_B'])

    # and add the mygene info
    res = res.merge(gene_info_a,
                    left_on='gene_id_A',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    res = res.merge(gene_info_b,
                    left_on='gene_id_B',
                    right_on='gene_id',
                    how='left')

    to_rename = {f: "{}_B".format(f) for f in gene_info_a.columns}
    to_rename.pop('gene_id')
    res = res.rename(columns=to_rename)
    res = res.drop('gene_id', axis=1)

    msg = "Removing duplicates"
    logger.info(msg)
    id_columns = ['A', 'B']
    res = res.drop_duplicates(subset=id_columns)

    msg = "Adding --id-matches columns"
    logger.info(msg)

    for (id_match_file, name) in zip(args.id_matches, args.id_match_names):
        res = add_id_matches(res, id_match_file, name)

    msg = "Adding --overlaps columns"
    logger.info(msg)

    for (overlap_file, name) in zip(args.overlaps, args.overlap_names):
        res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons)

    msg = "Sorting by in-frame reads"
    logger.info(msg)

    res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0)
    res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0)
    res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B']
    res = res.sort_values('x_1_sum', ascending=False)

    if args.filter:
        msg = "Filtering the micropeptides by read coverage and KL-divergence"
        logger.info(msg)

        x_1_sum_ranks = res['x_1_sum'].rank(method='min',
                                            na_option='top',
                                            ascending=False)
        num_x_1_sum_ranks = x_1_sum_ranks.max()
        max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent
        m_good_x_1_sum_rank = x_1_sum_ranks <= max_good_x_1_sum_rank

        msg = ("Number of micropeptides passing read filter: {}".format(
            sum(m_good_x_1_sum_rank)))
        logger.debug(msg)

        kl_ranks = res['kl'].rank(method='dense',
                                  na_option='top',
                                  ascending=False)
        num_kl_ranks = kl_ranks.max()
        max_good_kl_rank = num_kl_ranks * args.kl_filter_percent
        m_good_kl_rank = kl_ranks <= max_good_kl_rank

        msg = ("Number of micropeptides passing KL filter: {}".format(
            sum(m_good_kl_rank)))
        logger.debug(msg)

        m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank

        msg = ("Number of micropeptides passing both filters: {}".format(
            sum(m_both_filters)))
        logger.debug(msg)

        res = res[m_both_filters]

    msg = "Writing differential micropeptides to disk"
    logger.info(msg)

    if not args.append_sheet:
        utils.write_df(res, args.out, index=False)
    else:
        sheet_name = "{},{}".format(args.name_a, args.name_b)
        utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
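
The get_transcript_and_biotype helper used above is not included in this listing. A minimal sketch of what it plausibly does (the ORF-id format and the returned keys are assumptions, chosen to match the merges on 'A'/'B' and the later use of the gene_id_A and gene_id_B columns):

def get_transcript_and_biotype(row, column, ensembl, transcript_ids):
    orf_id = row[column]
    if orf_id is None:
        return None
    # ORF identifiers are assumed to begin with the Ensembl transcript id
    transcript_id = orf_id.split('_')[0]
    if transcript_id not in transcript_ids:
        return None
    transcript = ensembl.transcript_by_id(transcript_id)
    return {
        column: orf_id,
        'transcript_id_{}'.format(column): transcript_id,
        'gene_id_{}'.format(column): transcript.gene_id,
        'biotype_{}'.format(column): transcript.biotype,
    }
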
Code example #9
File: cli.py Project: ctokheim/fusion_pipeline
def main():
    """
    Main function for processing command line options
    """

    parser = argparse.ArgumentParser(
        description='Annotate Gene Fusion (AGFusion)')
    subparsers = parser.add_subparsers(help='AGFusion programs.',
                                       dest="subparser_name")

    annotate_parser = subparsers.add_parser(
        'annotate', help='Annotate and visualize a single fusion.')
    annotate_parser.add_argument('-g5',
                                 '--gene5prime',
                                 type=str,
                                 required=True,
                                 help='5\' gene partner')
    annotate_parser.add_argument('-g3',
                                 '--gene3prime',
                                 type=str,
                                 required=True,
                                 help='3\' gene partner')
    annotate_parser.add_argument(
        '-j5',
        '--junction5prime',
        type=int,
        required=True,
        help='Genomic location of the predicted fusion junction for the ' +
        '5\' gene partner. The 1-based position that is the last ' +
        'nucleotide included in the fusion before the junction.')
    annotate_parser.add_argument(
        '-j3',
        '--junction3prime',
        type=int,
        required=True,
        help='Genomic location of the predicted fusion junction for the ' +
        '3\' gene partner. The 1-based position that is the first ' +
        'nucleotide included in the fusion after the junction.')
    add_common_flags(annotate_parser)
    annotate_parser.add_argument(
        '--scale',
        type=int,
        required=False,
        default=-1,
        help='(Optional) Set maximum width (in amino acids) of the ' +
        'figure to rescale the fusion (default: max length of ' +
        'fusion product)')

    # batch file parser

    batch_parser = subparsers.add_parser(
        'batch',
        help='Annotate fusions from an output file from a fusion ' +
        'finding algorithm.')
    batch_parser.add_argument(
        '-f',
        '--file',
        type=str,
        required=True,
        help='Output file from fusion-finding algorithm.')
    batch_parser.add_argument(
        '-a',
        '--algorithm',
        type=str,
        required=True,
        help='The fusion-finding algorithm. Can be one of the following: ' +
        ', '.join(agfusion.parsers.keys()) + '.')
    add_common_flags(batch_parser)

    # download database

    database_parser = subparsers.add_parser(
        'download', help='Download database for a reference genome.')
    database_parser.add_argument(
        '-d',
        '--dir',
        type=str,
        default='',
        help='(Optional) Directory the database will be downloaded ' +
        'to (defaults to current working directory).')
    database_parser.add_argument(
        '-g',
        '--genome',
        type=str,
        default=None,
        help='Specify the genome shortcut (e.g. hg19). To see all ' +
        'available shortcuts run \'agfusion download -a\'. Either ' +
        'specify this or --species and --release.')
    database_parser.add_argument('-s',
                                 '--species',
                                 type=str,
                                 default=None,
                                 help='The species (e.g. homo_sapiens).')
    database_parser.add_argument('-r',
                                 '--release',
                                 type=int,
                                 default=None,
                                 help='The ensembl release (e.g. 87).')
    database_parser.add_argument(
        '-a',
        '--available',
        action='store_true',
        required=False,
        help='List available species and ensembl releases.')

    # build database parser

    build_database_parser = subparsers.add_parser(
        'build', help='Build database for a reference genome.')
    build_database_parser.add_argument(
        '-d',
        '--dir',
        type=str,
        required=True,
        help='Directory to write database file to.')
    build_database_parser.add_argument('-s',
                                       '--species',
                                       type=str,
                                       required=True,
                                       help='The species (e.g. homo_sapiens).')
    build_database_parser.add_argument('-r',
                                       '--release',
                                       type=int,
                                       required=True,
                                       help='The ensembl release (e.g. 87).')
    build_database_parser.add_argument(
        '--pfam',
        type=str,
        required=True,
        help='File containing PFAM ID mappings.')
    build_database_parser.add_argument(
        '--server',
        type=str,
        required=False,
        default='ensembldb.ensembl.org',
        help='(optional) Ensembl server (default ensembldb.ensembl.org)')

    # agfusion version number

    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=agfusion.__version__)
    args = parser.parse_args()

    if args.subparser_name == 'build':
        builddb(args)
        exit()
    elif args.subparser_name == 'download':
        if args.available:
            list_available_databases()
        else:
            downloaddb(args)
        exit()

    # single or batch mode

    if not exists(args.out):
        mkdir(args.out)

    # if user does not specify a sqlite database then use the one provided
    # by the package

    db_file = split(args.database)[1]
    species = db_file.split('.')[1]
    release = db_file.split('.')[2]

    assert species in AVAILABLE_ENSEMBL_SPECIES, 'unsupported species!'

    agfusion_db = agfusion.AGFusionDB(args.database, debug=args.debug)
    agfusion_db.build = species + '_' + str(release)

    # get the pyensembl data

    pyensembl_data = pyensembl.EnsemblRelease(release, species)

    try:
        pyensembl_data.db
    except ValueError:
        agfusion_db.logger.error(
            "Missing pyensembl data. Run pyensembl install --release " +
            "{} --species {}".format(release, species))
        exit()

    # parse the re-coloring and re-naming

    colors = {}
    rename = {}

    if args.rename is not None:
        for i in args.rename:
            pair = i.split(';')

            assert len(pair) == 2, "did not properly specify --rename"

            if pair[0] in rename:
                agfusion_db.logger.warning(
                    "WARNING - you renamed {} twice.".format(pair[0]))

            rename[pair[0]] = pair[1]

    if args.recolor is not None:
        for i in args.recolor:
            pair = i.split(';')

            assert len(pair) == 2, "did not properly specify --recolor"

            if pair[0] in colors:
                agfusion_db.logger.warning(
                    "You specified colors for {} twice.".format(pair[0]))

            if pair[0] in rename:
                colors[rename[pair[0]]] = pair[1]
            else:
                colors[pair[0]] = pair[1]

    # check image file type is valid

    if args.type not in ['png', 'pdf', 'jpeg']:
        agfusion_db.logger.error(
            "ERROR - provided an incorrect image file type: {}.".format(
                args.type))
        exit()

    if args.subparser_name == 'annotate':
        annotate(gene5prime=args.gene5prime,
                 junction5prime=args.junction5prime,
                 gene3prime=args.gene3prime,
                 junction3prime=args.junction3prime,
                 agfusion_db=agfusion_db,
                 pyensembl_data=pyensembl_data,
                 args=args,
                 outdir=args.out,
                 colors=colors,
                 rename=rename,
                 scale=args.scale)
    elif args.subparser_name == 'batch':
        batch_mode(args, agfusion_db, pyensembl_data, rename, colors)
Code example #10
def get_seq_aminoacid(cfg, din):
    """
    Fetches sequences if mutation format is amino acid 

    :param cfg: configuration dict
    :param din: input data
    :returns dsequences: dataframe with sequences
    """
    import pyensembl
    #import ensembl object that would fetch genes
    # ensembl = pyensembl.EnsemblRelease(release=cfg['genomerelease'])
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(latin_name=cfg['host'],
                                                   synonyms=[cfg['host']],
                                                   reference_assemblies={
                                                       cfg['genomeassembly']:
                                                       (cfg['genomerelease'],
                                                        cfg['genomerelease']),
                                                   }),
        release=cfg['genomerelease'])

    din.index = range(len(din))
    dbedp = '{}/dbedflank.bed'.format(cfg['datad'])
    dbed = pd.DataFrame(columns=bed_colns)
    terrpositions = []
    terrnotfound = []
    terrnoncoding = []
    bedrowi = 0
    #             for i in trange(len(din)-1,desc='get positions for bedtools'):
    for i in din.index:
        if din.loc[i, 'transcript: id'] in ensembl.transcript_ids():
            t = ensembl.transcript_by_id(din.loc[i, 'transcript: id'])
            if t.is_protein_coding and t.contains_start_codon and t.contains_stop_codon:
                coding_sequence_positions = tboundaries2positions(t)
                if len(coding_sequence_positions) == len(t.coding_sequence):
                    #TODO: check that the sequence built from coding_sequence_positions matches t.coding_sequence
                    dcoding = t2pmapper(t, coding_sequence_positions)
                    dcodingmutpos = dcoding.loc[(
                        dcoding['protein index'] == din.loc[
                            i, 'aminoacid: position']), :]
                    codon_positions = dcodingmutpos[
                        'coding sequence positions'].tolist()
                    if len(codon_positions) != 0:
                        dbed.loc[bedrowi, 'chromosome'] = t.contig
                        if cfg['test']:
                            print(din.loc[i, 'transcript: id'],
                                  codon_positions)
                        if t.strand == '+':
                            dbed.loc[bedrowi,
                                     'codon start'] = codon_positions[0]
                            dbed.loc[bedrowi, 'codon end'] = codon_positions[2]
                        elif t.strand == '-':
                            dbed.loc[bedrowi,
                                     'codon start'] = codon_positions[2]
                            dbed.loc[bedrowi, 'codon end'] = codon_positions[0]
                        dbed.loc[bedrowi, 'start'] = dbed.loc[
                            bedrowi,
                            'codon start'] - 22  #FIXME put flank in the yml
                        dbed.loc[bedrowi, 'end'] = dbed.loc[
                            bedrowi,
                            'codon end'] + 21  #FIXME put flank in the yml

                        dbed.loc[bedrowi, 'reference residue'] = dcodingmutpos[
                            'protein sequence'].tolist()[0]
                        dbed.loc[bedrowi, 'reference codon'] = ''.join(
                            dcodingmutpos['coding sequence'].tolist())
                        dbed.loc[bedrowi, 'strand'] = t.strand
                        dbed.loc[bedrowi, 'id'] = '{}|{}|{}|{}|{}'.format(
                            din.loc[i, 'transcript: id'],
                            dbed.loc[bedrowi, 'chromosome'],
                            dbed.loc[bedrowi, 'strand'],
                            int(dbed.loc[bedrowi, 'start']),
                            int(dbed.loc[bedrowi, 'end']))
                        dbed.loc[bedrowi, 'gene: id'] = t.gene_id
                        dbed.loc[bedrowi, 'gene: name'] = t.gene.name
                        dbed.loc[bedrowi, 'protein: id'] = t.protein_id
                        dbed.loc[bedrowi, 'aminoacid: position'] = din.loc[
                            i, 'aminoacid: position']
                        #         break
                        bedrowi += 1
                    else:
                        terrpositions.append(t.id)
                else:
                    terrpositions.append(t.id)
            else:
                terrnoncoding.append(t.id)
        else:
            terrnotfound.append(din.loc[i, 'transcript: id'])
            if cfg['test']:
                logging.error('not found: {}'.format(
                    din.loc[i, 'transcript: id']))
    if len(dbed) == 0:
        from beditor.lib.global_vars import saveemptytable
        logging.warning('no valid sequences found; saving an empty table.')
        saveemptytable(cfg, f"{cfg['dsequencesp']}")
        return None
    dbed = dbed.loc[(dbed.apply(lambda x: x['end'] - x['start'] == 45,
                                axis=1)), :]  #FIXME put flank in the yml

    dbed.loc[:, 'start'] = dbed.loc[:, 'start'].astype(int)
    dbed.loc[:, 'end'] = dbed.loc[:, 'end'].astype(int)

    dbed = dbed.drop_duplicates(subset=bed_colns)
    dbed.loc[:, bed_colns].to_csv(dbedp, sep='\t', header=False, index=False)
    err2tids = {
        'terrpositions': terrpositions,
        'terrnotfound': terrnotfound,
        'terrnoncoding': terrnoncoding,
    }
    if cfg['test']:
        print(err2tids)
    with open(dbedp + '.err.json', 'w') as outfile:
        json.dump(err2tids, outfile)

    bedp = f"{cfg['datad']}/dbedflank.bed"
    fastap = f"{cfg['datad']}/dbedflank.fa"
    cmd = f"{cfg['bedtools']} getfasta -s -name -fi {cfg['genomep']} -bed {bedp} -fo {fastap}"
    runbashcmd(cmd)

    dflankfa = fa2df(fastap, ids2cols=True)
    dflankfa.loc[:, 'sequence'] = dflankfa.loc[:, 'sequence'].apply(
        lambda x: x.upper())
    dflankfa.loc[:,
                 'sequence: length'] = [len(s) for s in dflankfa['sequence']]
    dflankfa.index = [idx.split('(')[0] for idx in dflankfa.index]
    dflankfa.index.name = 'id'
    dseq = set_index(dbed, 'id').join(set_index(dflankfa, 'id'), rsuffix='.1')
    dseq2compatible = {
        'aminoacid: position': 'aminoacid: position',
        'gene: id': 'gene: id',
        'gene: name': 'gene: name',
        'protein: id': 'protein: id',
        'transcript: id': 'seqid',
        'transcript: sequence': 'sequence',
        'aminoacid: wild-type': 'reference residue',
        'codon: wild-type': 'reference codon',
        'contig': 'contig',
        'strand': 'strand',
        'start': 'start',
        'end': 'end',
        'codon start': 'codon start',
        'codon end': 'codon end',
    }
    if 'amino acid mutation' in dseq:
        dseq2compatible['amino acid mutation'] = 'amino acid mutation'
    dseq.to_csv(cfg['dseqtmpp'], sep='\t')

    dseq = dseq[list(dseq2compatible.values())]
    dseq.columns = list(dseq2compatible.keys())
    #             dseq.to_csv('data/dseq.csv')

    logging.info(dseq.columns.tolist())
    logging.info(din.columns.tolist())
    dseq = pd.merge(dseq.reset_index(),
                    din,
                    on=['transcript: id', 'aminoacid: position'])
    logging.info(dseq.columns.tolist())
    set_index(dseq, 'id')
    if 'reverse_mutations' in cfg:
        if cfg['reverse_mutations']:
            from beditor.lib.io_dfs import dfswapcols
            dseq = dfswapcols(dseq,
                              ['aminoacid: wild-type', 'amino acid mutation'])
            dseq['codon: mutation'] = dseq['codon: wild-type'].copy()

    dseq.to_csv(f"{cfg['dsequencesp']}", sep='\t')
    del ensembl
Code example #11
def process_results(comparisons, comparisons_params):

    db = pyensembl.EnsemblRelease('75', 'human')

    for directory in comparisons:
        outdir = directory + '/pval' + str(
            comparisons_params[directory]['pvalue']) + '_padj' + str(
                comparisons_params[directory]['padj']) + '_log2FC' + str(
                    comparisons_params[directory]['log2'])
        print(outdir)

        try:
            genes = pandas.read_table('./' + outdir + '/' + directory +
                                      '_up.txt',
                                      header=None)
        except Exception:
            continue

        symbols = []
        for ii in genes[0].tolist():
            try:
                symbols.append(db.gene_by_id(ii).gene_name)
            except ValueError:
                symbols.append(ii)
        genes['symbols'] = symbols
        genes.index = genes[0]

        mapping, raw, bed = parse_result_file('./' + outdir + '/' + directory +
                                              '_results_up.txt')
        raw = get_overlapping_genes(raw, mapping, genes)
        raw_up = raw[raw['total_genes_in_region'].astype(int) > 3]
        raw_up['cna'] = ['amplification'] * raw_up.shape[0]

        try:
            genes = pandas.read_table('./' + outdir + '/' + directory +
                                      '_down.txt',
                                      header=None)
        except Exception:
            continue
        symbols = []
        for ii in genes[0].tolist():
            try:
                symbols.append(db.gene_by_id(ii).gene_name)
            except ValueError:
                symbols.append(ii)
        genes['symbols'] = symbols
        genes.index = genes[0]

        mapping, raw, bed = parse_result_file('./' + outdir + '/' + directory +
                                              '_results_down.txt')
        raw = get_overlapping_genes(raw, mapping, genes)
        raw_down = raw[raw['total_genes_in_region'].astype(int) > 3]
        raw_down['cna'] = ['deletion'] * raw_down.shape[0]

        raw = pandas.concat([raw_down, raw_up])
        raw = raw[[
            'chrom', 'start', 'end', 'pvalue', 'padj', 'cna',
            'num_differentially_expressed_genes', 'total_genes_in_region',
            'differentially_expressed_genes'
        ]]
        raw = raw.sort_values('padj')

        raw.to_excel('./' + outdir + '/' + directory + '_results.xlsx',
                     index=False)
Code example #12
def main():

    args = argparser()

    os.environ['PYENSEMBL_CACHE_DIR'] = args.pyensembl
    ensembl_object = pyensembl.EnsemblRelease(release=75)

#    download_ensembl(args, ensembl_object)

    ## If dbSNP argument is not a file, then it's a column of rsIDs.
    if not os.path.isfile(args.dbSNP):
        args.dbSNP = int(args.dbSNP)

    ## http://colorbrewer2.org/#type=qualitative&scheme=Paired&n=4
    cycle_colors = itertools.cycle(
        (('#a6cee3', '#1f78b4'), ('#b2df8a', '#33a02c')))

    pos_init_chrom = 0
    pos_prev = 0
    fi = fileinput.FileInput(
        files=args.input, openhook=fileinput.hook_compressed)
    l_x = []
    l_y = []
    l_c = []
    l_prob = []
    x_ticks = []
    ## Gene annotations at local maxima.
    annotations = []
    y_max = 0
    d_pos_init_chrom = {}
    for chrom, split_lines in itertools.groupby(
        split_line(fi), operator.itemgetter(args.chrom)):
        print('Looping over chromosome', chrom, file=sys.stderr)
        pos_init_chrom += pos_prev + 1500000  # todo: make argument
        d_pos_init_chrom[chrom] = pos_init_chrom
        colors = next(cycle_colors)
        x_ticks.append((None, None))
        pos = 0
        for l in split_lines:
            af = float(l[args.af])
            if min(af, 1 - af) < args.threshold_maf:
                continue
            pos, pos_prev = int(l[args.pos]), pos
            ## Assert that input is sorted.
            assert pos >= pos_prev, (pos, pos_prev)
            prob = float(l[args.prob])
            ref = l[args.ref]
            alt = l[args.alt]
            x = pos + pos_init_chrom
            try:
                y = -math.log10(prob)
            except ValueError:
                ## GEMMA seems to print a p_lrt of 0.000000e+00 when the
                ## numbers get too small, e.g. bilirubin 2:234664586.
                assert l[args.prob] == '0.000000e+00'
                ## Reuse the previous point's y value, nudged slightly upward.
                y = y + 0.01
            y_max = max(y, y_max)
            l_x.append(x)
            l_y.append(y)
            l_prob.append(prob)
            ## Place a chromosome tick halfway through the chromosome basepair range.
            x_ticks[-1] = (pos_init_chrom + pos / 2, chrom)
            if prob > args.threshold_p:
                l_c.append(colors[0])
            else:
                l_c.append(colors[1])
                gene_names = ensembl_object.gene_names_at_locus(chrom, pos)
                gene_ids = ensembl_object.gene_ids_at_locus(chrom, pos)
                protein_ids = ensembl_object.protein_ids_at_locus(chrom, pos)
                transcript_ids = ensembl_object.transcript_ids_at_locus(
                    chrom, pos)
##                print('protein_ids', protein_ids)
##                print('gene_names', gene_names)
##                print('gene_ids', gene_ids)
##                print('transcript_ids', transcript_ids)
####                for gene_id in gene_ids:
####                    locus = ensembl_object.locus_of_gene_id(gene_id)
######                    print(gene_id, locus.start, locus.end, locus.strand)
                if os.path.isfile(args.dbSNP):
                    rsID = parse_dbSNP(args, chrom, pos, ref, alt)
                else:
                    rsID = l[args.dbSNP]
                annotation = {
                    'chrom': chrom,
                    'x': x,
                    'y': y,
                    'prob': prob,
                    'pos': pos,
                    'ref': ref,
                    'alt': alt,
                    'rsID': rsID,
                    'gene_ids': gene_ids,
                    'gene_names': gene_names,
                    'af': af,
                    }
##                print(
##                    prob, af, chrom, pos, rsID, ref, alt,
##                    ','.join(gene_ids), ','.join(gene_names),
##                    sep='\t', file=sys.stdout)

                ## Don't append to the cluster if it doesn't have an rsID;
                ## something with a lower probability will be picked instead.
                if not rsID:
                    pass
                ## No clusters yet.
                ## Create first cluster.
                elif not annotations:
                    annotations.append(annotation)
                ## Not in the vicinity of previous cluster.
                ## Append new cluster.
                elif (
                    chrom != annotations[-1]['chrom'] or
                    pos - annotations[-1]['pos'] > 1000000):  # todo: make arg!
                    annotations.append(annotation)
                ## In the vicinity of previous cluster.
                ## Probability lower than current local minimum.
                ## Overwrite previous cluster.
                elif prob < annotations[-1]['prob']:
                    annotations[-1] = annotation

    print('annotations', annotations, file=sys.stderr)

    plot_qq(args, l_y, l_prob)
    plt.clf()
    plot_manhattan(
        args, annotations, l_x, l_y, l_c, x_ticks, y_max, d_pos_init_chrom)

    return
Code example #13
###
# This script reads json files from http://amigo.geneontology.org/amigo and formats the data into tables for publication
###

import os, json

# run if first time: pyensembl install --release 99 --species homo_sapiens
import pyensembl
release = pyensembl.EnsemblRelease()

def formatter(enrichment, g):

    print(enrichment)
    print()

    term = enrichment['term']['label']

    level = 0
    if term != 'UNCLASSIFIED':
        level = enrichment['term']['level']

    background_rank = enrichment['number_in_reference']
    found_rank = enrichment['input_list']['number_in_list']
    expected_rank = enrichment['input_list']['expected']
    fold_enrichment = enrichment['input_list']['fold_enrichment']
    sign = enrichment['input_list']['plus_minus']
    pvalue = enrichment['input_list']['pValue']

    ensembl_ids = enrichment['input_list']['mapped_id_list']['mapped_id']
    if not isinstance(ensembl_ids, list):
        ensembl_ids = [ensembl_ids]
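
    # The listing is truncated here. Presumably the Ensembl ids are next
    # mapped to gene symbols via the release object created above; a sketch
    # (not the author's code) might look like:
    gene_names = []
    for ensembl_id in ensembl_ids:
        try:
            gene_names.append(release.gene_name_of_gene_id(ensembl_id))
        except ValueError:
            # fall back to the raw id when it is not in this release
            gene_names.append(ensembl_id)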
Code example #14
#AML
#1/6/19
#PyEnsembl tutorial - https://www.hammerlab.org/2015/02/04/exploring-the-genome-with-ensembl-and-python/
#already set up local copy of release 96

import pyensembl
import pandas as pd
from transcriptGraphing import SpliceVariantPASDiagram, MultiGeneVariantPASDiagram
from Bio import SeqIO
import numpy as np
from scipy.signal import find_peaks

#uses release 96 of the human genome annotations
ensembl = pyensembl.EnsemblRelease(release='96')


#opens human cluster data for given chromosome.  Can optionally filter by PAS type
def openPASClustersForChromosome(name, pasType='All'):
    #opening all the true values from PolyASite2.0
    colnames = [
        "seqName", "start", "end", "clusterID", "avgTPM", "strand",
        "percentSupporting", "protocolsSupporting", "avgTPM2", "type",
        "upstreamClusters"
    ]
    pas_stuff = pd.read_csv('atlas.clusters.hg38.2-0.bed',
                            delimiter='\t',
                            names=colnames,
                            dtype={"seqName": str})
    trueValBoolMask = pas_stuff['seqName'] == name
    currentTrueVals = pas_stuff[trueValBoolMask]  #filtered true vals
    if pasType == "All":
Code example #15
File: makeGTF.py Project: tgen/bisbee
import pyensembl
import sys, os
import pandas as pd
import utils as bb

event_file = sys.argv[1]
ensembl_release = int(sys.argv[2])

events_table = pd.read_csv(event_file)
ensembl = pyensembl.EnsemblRelease(ensembl_release)
gtf = pd.DataFrame(
    columns=["seqname", "feature", "start", "end", "strand", "attribute"])

for index, row in events_table.iterrows():
    try:
        tid, sep, event_jid = row["effectId"].partition('_')
    except:
        continue
    event_coords = bb.jid_to_coords(event_jid)
    transcript = ensembl.transcript_by_id(tid)
    isoform = bb.get_matching_isoform(transcript, event_coords)
    attribute = 'gene_id ' + transcript.gene_id + '; transcript_id ' + row[
        "effectId"]
    exon_coord = pd.DataFrame(transcript.exon_intervals,
                              columns=["start", "end"])
    new_coord = bb.get_new_coord(isoform, event_coords, exon_coord)
    gtf = gtf.append(
        {
            "seqname": transcript.contig,
            "feature": "transcript",
            "start": new_coord.start.min(),
Code example #16
from os.path import join, expanduser, curdir, abspath
import unittest
import agfusion
from agfusion import utils
import pyensembl
from Bio import SeqIO, Seq, Alphabet

data = pyensembl.EnsemblRelease(84, 'mouse')
db = agfusion.AGFusionDB(abspath(join(curdir, 'agfusion.mus_musculus.84.db')))
db.build = 'mus_musculus_84'

data_human = pyensembl.EnsemblRelease(75, 'human')
db_human = agfusion.AGFusionDB(abspath(join(curdir, 'agfusion.homo_sapiens.75.db')))
db_human.build = 'homo_sapiens_75'


class TestSequencePrediction_human(unittest.TestCase):
    def test_1(self):
        """
        test that the CDS and protein are correct for a junction that is on
        exon boundaries and produces an out-of-frame protein.
        """

        #test the dna and protein coding sequences are correct by comparing
        #with manually generated sequences

        fusion = agfusion.Fusion(
            gene5prime="TMEM87B",
            gene5primejunction=112843681,
            gene3prime="MERTK",
            gene3primejunction=112722768,
Code example #17
]
PPI_triples.gene_symbol_2 = [
    str(symbol) for symbol in PPI_triples.gene_symbol_2
]

PPI_edges = df_to_edgelist(PPI_triples.iloc[:, 2:])

# PPI
PP = nx.Graph()
PP.add_edges_from(PPI_edges)
# drop self-loops; removing edges while iterating over PP.edges can
# invalidate the iterator
PP.remove_edges_from(list(nx.selfloop_edges(PP)))

# ensembl release 77
esb = pyensembl.EnsemblRelease(77)
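# look up a known gene id; the result is unused, so this call presumably
# just verifies that the release 77 annotation data are installed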
esb.gene_by_id('ENSG00000148143')

# Selected PPI
y2h_ht14 = pd.read_csv('./data/PPI/HI-II-14_trim.csv', sep='\t', header=None)
y2h_ht14.columns = [
    'uniprot_id_1', 'uniprot_id_2', 'ensembl_anno_1', 'ensembl_anno_2'
]
y2h_ht14['ensembl_gene_1'] = [
    anno.split('|')[-1].split('.')[0].replace('ensembl:', '')
    for anno in y2h_ht14.ensembl_anno_1
]
y2h_ht14['ensembl_gene_2'] = [
    anno.split('|')[-1].split('.')[0].replace('ensembl:', '')
    for anno in y2h_ht14.ensembl_anno_2
]
Code example #18
import pickle
import re

import pandas
import pyensembl
from Bio.SeqUtils import seq1


def convert_mutations():
    chroms = [str(i) for i in range(1, 20)] + ['X', 'Y']

    # human and mouse homologs

    homology = pandas.read_csv("mart_export.txt")
    homology = homology[~pandas.isnull(homology['Mouse gene stable ID'])]
    homologs = dict(
        zip(homology['Mouse gene stable ID'], homology['Gene stable ID']))

    human_db = pyensembl.EnsemblRelease(86, 'human')
    mouse_db = pyensembl.EnsemblRelease(86, 'mouse')

    # alignments

    alignments = pickle.load(open('alignments.p', 'rb'))
    alignment_maps = {}
    for i in alignments.keys():
        alignment_maps[i.split('-')[1]] = i

    mutations = pandas.read_csv(
        "/Users/charlesmurphy/Desktop/Research/0914_hui/results/Mutations/Varscan-paired/somatic.nodbsnp.varscan.filtered.primaryControl.csv"
    )
    hotspots = pandas.read_table(
        '/Users/charlesmurphy/Desktop/Research/data/papers/hotspotMutations/060717.hotspots.txt',
        sep='\t')
    ishotspot = []
    subsitution = []
    human_genes = []

    for i in mutations.index:

        #get the mouse transcript and determine if it had an alignment

        transcript = mutations.loc[i, 'TRANSCRIPT'].split('.')[0]
        if transcript not in alignment_maps:
            ishotspot.append('')
            subsitution.append('')
            human_genes.append('')
            continue

        human_gene = human_db.transcript_by_id(
            alignment_maps[transcript].split('-')[0]).gene.name
        human_genes.append(human_gene)

        AA = mutations.loc[i, 'AMINO ACID CHANGE']
        bp_change = mutations.loc[i, 'BASE PAIR CHANGE']
        effect = mutations.loc[i, 'EFFECT']

        if pandas.isnull(AA) and effect.find('splice') != -1:

            #only continue if it is an actual amino acid change;
            #splice site variants don't have any entry for AA, but I manually
            #checked the list in OncoKB and saw no splice site variants

            ishotspot.append('')
            subsitution.append(bp_change + ' splice variant')
            continue
        elif pandas.isnull(AA) and effect.find(
                '5_prime_UTR_premature_start_codon_gain_variant') != -1:
            ishotspot.append('')
            subsitution.append(
                bp_change + ' 5_prime_UTR_premature_start_codon_gain_variant')
            continue
        elif AA.find('*') != -1 or AA.find('fs') != -1 or AA == 'p.Met1?':
            ishotspot.append('')
            subsitution.append(AA + ' truncating')
            continue
        elif pandas.isnull(AA):
            ishotspot.append('')
            subsitution.append('')
            print('not sure what variant')
            import pdb
            pdb.set_trace()
            continue

        # get the amino acid position and letter change

        AA_pos = int(re.findall('[0-9]+', AA)[0])
        AA = AA.replace('p.', '')
        AA = AA.replace(str(AA_pos), '')
        AA = seq1(AA)

        # determine if there was an alignment

        alignment = alignments[alignment_maps[transcript]]

        if alignment is None:
            ishotspot.append('')
            # no alignment available, so fall back to the mouse residue position
            subsitution.append(AA[0] + str(AA_pos) + AA[1])
            continue

        # if there was an alignment, get the amino acid position in the human gene

        mouse_pos = 0
        mouse_index = 0
        mouse_aa = ''
        for aa in alignment[1]:
            mouse_index += 1
            if aa == 'X':
                continue
            if aa != '-':
                mouse_pos += 1
            if mouse_pos == AA_pos:
                mouse_aa = alignment[1][mouse_index - 1]
                break

        if mouse_aa != AA[0]:
            print('boo')
            import pdb
            pdb.set_trace()

        human_pos = 0
        human_index = 0
        human_aa = ''
        for aa in alignment[0]:
            human_index += 1
            if aa == 'X':
                continue
            if aa != '-':
                human_pos += 1
            if human_index == mouse_index:
                human_aa = alignment[0][human_index - 1]
                break

        substitution.append(AA[0] + str(human_pos) + AA[1])

        # determine if it is in a hotspot

        hotspots_tmp = hotspots[hotspots['Gene'] == human_gene]
        if hotspots_tmp.shape[0] > 0:
            positions = [int(re.findall('[0-9]+', x)[0])
                         for x in hotspots_tmp['Residue'].tolist()]
            positions = [human_pos == i for i in positions]
            if sum(positions) > 0:
                hotspots_tmp = hotspots_tmp[positions]['Residue'].tolist()[0]
                ishotspot.append(human_gene + ' (' + hotspots_tmp + ')')
            else:
                ishotspot.append('')
        else:
            ishotspot.append('')

    mutations_per_sample = {}

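    # sample columns start at column index 10 of the mutations table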
    samples = mutations.columns.tolist()[10:]
    for sample in samples:
        mutations_per_sample[sample] = {}
        #mutations_per_sample[sample]['genes'] = []
        mutations_per_sample[sample]['truncating'] = []
        mutations_per_sample[sample]['premature'] = []
        mutations_per_sample[sample]['splice'] = []
        mutations_per_sample[sample]['missense'] = []
        mutations_per_sample[sample]['hotspot'] = []
    for i in range(mutations.shape[0]):
        for ss in samples:
            if mutations.loc[i, ss] != '-':
                #mutations_per_sample[ss]['genes'].append(human_genes[i])
                if ishotspot[i] != '':
                    mutations_per_sample[ss]['hotspot'].append(ishotspot[i])
                if 'truncating' in substitution[i]:
                    mutations_per_sample[ss]['truncating'].append(
                        human_genes[i] + ' (' + substitution[i] + ')')
                elif 'splice' in substitution[i]:
                    mutations_per_sample[ss]['splice'].append(
                        human_genes[i] + ' (' + substitution[i] + ')')
                elif ('5_prime_UTR_premature_start_codon_gain_variant'
                      in substitution[i]):
                    mutations_per_sample[ss]['premature'].append(
                        human_genes[i] + ' (' + substitution[i] + ')')
                elif substitution[i] != '':
                    mutations_per_sample[ss]['missense'].append(
                        human_genes[i] + '-' + substitution[i])

    return mutations_per_sample
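
The two index-walking loops above implement a general pattern: projecting a residue position through a gapped pairwise alignment. A minimal standalone sketch of the same idea (the function name and the gap/mask characters are assumptions for illustration, not part of the code above):

def map_aligned_position(src_aln, dst_aln, src_pos, gap='-', mask='X'):
    """Map a 1-based residue position in the source sequence to the
    corresponding 1-based position in the destination sequence, given
    two equal-length gapped alignment strings. Returns None when the
    position falls on a gap in the destination."""
    src_count = 0
    dst_count = 0
    for src_aa, dst_aa in zip(src_aln, dst_aln):
        # count real residues on each side; gaps and masked columns don't count
        if dst_aa not in (gap, mask):
            dst_count += 1
        if src_aa not in (gap, mask):
            src_count += 1
            if src_count == src_pos:
                return dst_count if dst_aa != gap else None
    return None

# e.g. mouse residue 3 of 'ME-TK' maps to human residue 4 of 'MEQTK':
# map_aligned_position('ME-TK', 'MEQTK', 3) -> 4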
Code example #19
0
from os.path import join, expanduser, curdir, abspath
import unittest
import agfusion
from agfusion import utils
import pyensembl
from Bio import SeqIO

data = pyensembl.EnsemblRelease(84, 'mouse')
db = agfusion.AGFusionDB(abspath(join(curdir, 'agfusion.mus_musculus.84.db')))
db.build = 'mus_musculus_84'


class TestSequencePrediction(unittest.TestCase):
    def test_1(self):
        """
        test CDS and cDNA correct for junction that is on exon boundaries and
        produces an in-frame protein.
        """

        # test the dna and protein coding sequences are correct by comparing
        # with manually generated sequences

        fusion = agfusion.Fusion(gene5prime="ENSMUSG00000022770",
                                 gene5primejunction=31684294,
                                 gene3prime="ENSMUSG00000002413",
                                 gene3primejunction=39648486,
                                 db=db,
                                 pyensembl_data=data,
                                 protein_databases=['pfam', 'tmhmm'],
                                 noncanonical=True)
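
A natural follow-up for this test is to compare the predicted sequences against manually built references, as the docstring describes. A minimal sketch using the Bio.SeqIO import above (the FASTA filenames, and the step of writing the predicted sequences to test_cdna.fa first, are assumptions for illustration, not verified agfusion behavior):

        # hypothetical files: test_cdna.fa holds the predicted sequences,
        # expected_cdna.fa the manually constructed references
        predicted = {r.id: str(r.seq)
                     for r in SeqIO.parse('test_cdna.fa', 'fasta')}
        expected = {r.id: str(r.seq)
                    for r in SeqIO.parse('expected_cdna.fa', 'fasta')}
        self.assertEqual(predicted, expected)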
Code example #20
0
        self._draw_main_body()

        self.ax.axis('off')
        self.ax.set_xlim(0, 1)
        self.ax.set_ylim(0, 1)


peptides = [[60, 24], [84, 9], [103, 11], [119, 11], [171, 9], [298, 13],
            [445, 20], [528, 8], [620, 11], [631, 9], [649, 11], [660, 8],
            [668, 10], [687, 10], [740, 16], [770, 8], [2330, 8], [2476, 13]]

agfusion_db = AGFusionDB(
    '/Users/charlesmurphy/Desktop/Research/0914_hui/results/Fusions/plots/agfusion.mus_musculus.84.db',
    debug=False)
agfusion_db.build = 'mus_musculus_84'
pyensembl_data = pyensembl.EnsemblRelease(84, 'mus_musculus')

fusion = Fusion(gene5prime=['ENSMUSG00000030849'],
                gene5primejunction=130167703,
                gene3prime=['ENSMUSG00000055322'],
                gene3primejunction=74016186,
                db=agfusion_db,
                pyensembl_data=pyensembl_data,
                protein_databases=['pfam', 'tmhmm'],
                noncanonical=False)

pplot = PlotFusionProtein(
    filename='FGFR2-TNS1.png',
    width=10,
    height=3,
    dpi=90,
Code example #21
0
def get_genomes(cfg):
    """
    Installs genomes
    
    :param cfg: configuration dict
    """

    runbashcmd(
        f"pyensembl install --reference-name {cfg['genomeassembly']} --release {cfg['genomerelease']} --species {cfg['host']}"
    )

    import pyensembl
    ensembl = pyensembl.EnsemblRelease(
        species=pyensembl.species.Species.register(latin_name=cfg['host'],
                                                   synonyms=[cfg['host']],
                                                   reference_assemblies={
                                                       cfg['genomeassembly']:
                                                       (cfg['genomerelease'],
                                                        cfg['genomerelease']),
                                                   }),
        release=cfg['genomerelease'])
    contig_mito = ['MTDNA', 'MITO', 'MT']
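    # keep only primary chromosomes: drop scaffolds/patches (names containing
    # '.' or longer than 4 characters) and mitochondrial contigs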
    contigs = [
        c for c in ensembl.contigs()
        if ('.' not in c) and (len(c) < 5) and (c not in contig_mito)
    ]
    if len(contigs) == 0:
        logging.error('no contigs identified by pyensembl; aborting')
        sys.exit(1)
    logging.info(f"{len(contigs)} contigs/chromosomes in the genome")
    logging.info(contigs)
    # raw genome next
    if 'human' in cfg['host'].lower():
        cfg['host'] = 'homo_sapiens'
    if 'yeast' in cfg['host'].lower():
        cfg['host'] = 'saccharomyces_cerevisiae'
    host_ = "_".join(s for s in cfg['host'].split('_')).capitalize()
    ensembl_fastad = 'pub/release-{}/fasta/{}/dna/'.format(
        cfg['genomerelease'], cfg['host'])
    genome_fastad = '{}/{}'.format(dirname(realpath(__file__)), ensembl_fastad)
    cfg['genomep'] = '{}/genome.fa'.format(genome_fastad)
    if not exists(cfg['genomep']):
        logging.error('not found: {}'.format(cfg['genomep']))
        if '/test_beditor/' not in cfg['cfgp']:
            ifdlref = input(
                "Download genome at {}?[Y/n]: ".format(genome_fastad))
        else:
            ifdlref = 'Y'
        if ifdlref == 'Y':
            # #FIXME download contigs and cat and get index, sizes
            for contig in contigs:
                if 'GRCh37' in cfg['genomeassembly']:
                    #Homo_sapiens.GRCh37.75.dna_sm.chromosome.1.fa.gz
                    fn = f"{cfg['host'].capitalize()}.{cfg['genomeassembly']}.{cfg['genomerelease']}.dna_sm.chromosome.{contig}.fa.gz"
                else:
                    fn = f"{cfg['host'].capitalize()}.{cfg['genomeassembly']}.dna_sm.chromosome.{contig}.fa.gz"
                fp = '{}/{}'.format(ensembl_fastad, fn)
                if not exists(fp):
                    cmd = 'wget -q -x -nH ftp://ftp.ensembl.org/{} -P {}'.format(
                        fp, dirname(realpath(__file__)))
                    runbashcmd(cmd, test=cfg['test'])
            # make the fa ready: concatenate per-chromosome files into genome.fa
            if not exists(cfg['genomep']):
                cmd = 'gunzip {}*.fa.gz;cat {}/*.fa > {}/genome.fa;'.format(
                    genome_fastad, genome_fastad, genome_fastad)
                runbashcmd(cmd, test=cfg['test'])
        else:
            logging.error('abort')
            sys.exit(1)
    if not exists(cfg['genomep'] + '.bwt'):
        cmd = '{} index {}'.format(cfg['bwa'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('bwa index is present')
    if not exists(cfg['genomep'] + '.fai'):
        cmd = '{} faidx {}'.format(cfg['samtools'], cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('samtools index is present')
    if not exists(cfg['genomep'] + '.sizes'):
        cmd = 'cut -f1,2 {}.fai > {}.sizes'.format(cfg['genomep'],
                                                   cfg['genomep'])
        runbashcmd(cmd, test=cfg['test'])
    else:
        logging.info('sizes of contigs are present')

    ensembl_gff3d = 'pub/release-{}/gff3/{}/'.format(cfg['genomerelease'],
                                                     cfg['host'])
    genome_gff3d = '{}/{}'.format(dirname(realpath(__file__)), ensembl_gff3d)
    cfg['genomegffp'] = '{}/genome.gff3'.format(genome_gff3d)
    if not exists(cfg['genomegffp']):
        logging.error('not found: {}'.format(cfg['genomegffp']))
        if '/test_beditor/' not in cfg['cfgp']:
            ifdlref = input("Download genome annotations at {}?[Y/n]: ".format(
                genome_gff3d))
        else:
            ifdlref = 'Y'
        if ifdlref == 'Y':
            # #FIXME download contigs and cat and get index, sizes
            fn = '{}.{}.{}.gff3.gz'.format(cfg['host'].capitalize(),
                                           cfg['genomeassembly'],
                                           cfg['genomerelease'])
            fp = '{}/{}'.format(ensembl_gff3d, fn)
            if not exists(fp):
                cmd = 'wget -x -nH ftp://ftp.ensembl.org/{} -P {}'.format(
                    fp, dirname(realpath(__file__)))
                runbashcmd(cmd, test=cfg['test'])
                # move to genome.gff3
                cmd = 'cp {}/{} {}'.format(genome_gff3d, fn, cfg['genomegffp'])
                runbashcmd(cmd, test=cfg['test'])

        else:
            logging.error('abort')
            sys.exit(1)
    logging.info('genomes are installed!')
    return cfg
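
For reference, a minimal configuration dict covering the keys this function reads might look like the sketch below (all values are illustrative; runbashcmd and the bwa/samtools executables must be available in the calling environment):

cfg = {
    'host': 'homo_sapiens',          # latin species name used in Ensembl paths
    'genomeassembly': 'GRCh38',
    'genomerelease': 92,             # illustrative Ensembl release
    'cfgp': '/path/to/project.yml',  # config path, checked for test runs
    'test': False,                   # forwarded to runbashcmd
    'bwa': 'bwa',
    'samtools': 'samtools',
}
cfg = get_genomes(cfg)
print(cfg['genomep'], cfg['genomegffp'])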