Exemple #1
0
def get_go_terms_descendants(biomart_fpath, gene2go_fpath, gene_count_fpath, obo_fpath, ev_codes=None):
    """Build one ``GOterm`` (with descendants and Ensembl genes) per listed GO id.

    The GO ids are the first tab-separated field of every line of
    ``gene_count_fpath`` after the first two lines.  For each id and each of
    its descendant ids, the associated human Entrez gene ids are translated
    to Ensembl ids and added to ``term.genes``; Entrez ids without an
    Ensembl mapping are skipped.

    :param biomart_fpath: passed to ``map_entrez_to_ensembl``; the resulting
        mapping is looked up with the Entrez id converted to ``str``.
    :param gene2go_fpath: gene2go file read with ``read_ncbi_gene2go``.
    :param gene_count_fpath: tab-separated file whose first column is a GO id.
    :param obo_fpath: passed to ``GoSearch``.
    :param ev_codes: optional evidence codes; forwarded as ``evidence_set``
        only when truthy.
    :return: list of ``GOterm`` objects in file order.
    """
    entrez_to_ensembl = map_entrez_to_ensembl(biomart_fpath)

    # taxids=[9606] means select only human.
    read_kwargs = {'taxids': [9606], 'go2geneids': True}
    if ev_codes:
        read_kwargs['evidence_set'] = ev_codes
    go_to_entrez_ids_human = read_ncbi_gene2go(gene2go_fpath, **read_kwargs)
    print("{N} GO terms associated with human NCBI Entrez GeneIDs".format(N=len(go_to_entrez_ids_human)))
    srchhelp = GoSearch(obo_fpath, go2items=go_to_entrez_ids_human)

    # Get the GO terms; the first two lines of the file are not data rows.
    header_lines = 2
    GO_terms = []
    # The context manager guarantees the file is closed even if a lookup raises.
    with open(gene_count_fpath) as gene_cnt_file:
        for line_no, line in enumerate(gene_cnt_file):
            if line_no < header_lines:
                continue
            GO_id = line.split('\t')[0]
            term = GOterm(GO_id)
            term.add_descendants(srchhelp)

            for go_id in [GO_id] + term.descendants_ids:
                for ent_id in go_to_entrez_ids_human[go_id]:
                    entrez_key = str(ent_id)
                    if entrez_key in entrez_to_ensembl:
                        term.genes.add(entrez_to_ensembl[entrez_key])
            GO_terms.append(term)

    return GO_terms
Exemple #2
0
def fetch_go_hierarcy(go_folder, ev_exclude):
    """Make sure the GO files exist, then load human gene<->GO associations.

    :param go_folder: folder checked for (and receiving) the decompressed
        association file.
    :param ev_exclude: forwarded to ``read_ncbi_gene2go`` as ``ev_exclude``.
    :return: tuple ``(go2geneids, geneids2go)`` of the two
        ``read_ncbi_gene2go`` results (with and without ``go2geneids=True``).

    NOTE(review): the archive is downloaded into ``constants.GO_DIR`` but
    decompressed from/to ``go_folder``, and the associations are read from
    ``constants.GO_DIR``.  This only lines up when both folders are the
    same - confirm against callers.
    """
    obo_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        wget.download(constants.GO_OBO_URL, obo_path)

    print("Downloading gene-GO associations")
    association_file_location = os.path.join(
        constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)

    archive_name = os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL)
    extracted_path = os.path.join(go_folder,
                                  constants.GO_ASSOCIATION_FILE_NAME)
    gz_marker_path = os.path.join(go_folder,
                                  constants.GO_ASSOCIATION_FILE_NAME + ".gz")

    if not os.path.exists(extracted_path):
        if not os.path.exists(gz_marker_path):
            wget.download(constants.GO_ASSOCIATION_GENE2GEO_URL,
                          os.path.join(constants.GO_DIR, archive_name))
        # Decompress the downloaded archive next to where it is expected.
        with gzip.open(os.path.join(go_folder, archive_name), 'rb') as f_in, \
                open(extracted_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    print("Loading gene-GO associations")

    common_kwargs = {'taxids': [9606], 'ev_exclude': ev_exclude}
    go2geneids = read_ncbi_gene2go(association_file_location,
                                   go2geneids=True,
                                   **common_kwargs)
    geneids2go = read_ncbi_gene2go(association_file_location,
                                   **common_kwargs)

    return (go2geneids, geneids2go)
    def __init__(self, taxid, fin_gene2go, fin_gobasic):
        """Load gene2go associations for one taxid and the GO DAG.

        Both file names are resolved relative to ``REPO``.  The association
        file is first handed to ``dnld_ncbi_gene_file`` (presumably fetching
        it when missing - verify against that helper).
        """
        gene2go_path = os.path.join(REPO, fin_gene2go)
        dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
        self.gene2go = read_ncbi_gene2go(gene2go_path, [taxid])

        obo_path = os.path.join(REPO, fin_gobasic)
        self.godag = get_godag(obo_path, loading_bar=None)
    def __init__(self,
                 tax_id=9606,
                 logger=None,
                 force_update=False,
                 go_dir=DEFAULT_GO_DIR,
                 bg_genes=None):
        """Prepare the GO directory, then parse the OBO and association files.

        :param tax_id: taxonomy id passed to ``read_ncbi_gene2go`` as the
            only entry of ``taxids``.
        :param logger: logger to use; a console logger named after the class
            is created when falsy.
        :param force_update: forwarded to ``check_and_get_obo`` and
            ``check_and_get_gaf``.
        :param go_dir: base directory for GO files; created when missing.
        :param bg_genes: optional background genes; when not ``None`` they
            are passed to ``set_bg_genes``.
        """
        # gene_converter can be used to enable automatic gene conversion
        self.gene_converter = None
        self.logger = logger or log.get_console_logger(self.__class__.__name__)
        self.tax_id = tax_id
        if not os.path.isdir(go_dir):
            # Logger.warn is a deprecated alias; warning() is the supported name.
            self.logger.warning("Creating master GO directory at %s.", go_dir)
            os.makedirs(go_dir)
        else:
            self.logger.info("Using existing GO directory at %s.", go_dir)
        self.base_dir = go_dir

        # get filenames and parse both GAF and OBO
        self.obo_fn = self.check_and_get_obo(force_update=force_update)
        self.gaf_fn = self.check_and_get_gaf(force_update=force_update)
        self.obo = obo_parser.GODag(self.obo_fn)

        self.gaf = associations.read_ncbi_gene2go(self.gaf_fn,
                                                  taxids=[self.tax_id])
        self.logger.info("{N:,} annotated human genes".format(N=len(self.gaf)))

        self.bg_genes = bg_genes
        if self.bg_genes is not None:
            self.set_bg_genes(bg_genes)
Exemple #5
0
    def __init__(self, dir, params):
        """Load disease associations, the network, model weights and a GO study.

        Args:
            dir: experiment directory, forwarded to the parent constructor.
            params: experiment parameters, forwarded to the parent
                constructor; ``params["model_path"]`` is also read here.
        """
        super().__init__(dir, params)

        log_path = os.path.join(self.dir, 'experiment.log')
        set_logger(log_path, level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(
            self.params["associations_path"],
            self.params["disease_subset"],
            exclude_splits=['none'])

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        node_to_degree = dict(self.network.nx.degree())
        self.degrees = np.array(list(node_to_degree.values()))

        logging.info("Loading weights...")
        models_file = os.path.join(params["model_path"], "models", "models.tar")
        # NOTE(review): pickle.load must only be used on trusted model files.
        with open(models_file, "rb") as f:
            split_to_model = pickle.load(f)

        # Average the [0, 0] entry of 'ci_weight' over all splits.
        per_split_weights = [model['ci_weight'][0, 0].numpy()
                             for model in split_to_model.values()]
        self.ci_weights = np.mean(per_split_weights, axis=0)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading enrichment study...")
        geneid2go = read_ncbi_gene2go("data/go/gene2go.txt", taxids=[9606])
        obodag = GODag("data/go/go-basic.obo")
        population = self.network.get_names()
        self.go_study = GOEnrichmentStudy(population,
                                          geneid2go,
                                          obodag,
                                          propagate_counts=True,
                                          alpha=0.05,
                                          methods=['fdr_bh'])
Exemple #6
0
    def __init__(self, taxid, fin_gene2go, fin_gobasic):
        """Load gene2go associations for one taxid and the GO DAG.

        Both file names are resolved relative to ``REPO``.  The association
        file is first handed to ``dnld_ncbi_gene_file`` (presumably fetching
        it when missing - verify against that helper).
        """
        gene2go_path = os.path.join(REPO, fin_gene2go)
        dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
        self.gene2go = read_ncbi_gene2go(gene2go_path, [taxid])

        obo_path = os.path.join(REPO, fin_gobasic)
        self.godag = get_godag(obo_path, loading_bar=None)
Exemple #7
0
def check_group_enrichment(tested_gene_file_name, total_gene_file_name):
    """Run a GO enrichment study for a tested gene list and export the hits.

    The GO OBO file and the association file are fetched into
    ``constants.GO_DIR`` when missing.  Results with ``p_fdr_bh <= 0.05`` are
    collected into a single row and handed to ``print_to_excel``.

    :param tested_gene_file_name: gene list used as the study set.
    :param total_gene_file_name: gene list used as the population.
    """
    total_gene_list = load_gene_list(total_gene_file_name)
    tested_gene = load_gene_list(tested_gene_file_name)

    obo_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, constants.GO_DIR)
    obo_dag = GODag(obo_path)

    assoc_path = os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        archive_path = os.path.join(
            constants.GO_DIR, os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        # Decompress the downloaded archive to the expected association file.
        with gzip.open(archive_path, 'rb') as f_in, open(assoc_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    population_ids = [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)]
    g = GOEnrichmentStudy(population_ids, assoc, obo_dag, methods=["bonferroni", "fdr_bh"])
    study_ids = [int(cur) for cur in ensembl2entrez_convertor(tested_gene)]
    g_res = g.run_study(study_ids)

    significant = [
        (cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected, cur.p_fdr_bh)
        for cur in g_res
        if cur.p_fdr_bh <= 0.05
    ]
    if significant:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(*significant)
    else:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = [], [], [], [], []

    # One output row; every cell holds its values joined by CRLF.
    row = (
        "\r\n".join(e2g_convertor(tested_gene)),
        "\r\n".join(go_ns),
        "\r\n".join(go_terms),
        "\r\n".join(go_names),
        "\r\n".join(map(str, uncorrectd_pvals)),
        "\r\n".join(map(str, FDRs)),
    )
    print_to_excel([row], tested_gene_file_name, total_gene_file_name)
Exemple #8
0
def _get_id2gos(file_assc, taxids, log):
    """Read gene2go associations and return the per-taxid structure.

    Note that the value returned is ``taxid2asscs`` (filled in by
    ``read_ncbi_gene2go``), not the id-to-GO mapping; the latter is only used
    for the count written to ``log``.
    """
    assoc_path = os.path.join(REPO, file_assc)
    dnld_ncbi_gene_file(assoc_path, loading_bar=None)

    # taxid -> (two further defaultdict levels) -> set
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    id2gos = read_ncbi_gene2go(assoc_path, taxids, taxid2asscs=taxid2asscs)

    num_items = len(id2gos)
    log.write("  {N} items found in gene2go from NCBI's ftp server\n".format(N=num_items))
    return taxid2asscs
Exemple #9
0
def fetch_go_hierarcy():
    """Fetch the GO files when missing and load human gene<->GO associations.

    Both files live in ``constants.GO_DIR``.  Messages are printed with the
    ``print(...)`` call form, which parses on Python 2 and Python 3 and
    produces the same output for a single string argument.

    :return: tuple ``(go2geneids, geneids2go)`` of the two
        ``read_ncbi_gene2go`` results for taxid 9606 (with and without
        ``go2geneids=True``).
    """
    obo_file_location = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_file_location):
        wget.download(constants.GO_OBO_URL, obo_file_location)

    print("Downloading gene-GO associations")
    association_file_location = os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(association_file_location):
        wget.download(constants.GO_ASSOCIATION_GENE2GEO_URL, association_file_location)

    print("Loading gene-GO associations")
    go2geneids = read_ncbi_gene2go(association_file_location, taxids=[9606], go2geneids=True)
    geneids2go = read_ncbi_gene2go(association_file_location, taxids=[9606])

    return (go2geneids, geneids2go)
def test_anno_read():
    """Check Gene2GoReader loading and that the new reader matches the old one."""
    fin_anno = os.path.join(REPO, 'gene2go')
    _dnld_anno(fin_anno)

    print('\nTEST STORING ONLY ONE SPECIES')
    one_species = Gene2GoReader(fin_anno)
    assert len(one_species.taxid2asscs) == 1
    one_species.prt_summary_anno2ev()

    print('\nTEST STORING ALL SPECIES')
    all_species = Gene2GoReader(fin_anno, taxids=True)
    assert len(all_species.taxid2asscs) > 1, '**EXPECTED MORE: len(taxid2asscs) == {N}'.format(
        N=len(all_species.taxid2asscs))
    all_species.prt_summary_anno2ev()

    # The new reader must agree with the old one however the taxid is spelled.
    print('\nTEST GETTING ASSOCIATIONS FOR ONE SPECIES')
    print("\nTEST read_ncbi_gene2go_old: [9606]")
    expected_hsa = read_ncbi_gene2go_old(fin_anno, [9606])
    assert expected_hsa == read_ncbi_gene2go(fin_anno, [9606])
    print("\nTEST read_ncbi_gene2go_old: 9606")
    assert expected_hsa == read_ncbi_gene2go(fin_anno, 9606)
    print("\nTEST read_ncbi_gene2go_old: None")
    assert expected_hsa == read_ncbi_gene2go(fin_anno, None)

    print('\nTEST GETTING REVERSE ASSOCIATIONS: GO2GENES')
    print("\nTEST read_ncbi_gene2go_old: 9606 go2geneids=True")
    expected_go2genes = read_ncbi_gene2go_old(fin_anno, [9606], go2geneids=True)
    actual_go2genes = read_ncbi_gene2go(fin_anno, 9606, go2geneids=True)
    print('OLD:', next(iter(expected_go2genes.items())))
    print('NEW:', next(iter(actual_go2genes.items())))
    assert expected_go2genes == actual_go2genes

    print('\nTEST RETURNING ASSOCIATIONS FOR SELECTED EVIDENCE CODES')
    evcodes = {'ISO', 'IKR'}
    print("\nTEST read_ncbi_gene2go_old: 9606 evcodes=True")
    expected_evc = read_ncbi_gene2go_old(fin_anno, [9606], evidence_set=evcodes)
    actual_evc = read_ncbi_gene2go(fin_anno, 9606, evidence_set=evcodes)
    print('OLD:', next(iter(expected_evc.items())))
    print('NEW:', next(iter(actual_evc.items())))
    assert expected_evc == actual_evc
Exemple #11
0
def fetch_go_hierarcy():
    """Load GO and human associations, and dump gene/child/relationship links.

    The OBO and association files are fetched into ``constants.GO_DIR`` when
    missing.  A tab-separated file named
    ``go_output_<CANCER_TYPE>_<timestamp>.txt`` is written into
    ``constants.OUTPUT_GLOBAL_DIR`` with one three-column line per link:

    * ``genes <gene> <GO id>``
    * ``is a <child GO id> <GO id>``
    * ``<relationship type> <related GO id> <GO id>``

    GO ids present in the associations but absent from the OBO file are
    reported on stdout and skipped.

    :return: tuple ``(genes, isa, relship)`` of lists holding
        ``(gene, go_id)``, ``(child_id, go_id)`` and
        ``(rtype, related_id, go_id)`` tuples respectively.
    """
    obo_file_location = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_file_location):
        wget.download(constants.GO_OBO_URL, obo_file_location)

    # 'relationship' is requested because entry.relationship_rev is read below.
    go = obo_parser.GODag(obo_file_location,
                          optional_attrs=['relationship'])

    print("Downloading gene-GO associations")
    association_file_location = os.path.join(
        constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(association_file_location):
        wget.download(constants.GO_ASSOCIATION_GENE2GEO_URL,
                      association_file_location)

    print("Loading gene-GO associations")
    go2geneids_human = read_ncbi_gene2go(association_file_location,
                                         taxids=[9606],
                                         go2geneids=True)

    print("Writing out GO child-parent links")
    if not os.path.exists(constants.OUTPUT_GLOBAL_DIR):
        os.makedirs(constants.OUTPUT_GLOBAL_DIR)

    out_fname = "go_output_{}_{}.txt".format(constants.CANCER_TYPE,
                                             time.time())
    genes = []
    isa = []
    relship = []
    with open(os.path.join(constants.OUTPUT_GLOBAL_DIR, out_fname), 'w') as o:
        for goid in go2geneids_human:
            # `in` replaces the Python 2-only dict.has_key().
            if goid not in go:
                print("GO obo file does not contain {}".format(goid))
                continue
            entry = go[goid]
            for gene in go2geneids_human[entry.id]:
                genes.append((str(gene), entry.id))
                o.write("{}\t{}\t{}\n".format("genes", *genes[-1]))
            for c in entry.children:
                isa.append((c.id, entry.id))
                o.write("{}\t{}\t{}\n".format("is a", *isa[-1]))
            for rtype, rs in entry.relationship_rev.items():
                for r in rs:
                    relship.append((rtype, r.id, entry.id))
                    # The tuple already starts with rtype; passing rtype
                    # again would shift the columns and drop entry.id.
                    o.write("{}\t{}\t{}\n".format(*relship[-1]))

    return (genes, isa, relship)
Exemple #12
0
    def load_ontologies_and_associations(self):
        """Fetch (if needed) and load the GO DAG and human gene associations.

        The association file is read from the path returned by
        ``download_ncbi_associations()`` rather than from a hard-coded name,
        so the two cannot drift apart.

        :return: tuple ``(obodag, geneid2gos_human)``.
        """
        print("---LOADING ONTOLOGIES AND ASSOCIATIONS---")
        # Check if files exist and download if not
        obo_fname = download_go_basic_obo()
        gene2go = download_ncbi_associations()

        # Load ontologies and associations
        obodag = GODag(obo_fname)
        geneid2gos_human = read_ncbi_gene2go(gene2go, taxids=[9606])
        print("{N:,} annotated human genes".format(N=len(geneid2gos_human)))

        return obodag, geneid2gos_human
Exemple #13
0
def test_all():
    """Run numerous tests for various reports."""
    go_dir = constants.GO_DIR

    # NOTE(review): these two objects are loaded but not used below.
    obo_dag = GODag(os.path.join(go_dir, constants.GO_FILE_NAME))
    assoc = read_ncbi_gene2go(
        os.path.join(go_dir, constants.GO_ASSOCIATION_FILE_NAME), no_top=True)

    # Write the full hierarchy of the bundled mini ontology to stdout.
    this_dir = os.path.dirname(os.path.abspath(__file__))
    godag = GODag(os.path.join(this_dir, "data/mini_obo.obo"))
    gosubdag = GoSubDag(godag.keys(), godag)
    write_hier_all(gosubdag, sys.stdout)
def get_ensembl_ids(go_process_id, biomart_fpath):
    """Return the Ensembl ids of human genes annotated with one GO term.

    :param go_process_id: GO id to look up in the GO -> Entrez mapping.
    :param biomart_fpath: passed to ``map_entrez_to_ensembl``; the resulting
        mapping is looked up with the Entrez id converted to ``str``.
    :return: list of Ensembl ids; Entrez ids without an Ensembl mapping are
        skipped, as in the other lookups of this kind in the file.
    """
    entrez_to_ensembl = map_entrez_to_ensembl(biomart_fpath)

    gene2go = download_ncbi_associations()
    # taxids=[9606] means select only human.
    # TODO: ask Marinka if we should use EXP code for evidence
    #       (i.e. pass evidence_set='EXP' below).
    go_to_entrez_ids_human = read_ncbi_gene2go(gene2go, taxids=[9606], go2geneids=True)

    # Use the caller-supplied id, not a module-level constant.
    entrez_ids = go_to_entrez_ids_human[go_process_id]
    ensembl_ids = []
    for ent_id in entrez_ids:
        entrez_key = str(ent_id)
        if entrez_key in entrez_to_ensembl:
            ensembl_ids.append(entrez_to_ensembl[entrez_key])

    print("{N} GO terms associated with human NCBI Entrez GeneIDs".format(N=len(go_to_entrez_ids_human)))
    return ensembl_ids
Exemple #15
0
def main(args):
    """Compute one similarity value per path and save them as text.

    ``args`` must provide ``paths_file``, ``obo_file``, ``gene2go_file``,
    ``namespace`` (``None``, ``'cc'``, ``'mf'`` or ``'bp'``), ``n_cores`` and
    ``out_sims_file``.

    Raises:
        ValueError: if ``args.namespace`` is set to anything else.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers)
    logging.info(args)

    paths = utils.read_paths(args.paths_file)

    go = obo_parser.GODag(args.obo_file)
    gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606])
    # Term counts are computed on the full DAG, before any namespace filter.
    termcounts = TermCounts(go, gene2go)

    namespace_by_code = {
        'cc': 'cellular_component',
        'mf': 'molecular_function',
        'bp': 'biological_process',
    }
    if args.namespace is not None:
        wanted_namespace = namespace_by_code.get(args.namespace)
        if wanted_namespace is None:
            raise ValueError('namespace can be only cc, mf or bp')
        go = {
            go_term: values
            for go_term, values in go.items()
            if values.namespace == wanted_namespace
        }

    wrapped = [[path, go, gene2go, termcounts] for path in paths]
    if args.n_cores > 1:
        sims = list(p_map(wrap, wrapped))
    else:
        sims = list(map(wrap, tqdm(wrapped)))

    utils.create_dir_if_not_exist(dirname(args.out_sims_file))
    np.savetxt(args.out_sims_file, sims)
Exemple #16
0
    def __init__(self, dir, params):
        """Set up logging and load everything the experiment needs.

        Args:
            dir (string): directory of the experiment to be run.
            params: experiment parameters, forwarded to the parent
                constructor and then read through ``self.params``.
        """
        super().__init__(dir, params)

        # Set the logger
        log_path = os.path.join(self.dir, 'experiment.log')
        set_logger(log_path, level=logging.INFO, console=True)

        # Log title
        for banner_line in ("Disease Protein Prediction",
                            "Sabri Eyuboglu  -- SNAP Group",
                            "======================================"):
            logging.info(banner_line)

        logging.info("Loading Disease Associations...")
        self.diseases_dict = load_diseases(
            self.params["associations_path"],
            self.params["disease_subset"],
            exclude_splits=['none'])

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading enrichment study...")
        obodag = GODag(self.params["go_path"])
        geneid2go = read_ncbi_gene2go(self.params["gene_to_go_path"], taxids=[9606])
        population = self.network.get_names()
        self.enrichment_study = GOEnrichmentStudy(
            population,
            geneid2go,
            obodag,
            log=None,
            **self.params["enrichment_params"])

        logging.info("Loading predictions...")
        # One predictions.csv per method, read from that method's directory.
        method_to_preds = {}
        for name, preds in self.params["method_to_preds"].items():
            csv_path = os.path.join(preds, "predictions.csv")
            method_to_preds[name] = pd.read_csv(csv_path, index_col=0)
        self.method_to_preds = method_to_preds

        outputs_path = os.path.join(self.dir, "outputs.pkl")
        if not os.path.exists(outputs_path):
            self.outputs = {}
        else:
            logging.info("Loading outputs...")
            # NOTE(review): pickle.load must only be used on trusted files.
            with open(outputs_path, 'rb') as f:
                self.outputs = pickle.load(f)
Exemple #17
0
def get_ensembl_ids(go_process_id, biomart_fpath, ev_codes=None):
    """Return the Ensembl ids of human genes annotated with one GO term.

    :param go_process_id: GO id to look up in the GO -> Entrez mapping.
    :param biomart_fpath: passed to ``map_entrez_to_ensembl``; the resulting
        mapping is looked up with the Entrez id converted to ``str``.
    :param ev_codes: optional evidence codes; forwarded to
        ``read_ncbi_gene2go`` as ``evidence_set`` only when truthy
        (previously accepted but ignored).
    :return: list of Ensembl ids; Entrez ids without an Ensembl mapping are
        skipped.
    """
    entrez_to_ensembl = map_entrez_to_ensembl(biomart_fpath)

    gene2go = 'data/gene2go.txt' # If file doesn't exist, then replace this line with gene2go = download_ncbi_associations()

    # taxids=[9606] means select only human.
    read_kwargs = {'taxids': [9606], 'go2geneids': True}
    if ev_codes:
        read_kwargs['evidence_set'] = ev_codes
    go_to_entrez_ids_human = read_ncbi_gene2go(gene2go, **read_kwargs)
    print("{N} GO terms associated with human NCBI Entrez GeneIDs".format(N=len(go_to_entrez_ids_human)))

    entrez_ids = go_to_entrez_ids_human[go_process_id]
    # Single pre-formatted string: same text as the Python 2 print statement
    # (note the doubled spaces) and valid on Python 3 as well.
    print('# of Entrez IDs associated with  {}  =  {}'.format(go_process_id, len(entrez_ids)))
    ensembl_ids = []
    for ent_id in entrez_ids:
        entrez_key = str(ent_id)
        if entrez_key in entrez_to_ensembl:
            ensembl_ids.append(entrez_to_ensembl[entrez_key])

    print('# of Ensembl IDs associated with  {}  =  {}'.format(go_process_id, len(ensembl_ids)))
    return ensembl_ids
def test_i96():
    """Test to re-produce issue#96: Passes currently."""
    # The reported failure was: ValueError("All values in table must be nonnegative.
    print('CWD', os.getcwd())

    # Study and population gene ids
    study_ids = _get_geneids()
    population_ids = GENEID2NT.keys()

    # Associations and ontology
    print(os.getcwd())
    gene2go_path = os.path.join(REPO, 'gene2go')
    dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
    gene2go = read_ncbi_gene2go(gene2go_path, [9606])

    obo_path = os.path.join(REPO, "go-basic.obo")
    godag = get_godag(obo_path, loading_bar=None)

    # Run GOEA Gene Ontology Enrichment Analysis; it only has to complete.
    goeaobj = GOEnrichmentStudy(population_ids, gene2go, godag, methods=['fdr_bh'])
    goeaobj.run_study(study_ids)
Exemple #19
0
def get_go_terms(biomart_fpath, gene2go_fpath, gene_count_fpath, top=1):
    """Return the first ``top`` GO ids of a count file with their Ensembl genes.

    :param biomart_fpath: passed to ``map_entrez_to_ensembl``; the resulting
        mapping is looked up with the Entrez id converted to ``str``.
    :param gene2go_fpath: gene2go file read with ``read_ncbi_gene2go``.
    :param gene_count_fpath: Path to file containing number of genes for each
        GO term contained in the supplementary file.  The first line is
        skipped as a header; the GO id is the first tab-separated field.
    :param top: number of data lines to read after the header.
    :return: list of ``(GO_id, ensembl_ids)`` tuples in file order; Entrez
        ids without an Ensembl mapping are skipped.
    """
    entrez_to_ensembl = map_entrez_to_ensembl(biomart_fpath)

    # taxids=[9606] means select only human.
    go_to_entrez_ids_human = read_ncbi_gene2go(gene2go_fpath, taxids=[9606], go2geneids=True)
    print("{N} GO terms associated with human NCBI Entrez GeneIDs".format(N=len(go_to_entrez_ids_human)))

    # Get the |top| GO terms with the most gene annotations
    header_lines = 1
    top_GO_ids = []
    # The context manager guarantees the file is closed even if a lookup raises.
    with open(gene_count_fpath) as gene_cnt_file:
        for line_no, line in enumerate(gene_cnt_file):
            if line_no < header_lines:
                continue
            # Data lines are numbered 1..top because the header is line 0.
            if line_no > top:
                break
            GO_id = line.split('\t')[0]
            entrez_ids = go_to_entrez_ids_human[GO_id]
            ensembl_ids = [
                entrez_to_ensembl[str(ent_id)]
                for ent_id in entrez_ids
                if str(ent_id) in entrez_to_ensembl
            ]
            top_GO_ids.append((GO_id, ensembl_ids))

    return top_GO_ids
Exemple #20
0
def test_i96():
    """Test to re-produce issue#96: Passes currently."""
    # The reported failure was: ValueError("All values in table must be nonnegative.
    print('CWD', os.getcwd())

    # Study and population gene ids
    study_ids = _get_geneids()
    population_ids = GENEID2NT.keys()

    # Associations and ontology
    print(os.getcwd())
    gene2go_path = os.path.join(REPO, 'gene2go')
    dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
    gene2go = read_ncbi_gene2go(gene2go_path, [9606])

    obo_path = os.path.join(REPO, "go-basic.obo")
    godag = get_godag(obo_path, loading_bar=None)

    # Run GOEA Gene Ontology Enrichment Analysis; it only has to complete.
    goeaobj = GOEnrichmentStudy(population_ids, gene2go, godag, methods=['fdr_bh'])
    goeaobj.run_study(study_ids)
Exemple #21
0
def load_go_annotations(proteins,
                        levels=None,
                        obodag_path="data/go/go-basic.obo",
                        entrez_to_go_path="data/go/gene2go.txt"):
    """
    Map GO term names to the proteins annotated with them.

    For every protein, each of its GO ids is expanded with
    ``get_all_parents()``; the names of those parent terms are the keys of
    the result.

    args:
    @proteins    (iterable)   proteins to get annotations for
    @levels  (list(int)) ontology levels to keep; None keeps every level
    @obodag_path     (str)   path to the obo file
    @entrez_to_go_path (str) path to the mapping from entrez ids to GO ids

    return:
    @term_to_proteins (defaultdict(set)) map from term name to proteins
    """
    obodag = GODag(obodag_path)
    entrez_to_go = read_ncbi_gene2go(entrez_to_go_path, taxids=[9606])

    term_to_proteins = defaultdict(set)
    for protein in proteins:
        for go_id in entrez_to_go[protein]:
            for ancestor_id in obodag[go_id].get_all_parents():
                ancestor = obodag[ancestor_id]
                if levels is not None and ancestor.level not in levels:
                    continue
                term_to_proteins[ancestor.name].add(protein)

    return term_to_proteins
                get_sps, destnodes_sample, n_jobs=args.N_cores), [])

    logging.info('Num of all paths: {}'.format(len(all_paths)))

    fc_paths = []
    for i in trange(len(all_paths)):
        fullpath = all_paths[i]
        if len(fullpath) > 2:
            path = all_paths[i][1:-1]
            if np.all([node in fcnodes for node in path]):
                fc_paths.append(fullpath)

    logging.info('Num of FC paths: {}'.format(len(fc_paths)))

    go = obo_parser.GODag(args.obo_file)
    gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606])
    termcounts = TermCounts(go, gene2go)

    def get_sim(genes_pair):
        # sim_measure = lin_sim
        i, j = genes_pair[0], genes_pair[1]
        i_go = [goterm for goterm in gene2go[i] if goterm in go]
        j_go = [goterm for goterm in gene2go[j] if goterm in go]
        sims = []
        for i_go_term in i_go:

            def wrap(j_go_term):
                return resnik_sim(i_go_term, j_go_term, go, termcounts)

            simlist = [sim for sim in map(wrap, j_go) if sim is not None]
            if len(simlist):
Exemple #23
0
def find_clusters_and_gene_enrichment(tested_gene_list_file_name,
                                      total_gene_list_file_name,
                                      gene_expression_file_name,
                                      phenotype_file_name,
                                      gene_filter_file_name=None,
                                      tested_gene_list_path=None,
                                      total_gene_list_path=None,
                                      gene_expression_path=None,
                                      phenotype_path=None,
                                      gene_filter_file_path=None,
                                      var_th_index=None,
                                      start_k=2,
                                      end_k=6,
                                      calc_go=True,
                                      enrichment_list_file_names=None,
                                      meta_groups=None,
                                      filter_expression=None,
                                      cluster_algorithm=None):
    """Cluster the highest-variance tested genes and report enrichment per cluster.

    The expression matrix of the tested genes is loaded, optionally restricted
    to the patients selected by ``filter_expression``, and reduced to the rows
    whose variance is at least the ``var_th_index``-th largest variance.  The
    reduced matrix is then clustered and plotted, and one row per k-means
    cluster is handed to ``print_to_excel``.

    Args:
        tested_gene_list_file_name: gene list whose expression is analysed.
        total_gene_list_file_name: gene list used as the GO background.
        gene_expression_file_name: expression data file.
        phenotype_file_name: phenotype file used for patient filtering and
            patient label assignment.
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path: forwarded unchanged to
            ``load_gene_expression_profile_by_genes``.
        total_gene_list_path, phenotype_path: accepted for interface
            compatibility; not read by this function.
        var_th_index: index into the descending-sorted row variances that sets
            the variance threshold; ``None`` selects the smallest variance, so
            no row is dropped.
        start_k, end_k: inclusive range of k used for k-means.
        calc_go: run a goatools enrichment study when true.
        enrichment_list_file_names: optional extra gene lists tested per
            cluster with ``calc_HG_test``.
        meta_groups: optional grouping passed to ``labels_assignments`` to
            label patients; when ``None`` no patient labels are used.
        filter_expression: optional grouping used to select patients.
        cluster_algorithm: ``"kmeans"`` or ``"hierarchical"``; any other value
            skips clustering.

    Returns:
        None.  Returns early (also ``None``) when only one column is left
        after patient filtering.
    """
    # Load the expression matrix for the tested genes and split off its headers.
    tested_gene_expression = load_gene_expression_profile_by_genes(
        tested_gene_list_file_name, gene_expression_file_name,
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path)
    (tested_gene_expression_headers_rows,
     tested_gene_expression_headers_columns,
     tested_gene_expression) = separate_headers(tested_gene_expression)

    if filter_expression is not None:
        filtered_patients = [
            y for x in divided_patient_ids_by_label(phenotype_file_name,
                                                    groups=filter_expression)
            for y in x
        ]
        print("number of filtered patients from phenotypes: {}".format(
            len(filtered_patients)))
    else:
        print("no filter applied")
        filtered_patients = tested_gene_expression_headers_columns

    tested_gene_expression, tested_gene_expression_headers_columns = filter_genes_dataset_by_patients(
        filtered_patients, tested_gene_expression_headers_columns,
        tested_gene_expression)
    if np.shape(tested_gene_expression)[1] == 1:
        print("no expressions were found after filtering by labels {}. skipping...".format(
            filter_expression))
        return None

    total_gene_list = load_gene_list(total_gene_list_file_name)
    tested_gene_list = load_gene_list(tested_gene_list_file_name)
    row_var = np.var(tested_gene_expression, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]

    labels_assignment_patients = None
    if meta_groups is not None:
        print("clustering patients by groups")
        labels_assignment_patients = labels_assignments(
            meta_groups, phenotype_file_name,
            tested_gene_expression_headers_columns)

    enrichment_lists = []
    if enrichment_list_file_names is not None:
        for cur in enrichment_list_file_names:
            enrichment_lists.append(load_gene_list(cur))

    # Drop every row whose variance is strictly below the chosen threshold.
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    gene_expression_top_var = np.delete(tested_gene_expression,
                                        row_var_masked_indices,
                                        axis=0)
    gene_expression_top_var_header_rows = np.delete(
        tested_gene_expression_headers_rows, row_var_masked_indices, axis=0)
    gene_expression_top_var_header_columns = tested_gene_expression_headers_columns

    clfs_results = {}
    output_rows = []
    if calc_go:
        if not os.path.exists(
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)):
            wget.download(
                constants.GO_OBO_URL,
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
        obo_dag = GODag(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))

        assoc = read_ncbi_gene2go(os.path.join(
            constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME),
                                  no_top=True)
        g = GOEnrichmentStudy(
            [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
            assoc,
            obo_dag,
            methods=["bonferroni", "fdr_bh"])
        # Enrichment of the whole high-variance gene set (printed only).
        g_res = g.run_study([
            int(cur) for cur in ensembl2entrez_convertor(
                gene_expression_top_var_header_rows)
        ])
        GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected,
                       cur.p_fdr_bh) for cur in g_res if cur.p_fdr_bh <= 0.05]
        print(GO_results)

    if cluster_algorithm == "kmeans":

        for n_clusters in range(start_k, end_k + 1):
            clfs_results[n_clusters] = []
            centres, km_clf, dist = kmeanssample(X=gene_expression_top_var,
                                                 k=n_clusters,
                                                 metric="euclidean")
            for i in range(n_clusters):

                # Relabel clusters according to the rank of their mean
                # expression.
                ranks = []
                for j in range(n_clusters):
                    ranks.append(
                        np.average(
                            np.delete(gene_expression_top_var,
                                      np.where(km_clf != j)[0],
                                      axis=0)))
                ranks = rankdata(ranks)
                cluster_labels = np.array(km_clf)
                for j in range(n_clusters):
                    cluster_labels[np.where(km_clf == ranks[j] - 1)] = j
                labels_assignment = [cluster_labels + 1]

                # Keep only the rows that belong to cluster i.
                cluster_indices = np.where(km_clf != i)[0]
                # NOTE(review): this slices the header rows, not the
                # expression matrix; only its length is used below -- confirm
                # whether gene_expression_top_var was intended.
                gene_expression_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                gene_headers_row_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                clfs_results[n_clusters].append(
                    (gene_headers_row_cluster, gene_headers_row_cluster))
                desc = "k={} clustering cluster {} has {} genes".format(
                    n_clusters, i, len(gene_expression_cluster))
                gene_list = ",".join(gene_headers_row_cluster)
                url = check_enrichment(gene_list)

                go_terms = []
                uncorrectd_pvals = []
                FDRs = []
                go_names = []
                go_ns = []
                if calc_go:
                    g_res = g.run_study([
                        int(cur) for cur in ensembl2entrez_convertor(
                            gene_headers_row_cluster)
                    ])
                    GO_results = [(cur.NS, cur.GO, cur.goterm.name,
                                   cur.p_uncorrected, cur.p_fdr_bh)
                                  for cur in g_res if cur.p_fdr_bh <= 0.05]
                    if len(GO_results) > 0:
                        # zip() yields tuples; convert to lists because the
                        # custom enrichment lists are appended below.
                        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = [
                            list(column) for column in zip(*GO_results)
                        ]

                if len(enrichment_lists) != 0:
                    for j, cur in enumerate(enrichment_lists):
                        go_terms.append(
                            enrichment_list_file_names[j].split(".")[0])
                        uncorrectd_pvals.append(
                            calc_HG_test(
                                [x.split(".")[0] for x in tested_gene_list],
                                [x.split(".")[0] for x in cur], [
                                    x.split(".")[0]
                                    for x in gene_headers_row_cluster
                                ]))
                        FDRs.append(".")
                        go_names.append(".")
                        go_ns.append(".")

                output_rows.append((desc, "\r\n".join([
                    x.split(".")[0] for x in gene_headers_row_cluster
                ]), url, "\r\n".join(go_ns), "\r\n".join(go_terms),
                                    "\r\n".join(go_names),
                                    "\r\n".join(map(str, uncorrectd_pvals)),
                                    "\r\n".join(map(str, FDRs))))

        # Plots below use the labels of the last k-means run (k == end_k).
        gene_sorted_heatmap = np.rot90(np.flip(
            gene_expression_top_var[cluster_labels.argsort(), :], 1),
                                       k=-1,
                                       axes=(1, 0))
        find_clusters(end_k,
                      gene_sorted_heatmap,
                      gene_expression_top_var_header_columns,
                      start_k,
                      e2g_convertor(gene_expression_top_var_header_rows),
                      tested_gene_list_file_name,
                      labels_assignment=labels_assignment_patients)

        plot_heatmap(gene_expression_top_var,
                     gene_expression_top_var_header_columns,
                     labels_assignment,
                     gene_expression_top_var_header_rows,
                     tested_gene_list_file_name,
                     n_clusters=None,
                     label_index=None,
                     phenotype_heatmap=None)

    gene_sorted_heatmap = np.rot90(np.flip(gene_expression_top_var, 1),
                                   k=-1,
                                   axes=(1, 0))
    if cluster_algorithm == "hierarchical":
        df = pd.DataFrame(data=gene_sorted_heatmap,
                          index=gene_expression_top_var_header_columns,
                          columns=gene_expression_top_var_header_rows)

        # Colour rows by the first patient label assignment, when there is one.
        row_colors = None
        if labels_assignment_patients is not None:
            dct = dict(zip(np.unique(labels_assignment_patients[0]), "rbg"))
            row_colors = [dct.get(x) for x in labels_assignment_patients[0]]
        g = sns.clustermap(df,
                           row_colors=row_colors,
                           metric="euclidean",
                           robust=True,
                           method="single")
        den_genes = scipy.cluster.hierarchy.dendrogram(
            g.dendrogram_col.linkage, labels=df.columns, color_threshold=0.7)
        clusters = get_cluster_classes(den_genes)

        g.savefig(
            os.path.join(constants.BASE_PROFILE, "output",
                         "hierarchical_cluster_{}.png".format(time.time())))

    # Without meta_groups there are no patient labels to plot against.
    if labels_assignment_patients is not None:
        for cur_labels_assignment_patient in labels_assignment_patients:
            plot_heatmap(gene_sorted_heatmap,
                         gene_expression_top_var_header_rows,
                         [cur_labels_assignment_patient],
                         gene_expression_top_var_header_columns,
                         tested_gene_list_file_name,
                         n_clusters=None,
                         label_index=None,
                         phenotype_heatmap=None)

    print_to_excel(
        output_rows=output_rows,
        gene_list_file_name=tested_gene_list_file_name.split(".")[0],
        gene_expression_file_name=gene_expression_file_name.split(".")[0],
        var_th_index=var_th_index)
Exemple #24
0
def check_group_enrichment_goatools(tested_gene_file_name,
                                    total_gene_file_name,
                                    th=1):
    """Run a goatools enrichment study of the tested genes against a background.

    Args:
        tested_gene_file_name: a gene-list file name (str) or an already
            loaded sequence of gene ids.
        total_gene_file_name: background genes, given the same way.
        th: keep only GO terms whose uncorrected p-value is <= ``th``.

    Returns:
        A list of dicts keyed by ``HG_GO_ROOT``, ``HG_GO_ID``, ``HG_GO_NAME``,
        ``HG_VALUE`` (population count), ``HG_PVAL`` and ``HG_QVAL`` (always 1,
        no multiple-test correction is run), sorted by ascending p-value.
        An empty list when either input is empty.
    """
    if len(tested_gene_file_name) == 0 or len(total_gene_file_name) == 0:
        return []

    if isinstance(total_gene_file_name, str):
        total_gene_list = load_gene_list(total_gene_file_name)
    else:
        total_gene_list = total_gene_file_name

    if isinstance(tested_gene_file_name, str):
        tested_gene_list = load_gene_list(tested_gene_file_name)
    else:
        tested_gene_list = tested_gene_file_name

    # Fetch the ontology on first use.
    obo_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, constants.GO_DIR)

    obo_dag = GODag(obo_path)

    # Fetch and decompress the gene2go associations on first use.
    assoc_path = os.path.join(constants.GO_DIR,
                              constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        archive_path = os.path.join(
            constants.GO_DIR,
            os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as f_in:
            with open(assoc_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    sw = Stopwatch()
    sw.start()
    g = GOEnrichmentStudy(
        [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
        assoc,
        obo_dag,
        methods=[],
        log=None)
    g_res = g.run_study(
        [int(cur) for cur in ensembl2entrez_convertor(tested_gene_list)])
    print(sw.stop("done GO analysis in "))

    hg_report = [{
        HG_GO_ROOT: cur.NS,
        HG_GO_ID: cur.GO,
        HG_GO_NAME: cur.goterm.name,
        HG_VALUE: cur.pop_count,
        HG_PVAL: cur.p_uncorrected,
        HG_QVAL: 1
    } for cur in g_res if cur.p_uncorrected <= th]
    hg_report.sort(key=lambda x: x[HG_PVAL])

    return hg_report
Exemple #25
0
# Fetch the generic GO slim ontology through the notebook shell escape.
get_ipython().system(
    ' wget http://www.geneontology.org/ontology/subsets/goslim_generic.obo')

obo_fname = download_go_basic_obo()

from goatools.base import download_ncbi_associations

gene2go = download_ncbi_associations()

# `goset` selects which ontology file is loaded.
if goset == 'goslim':
    obodag = GODag("goslim_generic.obo")
else:
    obodag = GODag("go-basic.obo")

# Human (taxid 9606) gene -> GO associations.
geneid2gos = read_ncbi_gene2go("gene2go", taxids=[9606])

# Number of terms per ontology level (notebook cell output).
levels = [r.level for go, r in obodag.items()]
[(i, levels.count(i)) for i in range(1, 12)]

# GO ids whose level exceeds `cutlvl`.  `.items()` replaces the Python 2-only
# `.iteritems()` so this matches the other loops over `obodag`.
bad_go = {go for go, r in obodag.items() if r.level > cutlvl}

len(bad_go)

for go, r in obodag.items():
    nps = set()
    for p in r._parents:
Exemple #26
0
def goea(gene_ids, gene_symbols, trajectory, cluster, out_dir):
    """Run a GO enrichment analysis for one cluster and write the results.

    Gene symbols are mapped to NCBI gene ids through the human protein-coding
    table (field index 5 of each record is compared with the symbol).  The
    mapped ids are written to a text file and the significant GO terms
    (``p_fdr_bh < 0.05``) are written to two xlsx files, one reporting gene
    symbols and one reporting gene ids.

    Args:
        gene_ids: accepted for interface compatibility; not read here.
        gene_symbols: iterable of gene symbols belonging to the cluster.
        trajectory: string whose last 8 characters prefix the output names.
        cluster: cluster identifier, stringified into the output names.
        out_dir: output directory, created when missing.
    """
    from goatools.obo_parser import GODag
    from goatools.associations import read_ncbi_gene2go
    from goatools.go_enrichment import GOEnrichmentStudy
    from goea.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_human

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    obodag = GODag("goea/go-basic.obo")
    geneid2gos_human = read_ncbi_gene2go("goea/gene2go", taxids=[9606])

    goeaobj = GOEnrichmentStudy(
        GeneID2nt_human.keys(),  # background: human protein-coding genes
        geneid2gos_human,  # geneid/GO associations
        obodag,  # ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multiple-test correction method

    # Index the background table by symbol once instead of rescanning it for
    # every requested symbol.
    symbol2geneids = {}
    for geneid, record in GeneID2nt_human.items():
        symbol2geneids.setdefault(record[5], []).append(geneid)

    geneid2symbol = {}
    for gene_symbol in gene_symbols:
        for geneid in symbol2geneids.get(gene_symbol, ()):
            geneid2symbol[int(geneid)] = gene_symbol

    gene_names = np.array(list(geneid2symbol.keys()))
    print(gene_names)

    geneids_study = list(geneid2symbol)

    # Every output file shares this prefix.
    out_prefix = out_dir + '/' + trajectory[-8:] + 'cluster ' + str(cluster)

    with open(out_prefix + 'genes.txt', 'w') as f:
        f.writelines("%s\n" % gene for gene in geneids_study)

    # 'p_' means "pvalue"; 'fdr_bh' is the multiple-test method configured above.
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    goeaobj.wr_xlsx(out_prefix + 'goea_symbols.xlsx',
                    goea_results_sig,
                    itemid2name=geneid2symbol)
    goeaobj.wr_xlsx(out_prefix + 'goea_geneids.xlsx', goea_results_sig)
Exemple #27
0
def get_GO(gene_query, species='mouse'):
    """
    Get Gene Ontologies (GOs).

    Args:
        gene_query (array of str): gene list.

        species (str): Select species. Either "mouse" or "human"

    Returns:
        pandas.dataframe: GO analysis results as dataframe. The dataframe is
        empty when the enrichment study returns no records.

    Raises:
        ValueError: if ``species`` is neither "mouse" nor "human".
    """
    # Fail fast on an unsupported species, before any download or file access.
    if species not in ('human', 'mouse'):
        raise ValueError(
            'species must be "mouse" or "human", got {!r}'.format(species))

    GOIs = gene_query

    # Make sure the GO data files are present.
    _check_data_and_download_if_necessary(go_folder)

    obodag = GODag(os.path.join(go_folder, "go-basic.obo"))

    if species == 'human':
        geneid2gos = read_ncbi_gene2go(os.path.join(go_folder, "gene2go.txt"),
                                       taxids=[9606])
        print("{N:,} annotated genes".format(N=len(geneid2gos)))

        # The background population must match the species; it was previously
        # only defined in the mouse branch.
        from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as geneid2nt

        Xtable = pd.read_csv(os.path.join(go_folder, 'hg19_xref.txt'),
                             sep='\t')
        Xtable.index = Xtable['Approved Symbol']
    else:
        geneid2gos = read_ncbi_gene2go(os.path.join(go_folder, "gene2go.txt"),
                                       taxids=[10090])
        print("{N:,} annotated genes".format(N=len(geneid2gos)))

        from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as geneid2nt

        Xtable = pd.read_csv(os.path.join(go_folder, 'biomart_xref.mm10.txt'),
                             sep='\t')
        Xtable = Xtable[['Associated Gene Name', 'EntrezGene ID']].dropna()
        Xtable.index = Xtable['Associated Gene Name']

    # Translate the queried symbols into unique Entrez gene ids.
    GOIs_entrez = [
        int(x)
        for x in np.unique(Xtable.loc[GOIs].dropna()['EntrezGene ID'])
    ]

    print("processing " + str(len(GOIs)) + " genes ...")

    goeaobj = GOEnrichmentStudy(
        geneid2nt.keys(),  # background: protein-coding genes of the species
        geneid2gos,  # geneid/GO associations
        obodag,  # ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multiple-test correction method

    goea_results = goeaobj.run_study(GOIs_entrez)

    if not goea_results:
        # No record to take the column names from.
        print("Found No GO with significant p-value")
        return pd.DataFrame()

    go_default_output = goea_results[0].get_prtflds_default()
    df_GO = pd.DataFrame(
        [rec.get_field_values(go_default_output) for rec in goea_results])
    df_GO.columns = go_default_output
    df_GO["genes"] = df_GO.study_items.apply(
        lambda x: _ids2symbols(x, species))

    return df_GO
Exemple #28
0
def check_group_enrichment(tested_gene_file_name,
                           total_gene_file_name,
                           go_folder,
                           th=1):
    """Run a goatools enrichment study using the GO files under ``go_folder``.

    Args:
        tested_gene_file_name: a gene-list file name (str) or an already
            loaded sequence of gene ids.
        total_gene_file_name: background genes, given the same way.
        go_folder: directory holding the ontology and association files;
            missing files are downloaded into it.
        th: accepted for interface compatibility; no threshold is applied.

    Returns:
        A list of dicts keyed by ``HG_GO_ROOT``, ``HG_GO_ID``, ``HG_GO_NAME``,
        ``HG_VALUE`` (population count) and ``HG_PVAL``, sorted by ascending
        uncorrected p-value.  An empty list when either input is empty.
    """
    if len(tested_gene_file_name) == 0 or len(total_gene_file_name) == 0:
        return []

    if isinstance(total_gene_file_name, str):
        total_gene_list = load_gene_list(total_gene_file_name)
    else:
        total_gene_list = total_gene_file_name

    if isinstance(tested_gene_file_name, str):
        tested_gene_list = load_gene_list(tested_gene_file_name)
    else:
        tested_gene_list = tested_gene_file_name

    # Download into the folder that is checked and read, not constants.GO_DIR.
    obo_path = os.path.join(go_folder, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, go_folder)

    obo_dag = GODag(obo_path)

    assoc_path = os.path.join(go_folder, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        # Test for the same archive path that is opened below.
        archive_path = os.path.join(
            go_folder,
            os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        if not os.path.exists(archive_path):
            download(constants.GO_ASSOCIATION_GENE2GEO_URL, go_folder)
        with gzip.open(archive_path, 'rb') as f_in:
            with open(assoc_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    # Module-level cache: the associations are parsed once per process.
    # NOTE(review): the cache is not keyed by go_folder -- confirm callers
    # always use the same folder.
    global assoc
    if assoc is None:
        assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    g = GOEnrichmentStudy(
        [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
        assoc,
        obo_dag,
        log=None)
    g_res = g.run_study(
        [int(cur) for cur in ensembl2entrez_convertor(tested_gene_list)])

    hg_report = [{
        HG_GO_ROOT: cur.NS,
        HG_GO_ID: cur.GO,
        HG_GO_NAME: cur.goterm.name,
        HG_VALUE: cur.pop_count,
        HG_PVAL: cur.p_uncorrected
    } for cur in g_res]
    hg_report.sort(key=lambda x: x[HG_PVAL])

    return hg_report
Exemple #29
0
 def parse_gene2go_info(self, taxids):
     """Read the gene2go file at ``self.gene2go_path`` for the given taxids.

     Returns whatever ``read_ncbi_gene2go`` returns for that file.
     """
     return read_ncbi_gene2go(self.gene2go_path, taxids=taxids)
Exemple #30
0
 def __init__(self,
              go_path="data/go/go-basic.obo",
              gene2go_path="data/go/gene2go"):
     """Load the ontology and the gene2go associations.

     go_path is the OBO file handed to ``GODag``; gene2go_path is the
     association file handed to ``read_ncbi_gene2go``.
     """
     ontology = GODag(go_path)
     self.obo = ontology
     # go2geneids=False is passed explicitly to the reader.
     associations = read_ncbi_gene2go(gene2go_path, go2geneids=False)
     self.gene_to_go = associations
Exemple #31
0
        query_res = mg.querymany(genes_chunk, scopes='entrezgene', fields='entrezgene,symbol',
                                 species='human', entrezonly=True, as_dataframe=True,
                                 df_index=False, verbose=False)
        if 'notfound' in query_res.columns:
            query_res = query_res[query_res.notfound != True]  # ignore PEP8 warnings.
        query_result_list.append(query_res)
    df_res = pd.concat(query_result_list)
    res = dict(zip(df_res.entrezgene, df_res.symbol))
    return res


# Build the module-level GO resources once, inside the HidePrints context
# (presumably silencing the loaders' console output -- confirm against
# HidePrints).
with HidePrints():
    _go_dag = obo_parser.GODag(go_obo_path)
    _gaf = read_gaf(gaf_path, prt=None)
    _termcounts = TermCounts(_go_dag, _gaf)
    _gene2go = read_ncbi_gene2go(gene2go_path)
    _gene2symbol = _init_gene2symbol_dict()
    # Reverse lookup; a symbol shared by several genes keeps the last gene.
    _symbol2gene = dict(
        (name, entrez) for entrez, name in _gene2symbol.items())


def get_genes():
    """Return the keys of the module-level ``_gene2go`` mapping as a list."""
    genes = list(_gene2go)
    return genes


def get_symbols():
    """Return the values of the module-level ``_gene2symbol`` mapping as a list."""
    return [symbol for symbol in _gene2symbol.values()]


def get_gene2go():
    """Return the module-level ``_gene2go`` mapping itself (not a copy)."""
    mapping = _gene2go
    return mapping
Exemple #32
0
# Data will be stored in this variable
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import goatools
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.associations import read_ncbi_gene2go
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus
from goatools.go_enrichment import GOEnrichmentStudy

# Download the ontology and the NCBI associations, then load them from the
# paths the download helpers return.
obo_fname = download_go_basic_obo()
gene2go = download_ncbi_associations()
obodag = GODag(obo_fname)
geneid2gos_mouse = read_ncbi_gene2go(gene2go, taxids=[10090])

geneid2symbol = {}

print("{N:,} annotated mouse genes".format(N=len(geneid2gos_mouse)))
# Dict key views have no .head(); show the first few background gene ids.
print(list(GeneID2nt_mus.keys())[:5])

goeaobj = GOEnrichmentStudy(
    GeneID2nt_mus.keys(),  # List of mouse protein-coding genes
    geneid2gos_mouse,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.05,  # default significance cut-off
    methods=['fdr_bh'])  # default multiple-test correction method
def invert_dol_nonunique(d):
    """Invert a dict of lists: map each listed value to the keys that list it.

    A key appears once per occurrence of the value in its list, in the
    iteration order of ``d``.
    """
    inverted = {}
    for key, values in d.items():
        for value in values:
            if value not in inverted:
                inverted[value] = []
            inverted[value].append(key)
    return inverted


# Location of the MIMOSCA checkout under the user's home directory.
PATH_TO_MIMOSCA = os.path.expanduser(
    os.path.join("~", "Desktop", "programs", "MIMOSCA-master"))
# Both prefixes point at the same data directory and end with a separator.
path2db = os.path.join(PATH_TO_MIMOSCA, "common_files", "data") + os.path.sep
path2Xref = os.path.join(PATH_TO_MIMOSCA, "common_files", "data") + os.path.sep

# Human (taxid 9606) gene -> GO associations and the inverted GO -> gene index.
geneid2gos = read_ncbi_gene2go(os.path.join(path2db, "gene2go"),
                               taxids=[9606])
go2idx = invert_dol_nonunique(geneid2gos)

# Entrez gene id -> approved symbol, skipping rows without an Entrez id.
Xtable = pd.read_csv(os.path.join(path2Xref, 'hg19_xref.txt'), sep='\t')
idx2gene = {
    int(entrez_id): approved_symbol
    for entrez_id, approved_symbol in zip(Xtable['EntrezGene ID'],
                                          Xtable['Approved Symbol'])
    if not math.isnan(entrez_id)
}


def go2genes(go, sep=';'):
    indices = go2idx.get(go)
    if indices is None:
        return ''
    else: