pop_mgi = set(dfQ['MGI ID'].dropna().tolist())
        pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist())
        pop = pop_mgi.union(pop_uniprot)
    elif layer == 'HS':
        pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist())
        pop = pop_uniprot

    # Load GO
    print("Load GO files")
    godag = obo_parser.GODag(ontology)

    # GAF readers for the selected layer: species annotations and Reactome annotations (BP namespace only)
    gaf_species = GafReader(name='GAF ' + layer + ' Specie', filename=annotation, godag=godag, namespaces=set(['BP']))
    gaf_reactome = GafReader(name='GAF ' + layer + ' Reactome', filename=annotation_reactome, godag=godag, namespaces=set(['BP']))
    # Dict of Associations
    ns2assoc_species = gaf_species.get_ns2assc()
    n_assoc_species = sum([len(v) for k, v in ns2assoc_species['BP'].items()])
    print('Specie associations: {n:d}'.format(n=n_assoc_species))

    # We also need to add the multi-species annotations
    ns2assoc_reactome = gaf_reactome.get_ns2assc()
    n_assoc_reactome = sum([len(v) for k, v in ns2assoc_reactome['BP'].items()])
    print('Reactome associations: {n:d}'.format(n=n_assoc_reactome))

    # combine associations
    ns2assoc_combined = merge(ns2assoc_species, ns2assoc_reactome)
    n_assoc_combined = sum([len(v) for k, v in ns2assoc_combined['BP'].items()])
    print('Combined associations: {n:d}'.format(n=n_assoc_combined))

    """
    # Use PANTHER HMM for HS?
Example #2
0
    layer = 'DM'
    threshold_str = str(threshold).replace('.', 'p')

    # GO Information
    dict_annotation_file = {
        'HS': 'goa_human.gaf',
        'MM': 'mgi.gaf',
        'DM': 'fb.gaf'
    }
    annotation = '../data/GeneOntology/' + dict_annotation_file[layer]
    ontology = '../data/GeneOntology/go-basic.obo'
    #
    godag = obo_parser.GODag(ontology)
    gaf = GafReader(name='GAF ' + layer, filename=annotation, godag=godag)
    # Dict of Associations
    ns2assoc = gaf.get_ns2assc()

    # Load Population of Genes (for background comparison)
    rFPKMFile = '../02-core_genes/results/FPKM/{layer:s}/{layer:s}-FPKM-{celltype:s}.csv.gz'.format(
        celltype=celltype, layer=layer)
    dfP = pd.read_csv(rFPKMFile, usecols=['id_gene', 'gene'])

    # Load PCA
    rPCAFile = 'results/pca/{celltype:s}/pca-{celltype:s}-{network:s}-{threshold:s}-{layer:s}-dim.csv.gz'.format(
        celltype=celltype,
        network=network,
        threshold=threshold_str,
        layer=layer)
    df_pca = pd.read_csv(rPCAFile, index_col=0, encoding='utf-8')

    # Population of genes (background) to test against