Esempio n. 1
0
def main():
    """Plot GO similarity of interaction partners across all root ontologies.

    For one reference interactome, loads the pickled GO similarity results of
    each root ontology and draws a grouped bar plot of the mean similarity
    (with the error values returned by ``sderror``) for random pairs, the
    reference interactome and the structural interactome. The figure is
    written to the interactome's ``gosim`` figure directory.
    """
    # ---- settings ----------------------------------------------------------

    # reference interactome to plot (options: HI-II-14, IntAct)
    ref_interactome_name = 'HI-II-14'

    # similarity measure the GO similarity results were calculated with
    sim_measure = 'SimGIC'

    # root ontologies, in plotting order
    ont_root = [
        'biological_process', 'molecular_function', 'cellular_component'
    ]

    # short ontology labels used in the names of the input files
    ont_abv = {
        'biological_process': 'bp',
        'molecular_function': 'mf',
        'cellular_component': 'cc'
    }

    # display name of the structural interactome of each reference interactome
    struc_name = {'HI-II-14': 'Y2H-SI', 'IntAct': 'IntAct-SI'}

    # keys of the pickled result dictionaries, in plotting order
    interactome_names = [
        'Random interactions', 'Reference interactome',
        'Structural interactome'
    ]

    # one bar color per entry of interactome_names
    interactome_colors = ['limegreen', 'steelblue', 'orangered']

    # whether figures are displayed in addition to being saved
    showFigs = False

    # ---- directories -------------------------------------------------------

    gosimDir = Path('../data') / 'processed' / ref_interactome_name / 'gosim'
    figDir = Path('../figures') / ref_interactome_name / 'gosim'

    if not figDir.exists():
        os.makedirs(figDir)

    # ---- load results, one pickle per root ontology ------------------------

    label_args = {root: (ont_abv[root], sim_measure) for root in ont_root}
    allgosim = {}
    for root in ont_root:
        gosimFile = gosimDir / ('allPPI_gosim_%s_%s.pkl' % label_args[root])
        with open(gosimFile, 'rb') as f:
            allgosim[root] = pickle.load(f)

    # ---- summarize: one row per interactome, one column per ontology -------

    means, errors = [], []
    for name in interactome_names:
        per_root = [allgosim[root][name]["gosim"] for root in ont_root]
        means.append(list(map(np.mean, per_root)))
        errors.append(list(map(sderror, per_root)))

    # ---- plot --------------------------------------------------------------

    group_labels = [r.title().replace('_', '\n') for r in ont_root]
    tick_labels = [round(x, 1) for x in np.arange(0, 0.6, 0.1)]
    legend = [
        'Random pairs',
        '%s reference interactome' % ref_interactome_name,
        struc_name[ref_interactome_name]
    ]

    multi_bar_plot(means,
                   errors=errors,
                   xlabels=group_labels,
                   ylabel='Gene ontology similarity\nof interaction partners',
                   ylabels=tick_labels,
                   colors=interactome_colors,
                   edgecolor='k',
                   ewidth=1.5,
                   barwidth=0.2,
                   bargap=0.03,
                   fontsize=18,
                   capsize=5,
                   msize=8,
                   leg=legend,
                   show=showFigs,
                   figdir=figDir,
                   figname='interactome_gosim_%s' % sim_measure)
def main():
    """Compute and compare GO similarity of interaction partners.

    Steps, in order:
      1. Load the reference interactome and its structural
         (interface-annotated) interactome.
      2. Sample random protein pairs from the reference interactome proteins
         and compute their GO similarity (recomputed on every run).
      3. Compute GO similarity for the reference PPIs, reusing the pickled
         similarity dictionary if it already exists.
      4. Look up structural interactome PPIs in the reference dictionary.
      5. Pickle all three result tables, print summary statistics, run
         bootstrap tests and save a bar plot.

    Pairs for which no similarity value is available are dropped.
    """

    # ---- settings ----------------------------------------------------------

    # reference interactome name (options: HI-II-14, IntAct)
    interactome_name = 'HI-II-14'

    # similarity measure to calculate GO similarity
    # options: Resnik, Lin, Jiang-Conrath, SimGIC, SimUI, SimIC, SimRel, Dice, SimTO
    #           SimNTO, Jaccard, Czekanowski-Dice, Cosine, GSESAME, SimICND, SimICNP
    sim_measure = 'SimGIC'

    # mixing strategy for merging GO term semantic similarities
    # options: max, avg, BMA (best match average)
    mix_method = 'BMA'

    # root ontology on which GO similarity is calculated
    # options: biological_process, molecular_function, cellular_component
    ont_root = 'biological_process'

    # short ontology labels used in output file and figure names
    ont_abv = {
        'biological_process': 'bp',
        'molecular_function': 'mf',
        'cellular_component': 'cc'
    }

    # ontological relationships and evidence codes to ignore (None = keep all)
    ont_ignore = None
    ec_ignore = None

    # number of random pairs to sample
    numRandPairs = 10000

    # bar colors: random pairs, reference interactome, structural interactome
    interactome_colors = ['limegreen', 'steelblue', 'orangered']

    # whether figures are displayed in addition to being saved
    showFigs = False

    # ---- directories -------------------------------------------------------

    dataDir = Path('../data')
    extDir = dataDir / 'external'
    interactomeDir = dataDir / 'processed' / interactome_name
    gosimDir = interactomeDir / 'gosim'
    figDir = Path('../figures') / interactome_name / 'gosim'

    # ---- files -------------------------------------------------------------

    # values substituted into every "%s_%s" file name pattern below
    label_args = (ont_abv[ont_root], sim_measure)
    ont_label = label_args[0]

    # inputs
    ontologyFile = extDir / 'go-basic.obo'
    annotationFile = extDir / 'goa_human.gaf'
    interactomeFile = interactomeDir / 'human_interactome.txt'
    structuralInteractomeFile = interactomeDir / 'human_interface_annotated_interactome.txt'

    # outputs
    refPPIListFile = gosimDir / 'refPPIs.txt'
    randPairListFile = gosimDir / 'randPairs.txt'
    refPPIgosimParamFile = gosimDir / (
        'fastsemsim_parameters_refPPIs_%s_%s' % label_args)
    randPairgosimParamFile = gosimDir / (
        'fastsemsim_parameters_randPairs_%s_%s' % label_args)
    refPPIfastsemsimOutFile = gosimDir / (
        'fastsemsim_output_refPPIs_%s_%s' % label_args)
    randPairfastsemsimOutFile = gosimDir / (
        'fastsemsim_output_randPairs_%s_%s' % label_args)
    refPPIgosimFile = gosimDir / ('gosim_refPPIs_%s_%s.pkl' % label_args)
    randPairgosimFile = gosimDir / ('gosim_randPairs_%s_%s.pkl' % label_args)
    allPPIgosimFile = gosimDir / ('allPPI_gosim_%s_%s.pkl' % label_args)

    for outDir in (gosimDir, figDir):
        if not outDir.exists():
            os.makedirs(outDir)

    # ---- local helpers -----------------------------------------------------

    # options shared by both GO similarity dictionary computations
    fastsemsim_options = dict(sim_measure=sim_measure,
                              mix_method=mix_method,
                              ont_root=ont_root,
                              ont_ignore=ont_ignore,
                              ec_ignore=ec_ignore,
                              ontologyFile=ontologyFile,
                              annotationFile=annotationFile)

    def write_pair_list(pairs, outFile):
        # tab-separated protein pairs, no header and no index column
        pairs[["Protein_1", "Protein_2"]].to_csv(outFile,
                                                 index=False,
                                                 header=False,
                                                 sep='\t')

    def load_pickle(inFile):
        with open(inFile, 'rb') as f:
            return pickle.load(f)

    def attach_gosim(pairs, lookup):
        # The dictionary is queried with each pair as a sorted tuple; pairs
        # missing from it get NaN and are removed from the returned table.
        keys = [
            tuple(sorted(pair))
            for pair in pairs[["Protein_1", "Protein_2"]].values
        ]
        pairs["gosim"] = [
            lookup[key] if key in lookup else np.nan for key in keys
        ]
        return pairs[~np.isnan(pairs["gosim"])].reset_index(drop=True)

    #------------------------------------------------------------------------------------
    # load reference and structural interactomes
    #------------------------------------------------------------------------------------

    interactome = pd.read_table(interactomeFile)
    interactomeProteins = list(
        set(interactome[["Protein_1", "Protein_2"]].values.flatten()))
    print('\n' + 'reference interactome:')
    print('%d PPIs' % len(interactome))
    print('%d proteins' % len(interactomeProteins))

    structuralInteractome = read_single_interface_annotated_interactome(
        structuralInteractomeFile)
    strucInteractomeProteins = list(
        set(structuralInteractome[["Protein_1",
                                   "Protein_2"]].values.flatten()))
    print('\n' + 'interface-annotated interactome:')
    print('%d PPIs' % len(structuralInteractome))
    print('%d proteins' % len(strucInteractomeProteins))

    #------------------------------------------------------------------------------------
    # GO similarity for random pairs
    #------------------------------------------------------------------------------------

    first, second = zip(
        *sample_random_pairs(interactomeProteins, numRandPairs))
    randPairs = pd.DataFrame({
        "Protein_1": list(first),
        "Protein_2": list(second)
    })
    # a protein paired with itself is not a valid random pair
    randPairs = randPairs[randPairs["Protein_1"] != randPairs["Protein_2"]]

    # the dictionary is rebuilt every run since the sampled pairs change
    print('\n' + 'producing GO similarity dictionary for random pairs')
    write_pair_list(randPairs, randPairListFile)
    produce_fastsemsim_protein_gosim_dict(
        randPairListFile,
        randPairgosimFile,
        paramOutFile=randPairgosimParamFile,
        fastsemsimOutFile=randPairfastsemsimOutFile,
        **fastsemsim_options)
    gosim = load_pickle(randPairgosimFile)
    randPairs = attach_gosim(randPairs, gosim)

    #------------------------------------------------------------------------------------
    # GO similarity for all interaction partners in the reference interactome
    #------------------------------------------------------------------------------------

    first, second = zip(*interactome[["Protein_1", "Protein_2"]].values)
    refPPIs = pd.DataFrame({
        "Protein_1": list(first),
        "Protein_2": list(second)
    })

    # only compute the dictionary when no pickled copy exists yet
    if not refPPIgosimFile.is_file():
        print('\n' + 'producing GO similarity dictionary for reference PPIs')
        write_pair_list(refPPIs, refPPIListFile)
        produce_fastsemsim_protein_gosim_dict(
            refPPIListFile,
            refPPIgosimFile,
            paramOutFile=refPPIgosimParamFile,
            fastsemsimOutFile=refPPIfastsemsimOutFile,
            **fastsemsim_options)
    gosim = load_pickle(refPPIgosimFile)
    refPPIs = attach_gosim(refPPIs, gosim)

    #------------------------------------------------------------------------------------
    # GO similarity for all interaction partners in the structural interactome
    #------------------------------------------------------------------------------------

    # structural PPIs are looked up in the reference PPI dictionary loaded above
    first, second = zip(
        *structuralInteractome[["Protein_1", "Protein_2"]].values)
    strucPPIs = pd.DataFrame({
        "Protein_1": list(first),
        "Protein_2": list(second)
    })
    strucPPIs = attach_gosim(strucPPIs, gosim)

    #------------------------------------------------------------------------------------
    # Save GO similarity results to file
    #------------------------------------------------------------------------------------

    allgosim = {
        'Random interactions': randPairs,
        'Reference interactome': refPPIs,
        'Structural interactome': strucPPIs
    }
    with open(allPPIgosimFile, 'wb') as fout:
        pickle.dump(allgosim, fout)

    #------------------------------------------------------------------------------------
    # Compare GO similarity of interaction partners between reference interactome,
    # structural interactome and random interactions
    #------------------------------------------------------------------------------------

    # similarity values as plain lists (NaN rows were already dropped above)
    randGOsim = randPairs["gosim"].tolist()
    refGOsim = refPPIs["gosim"].tolist()
    strucGOsim = strucPPIs["gosim"].tolist()

    # summary statistics
    print('\n' + 'Mean %s similarity for interaction partners:' % ont_root)
    print('Random pairs: %f (SE = %g, n = %d)' %
          (np.mean(randGOsim), sderror(randGOsim), len(randGOsim)))
    print('Reference interactome: %f (SE = %g, n = %d)' %
          (np.mean(refGOsim), sderror(refGOsim), len(refGOsim)))
    print('Structural interactome: %f (SE = %g, n = %d)' %
          (np.mean(strucGOsim), sderror(strucGOsim), len(strucGOsim)))

    # significance tests
    print('\n' + 'Statistical significance')
    print('reference interactome vs random interactions:')
    bootstrap_test(refGOsim, randGOsim, iter=100000)
    print('structural interactome vs reference interactome:')
    bootstrap_test(strucGOsim, refGOsim, iter=100000)

    # bar plot of means with error bars
    samples = (randGOsim, refGOsim, strucGOsim)
    bar_heights = [np.mean(s) for s in samples]
    bar_errors = [sderror(s) for s in samples]
    bar_plot(bar_heights,
             bar_errors,
             xlabels=[
                 'Random\ninteractions', 'Reference\ninteractome',
                 'Structural\ninteractome'
             ],
             ylabel='%s similarity of\ninteraction partners' % ont_root,
             colors=interactome_colors,
             edgecolor='k',
             barwidth=0.5,
             fontsize=24,
             show=showFigs,
             figdir=figDir,
             figname='interactome_gosim_%s_%s' % (ont_label, sim_measure))
def main():
    """Plot tissue co-expression of interaction partners across databases.

    For one reference interactome, loads the pickled co-expression results of
    each tissue expression database and draws a grouped bar plot of the mean
    co-expression (with the error values returned by ``sderror``) for random
    pairs, the reference interactome and the structural interactome. The
    figure is written to the interactome's ``coexpr`` figure directory.
    """
    # ---- settings ----------------------------------------------------------

    # reference interactome to plot (options: HI-II-14, IntAct)
    ref_interactome_name = 'HI-II-14'

    # tissue expression databases, in plotting order
    expr_db = ['Illumina', 'GTEx', 'HPA', 'Fantom5']

    # display name of the structural interactome of each reference interactome
    struc_name = {'HI-II-14': 'Y2H-SI', 'IntAct': 'IntAct-SI'}

    # keys of the pickled result dictionaries, in plotting order
    interactome_names = [
        'Random interactions', 'Reference interactome',
        'Structural interactome'
    ]

    # one bar color per entry of interactome_names
    interactome_colors = ['limegreen', 'steelblue', 'orangered']

    # whether figures are displayed in addition to being saved
    showFigs = False

    # ---- directories -------------------------------------------------------

    coexprDir = Path('../data') / 'processed' / ref_interactome_name / 'coexpr'
    figDir = Path('../figures') / ref_interactome_name / 'coexpr'

    if not figDir.exists():
        os.makedirs(figDir)

    # ---- load results, one pickle per expression database ------------------

    allcoexpr = {}
    for db in expr_db:
        coexprFile = coexprDir / ('interactome_coexpr_%s.pkl' % db)
        with open(coexprFile, 'rb') as f:
            allcoexpr[db] = pickle.load(f)

    # ---- summarize: one row per interactome, one column per database -------

    means, errors = [], []
    for name in interactome_names:
        per_db = [allcoexpr[db][name]["coexpr"] for db in expr_db]
        means.append(list(map(np.mean, per_db)))
        errors.append(list(map(sderror, per_db)))

    # ---- plot --------------------------------------------------------------

    tick_labels = [round(x, 1) for x in np.arange(0, 0.9, 0.2)]
    legend = [
        'Random pairs',
        '%s reference interactome' % ref_interactome_name,
        struc_name[ref_interactome_name]
    ]

    multi_bar_plot(means,
                   errors=errors,
                   xlabels=expr_db,
                   ylabel='Tissue co-expression of\ninteraction partners',
                   ylabels=tick_labels,
                   colors=interactome_colors,
                   edgecolor='k',
                   ewidth=1.5,
                   barwidth=0.2,
                   bargap=0.03,
                   fontsize=18,
                   capsize=5,
                   msize=8,
                   leg=legend,
                   show=showFigs,
                   figdir=figDir,
                   figname='interactome_coexpr')
def main():
    """Compare substitution-matrix scores of non-disease and disease mutations.

    Loads (producing it first if the pickle is missing) the substitution
    matrix named by ``matrixName``, scores every mutation by its
    (wild-type residue, mutant residue) pair, prints the mean score of each
    mutation set, runs a bootstrap test between the two sets and saves a
    box plot of the score distributions.
    """

    # substitution matrix name
    matrixName = 'PAM30'

    # show figures
    showFigs = False

    # parent directory of all data files
    dataDir = Path('../data')

    # parent directory of all processed data files
    procDir = dataDir / 'processed'

    # figure directory
    figDir = Path('../figures') / 'combined'

    # input data files
    naturalMutationsFile = procDir / 'dbsnp_mutations4.txt'
    diseaseMutationsFile = procDir / 'clinvar_mutations6.txt'

    # output data files
    subsMatrixFile = procDir / (matrixName + '.pkl')

    # create output directories if not existing; exist_ok avoids the race
    # between checking for the directory and creating it
    os.makedirs(procDir, exist_ok=True)
    os.makedirs(figDir, exist_ok=True)

    #------------------------------------------------------------------------------------
    # produce substitution matrix
    #------------------------------------------------------------------------------------

    # the pickled matrix is reused across runs once it has been produced
    if not subsMatrixFile.is_file():
        produce_substitution_matrix(matrixName, subsMatrixFile)
    with open(subsMatrixFile, 'rb') as f:
        subsTable = pickle.load(f)

    #------------------------------------------------------------------------------------
    # load mutations
    #------------------------------------------------------------------------------------

    naturalMutations, diseaseMutations = remove_mutation_overlaps(
        naturalMutationsFile, diseaseMutationsFile)

    #------------------------------------------------------------------------------------
    # calculate mutation substitution scores
    #------------------------------------------------------------------------------------

    # the table is indexed by (wild-type residue, mutant residue) tuples
    natMutScore = [
        subsTable[x]
        for x in zip(naturalMutations["wt_res"], naturalMutations["mut_res"])
    ]
    disMutScore = [
        subsTable[x]
        for x in zip(diseaseMutations["wt_res"], diseaseMutations["mut_res"])
    ]

    print()
    print('Avg. score for natural mutations: %.1f (SE = %g, n = %d)' %
          (np.mean(natMutScore), sderror(natMutScore), len(natMutScore)))
    print('Avg. score for disease mutations: %.1f (SE = %g, n = %d)' %
          (np.mean(disMutScore), sderror(disMutScore), len(disMutScore)))

    bootstrap_test(natMutScore, disMutScore, 10000)

    # The axis label follows matrixName so it cannot go stale when another
    # matrix is selected. NOTE(review): the tick labels and y-limits below are
    # still tuned to the PAM30 score range — adjust them if matrixName changes.
    box_plot([natMutScore, disMutScore],
             xlabels=('Non-disease\nmutations', 'Disease\nmutations'),
             ylabels=[-15, -10, -5, 0, 5],
             ylabel='%s substitution score' % matrixName,
             ylim=[-16, 5],
             colors=['turquoise', 'magenta'],
             fontsize=26,
             show=showFigs,
             figdir=figDir,
             figname='substitution_score')
def main():
    """Compute and compare tissue co-expression of interaction partners.

    Steps, in order:
      1. Load the reference interactome and its structural
         (interface-annotated) interactome.
      2. Load the protein tissue expression dictionary of the selected
         database, producing it first if the pickle is missing.
      3. Compute tissue co-expression for sampled random protein pairs, for
         the reference PPIs and for the structural interactome PPIs, dropping
         pairs whose co-expression is NaN.
      4. Pickle all three result tables, print summary statistics, run
         bootstrap tests and save a bar plot.

    Raises:
        ValueError: if the expression dictionary has to be produced and
            ``expr_db`` is not one of the supported database names.
    """

    # reference interactome name
    # options: HI-II-14, IntAct
    interactome_name = 'HI-II-14'

    # tissue expression database name
    # options: Illumina, GTEx, HPA, Fantom5
    expr_db = 'Illumina'

    # minimum number of tissue expression values required for protein pair tissue
    # co-expression to be considered
    coexprMinTissues = 5

    # number of random interactions
    numRandPairs = 10000

    # interactome colors
    interactome_colors = ['limegreen', 'steelblue', 'orangered']

    # show figures
    showFigs = False

    # parent directory of all data files
    dataDir = Path('../data')

    # directory of data files from external sources
    extDir = dataDir / 'external'

    # parent directory of all processed data files
    procDir = dataDir / 'processed'

    # directory of processed data files specific to interactome
    interactomeDir = procDir / interactome_name

    # directory of tissue coexpression output data files
    coexprDir = interactomeDir / 'coexpr'

    # figure directory
    figDir = Path('../figures') / interactome_name / 'coexpr'

    # input data files
    illuminaExprFile = extDir / 'E-MTAB-513.tsv.txt'
    gtexDir = extDir / 'GTEx_Analysis_v7_eQTL_expression_matrices'
    hpaExprFile = extDir / 'normal_tissue.tsv'
    fantomExprFile = extDir / 'hg38_fair+new_CAGE_peaks_phase1and2_tpm_ann.osc.txt'
    fantomSampleTypeFile = extDir / 'fantom5_sample_type.xlsx'
    uniprotIDmapFile = procDir / 'to_human_uniprotID_map.pkl'
    uniqueGeneSwissProtIDFile = procDir / 'uniprot_unique_gene_reviewed_human_proteome.list'
    interactomeFile = interactomeDir / 'human_interactome.txt'
    structuralInteractomeFile = interactomeDir / 'human_interface_annotated_interactome.txt'

    # output data files
    proteinExprFile = procDir / ('protein_expr_%s.pkl' % expr_db)
    coexprFile = coexprDir / ('interactome_coexpr_%s.pkl' % expr_db)

    # create directories if not existing; exist_ok avoids the race between
    # checking for the directory and creating it
    os.makedirs(coexprDir, exist_ok=True)
    os.makedirs(figDir, exist_ok=True)

    #------------------------------------------------------------------------------------
    # load reference and structural interactomes
    #------------------------------------------------------------------------------------

    interactome = pd.read_table(interactomeFile)
    print('\n' + 'reference interactome:')
    print('%d PPIs' % len(interactome))
    print('%d proteins' %
          len(set(interactome[["Protein_1", "Protein_2"]].values.flatten())))

    structuralInteractome = read_single_interface_annotated_interactome(
        structuralInteractomeFile)
    print('\n' + 'interface-annotated interactome:')
    print('%d PPIs' % len(structuralInteractome))
    print('%d proteins' % len(
        set(structuralInteractome[["Protein_1", "Protein_2"
                                   ]].values.flatten())))

    #------------------------------------------------------------------------------------
    # Produce tissue expression dictionary
    #------------------------------------------------------------------------------------

    # Database names are compared with ==, not `is`: identity comparison of
    # strings only works by accident of interning and is not guaranteed.
    if not proteinExprFile.is_file():
        print('\n' + 'producing protein tissue expression dictionary')
        if expr_db == 'Illumina':
            produce_illumina_expr_dict(illuminaExprFile,
                                       uniprotIDmapFile,
                                       proteinExprFile,
                                       headers=list(range(1, 18)))
        elif expr_db == 'GTEx':
            produce_gtex_expr_dict(gtexDir,
                                   uniprotIDmapFile,
                                   proteinExprFile,
                                   uniprotIDlistFile=uniqueGeneSwissProtIDFile)
        elif expr_db == 'HPA':
            produce_hpa_expr_dict(hpaExprFile, uniprotIDmapFile,
                                  proteinExprFile)
        elif expr_db == 'Fantom5':
            produce_fantom5_expr_dict(
                fantomExprFile,
                uniprotIDmapFile,
                proteinExprFile,
                sampleTypes='tissues',
                sampleTypeFile=fantomSampleTypeFile,
                uniprotIDlistFile=uniqueGeneSwissProtIDFile)
        else:
            # fail here with a clear message rather than later when the
            # never-produced pickle cannot be opened
            raise ValueError('unsupported tissue expression database: %s' %
                             expr_db)

    with open(proteinExprFile, 'rb') as f:
        expr = pickle.load(f)

    # HPA expression levels are categorical labels; map them to numbers and
    # turn any label outside the map into NaN
    if expr_db == 'HPA':
        exprMap = {'Not detected': 0, 'Low': 1, 'Medium': 2, 'High': 3}
        for k, v in expr.items():
            expr[k] = np.array([exprMap.get(e, np.nan) for e in v])

    #-----------------------------------------------------------------------------------------
    # Calculate tissue co-expression for random interactions
    #-----------------------------------------------------------------------------------------

    proteins = list(
        set(interactome[["Protein_1", "Protein_2"]].values.flatten()))
    randPairs = pd.DataFrame()
    randPairs["Protein_1"], randPairs["Protein_2"] = zip(
        *sample_random_pairs(proteins, numRandPairs))
    # a protein paired with itself is not a valid random pair
    randPairs = randPairs[randPairs["Protein_1"] != randPairs["Protein_2"]]
    randPairs["coexpr"] = randPairs.apply(lambda x: coexpr(
        x["Protein_1"], x["Protein_2"], expr, minTissues=coexprMinTissues),
                                          axis=1)
    randPairs = randPairs[~np.isnan(randPairs["coexpr"])].reset_index(
        drop=True)

    #-----------------------------------------------------------------------------------------
    # Calculate tissue co-expression for all interaction partners in the reference interactome
    #-----------------------------------------------------------------------------------------

    refPPIs = pd.DataFrame()
    refPPIs["Protein_1"] = interactome["Protein_1"].tolist()
    refPPIs["Protein_2"] = interactome["Protein_2"].tolist()
    refPPIs["coexpr"] = refPPIs.apply(lambda x: coexpr(
        x["Protein_1"], x["Protein_2"], expr, minTissues=coexprMinTissues),
                                      axis=1)
    refPPIs = refPPIs[~np.isnan(refPPIs["coexpr"])].reset_index(drop=True)

    #------------------------------------------------------------------------------------------
    # Calculate tissue co-expression for all interaction partners in the structural interactome
    #------------------------------------------------------------------------------------------

    strucPPIs = pd.DataFrame()
    strucPPIs["Protein_1"] = structuralInteractome["Protein_1"].tolist()
    strucPPIs["Protein_2"] = structuralInteractome["Protein_2"].tolist()
    strucPPIs["coexpr"] = strucPPIs.apply(lambda x: coexpr(
        x["Protein_1"], x["Protein_2"], expr, minTissues=coexprMinTissues),
                                          axis=1)
    strucPPIs = strucPPIs[~np.isnan(strucPPIs["coexpr"])].reset_index(
        drop=True)

    #-------------------------------------------------------------------------------------
    # Save tissue co-expression results to file
    #-------------------------------------------------------------------------------------

    allcoexpr = {
        'Random interactions': randPairs,
        'Reference interactome': refPPIs,
        'Structural interactome': strucPPIs
    }
    with open(coexprFile, 'wb') as fout:
        pickle.dump(allcoexpr, fout)

    #------------------------------------------------------------------------------------
    # Compare tissue co-expression of interaction partners between reference interactome,
    # structural interactome and random interactions
    #------------------------------------------------------------------------------------

    # co-expression values as plain lists (NaN rows were already dropped above)
    randCoexpr = randPairs["coexpr"].tolist()
    refCoexpr = refPPIs["coexpr"].tolist()
    strucCoexpr = strucPPIs["coexpr"].tolist()

    # print results
    print('\n' + 'Mean tissue co-expression for interaction partners:')
    print('Random pairs: %f (SE = %g, n = %d)' %
          (np.mean(randCoexpr), sderror(randCoexpr), len(randCoexpr)))
    print('Reference interactome: %f (SE = %g, n = %d)' %
          (np.mean(refCoexpr), sderror(refCoexpr), len(refCoexpr)))
    print('Structural interactome: %f (SE = %g, n = %d)' %
          (np.mean(strucCoexpr), sderror(strucCoexpr), len(strucCoexpr)))
    print('\n' + 'Statistical significance')
    print('reference interactome vs random interactions:')
    bootstrap_test(refCoexpr, randCoexpr, iter=100000)
    print('structural interactome vs reference interactome:')
    bootstrap_test(strucCoexpr, refCoexpr, iter=100000)

    # plot results
    bar_plot([np.mean(randCoexpr),
              np.mean(refCoexpr),
              np.mean(strucCoexpr)],
             [sderror(randCoexpr),
              sderror(refCoexpr),
              sderror(strucCoexpr)],
             xlabels=[
                 'Random\ninteractions', 'Reference\ninteractome',
                 'Structural\ninteractome'
             ],
             ylabel='Tissue co-expression of\ninteraction partners',
             colors=interactome_colors,
             edgecolor='k',
             barwidth=0.5,
             fontsize=24,
             show=showFigs,
             figdir=figDir,
             figname='interactome_coexpression_%s' % expr_db)
def _mutation_ddg_table(mutationsDDG):
    """Flatten a mutation ∆∆G dictionary into a one-row-per-entry DataFrame.

    Each row is the dictionary key concatenated with its value (``key + value``),
    so together they must supply the nine columns listed below, in order.
    The frame is built in a single constructor call rather than grown row by
    row, which avoids re-allocating the frame for every mutation and lets
    pandas infer a numeric dtype for numeric columns.
    """
    columns = [
        "protein", "partner", "protein_pos", "mut_res", "pdb_id", "chain_id",
        "chain_partner", "chain_mut", "ddg"
    ]
    rows = [key + value for key, value in mutationsDDG.items()]
    return pd.DataFrame(rows, columns=columns)


def main():
    """Physics-based (∆∆G) prediction of PPI perturbations by mutations.

    Steps, in order:
      1. Read binding ∆∆G values for non-disease and disease mutations, save
         them as tab-separated tables, and print / plot summary statistics
         (mean ∆∆G, histogram, fraction of values above ``ddgCutoff``).
      2. Load the geometry-based perturbation predictions and refine them with
         ``energy_based_perturbation``; the result is pickled to disk. If the
         geometry-based prediction file is missing, a message is printed and
         the function returns after step 1.
      3. Optionally (``plot_perturbations``) build and plot the perturbed
         networks and write the edge/node files for Cytoscape.

    All settings are hard-coded at the top of the function body.
    """

    # reference interactome name
    # options: HI-II-14, IntAct
    interactome_name = 'HI-II-14'

    # structural interactome names
    struc_name = {'HI-II-14': 'Y2H-SI', 'IntAct': 'IntAct-SI'}

    # method of calculating mutation ∆∆G for which results will be used
    # options: bindprofx, foldx
    ddg_method = 'bindprofx'

    # Minimum reduction in binding free energy DDG required for interaction perturbation
    ddgCutoff = 0.5

    # % confidence interval
    # NOTE(review): not referenced anywhere in this function; kept for parity
    # with the other scripts' settings.
    CI = 95

    # plot perturbed interactome and produce files for use by Cytoscape
    plot_perturbations = False

    # show figures
    showFigs = False

    # parent directory of all data files
    dataDir = Path('../data')

    # parent directory of all processed data files
    procDir = dataDir / 'processed'

    # directory of processed data files specific to interactome
    interactomeDir = procDir / interactome_name

    # directory of network perturbation output data files for use by Cytoscape
    cytoscapeDir = interactomeDir / 'cytoscape'

    # figure directory
    figDir = Path('../figures') / interactome_name

    # input data files
    geometryPerturbsFile = interactomeDir / 'unique_mutation_perturbs_geometry.pkl'
    natMutDDGinFile = interactomeDir / ('nondisease_mutations_%s_ddg.txt' %
                                        ddg_method)
    disMutDDGinFile = interactomeDir / ('disease_mutations_%s_ddg.txt' %
                                        ddg_method)
    interfaceAnnotatedInteractomeFile = interactomeDir / 'human_interface_annotated_interactome.txt'

    # output data files
    natMutDDGoutFile = interactomeDir / ('nondisMut_%s_∆∆G_used.txt' %
                                         ddg_method)
    disMutDDGoutFile = interactomeDir / ('disMut_%s_∆∆G_used.txt' % ddg_method)
    physicsPerturbsFile = interactomeDir / (
        'mutation_perturbs_physics_%s.pkl' % ddg_method)
    naturalMutEdgeFile = cytoscapeDir / (
        'nondiseaseMut_perturbed_edges_physics_%s' % ddg_method)
    naturalMutNodeFile = cytoscapeDir / (
        'nondiseaseMut_node_colors_physics_%s' % ddg_method)
    diseaseMutEdgeFile = cytoscapeDir / (
        'diseaseMut_perturbed_edges_physics_%s' % ddg_method)
    diseaseMutNodeFile = cytoscapeDir / ('diseaseMut_node_colors_physics_%s' %
                                         ddg_method)

    # create output directories if not existing
    # (exist_ok avoids the check-then-create race of exists() + makedirs())
    for outDir in (interactomeDir, cytoscapeDir, figDir):
        outDir.mkdir(parents=True, exist_ok=True)

    #------------------------------------------------------------------------------------
    # Fraction of mutation-targeted PPIs with ∆∆G exceeding a specified cutoff
    #------------------------------------------------------------------------------------

    # read change in binding free energy for interfacial mutations mapped on PDB chains
    naturalMutationsDDG = read_protein_mutation_ddg(natMutDDGinFile, 'binding')
    diseaseMutationsDDG = read_protein_mutation_ddg(disMutDDGinFile, 'binding')

    # one row per dictionary entry: key fields followed by value fields
    naturalMutations = _mutation_ddg_table(naturalMutationsDDG)
    diseaseMutations = _mutation_ddg_table(diseaseMutationsDDG)

    # keep a record of exactly which ∆∆G values were used
    naturalMutations.to_csv(natMutDDGoutFile, index=False, sep='\t')
    diseaseMutations.to_csv(disMutDDGoutFile, index=False, sep='\t')

    natMutDDGs = naturalMutations["ddg"].values
    disMutDDGs = diseaseMutations["ddg"].values

    numNatural_ddg_considered = len(natMutDDGs)
    numDisease_ddg_considered = len(disMutDDGs)

    print('\n' +
          'Avg. change in binding energy (∆∆G) for mutation-targeted PPIs:')
    print(
        'Non-disease: %.1f (SE = %g, n = %d)' %
        (np.mean(natMutDDGs), sderror(natMutDDGs), numNatural_ddg_considered))

    print(
        'Disease: %.1f (SE = %g, n = %d)' %
        (np.mean(disMutDDGs), sderror(disMutDDGs), numDisease_ddg_considered))
    # Statistical significance of difference in means
    t_test(natMutDDGs, disMutDDGs)

    multi_histogram_plot([disMutDDGs, natMutDDGs], ['red', 'green'],
                         xlabel='Change in binding free energy (∆∆G)',
                         ylabel='Number of mutations',
                         leg=[
                             'Disease interfacial mutations',
                             'Non-disease interfacial mutations'
                         ],
                         bins=25,
                         alpha=0.7,
                         fontsize=24,
                         show=showFigs,
                         figdir=figDir,
                         figname='mut_ddg_histogram_%s' % ddg_method)

    # number and fraction of ∆∆G values strictly above the cutoff
    numNatural_ddg = sum(natMutDDGs > ddgCutoff)
    numDisease_ddg = sum(disMutDDGs > ddgCutoff)
    fracNatural_ddg = numNatural_ddg / numNatural_ddg_considered
    fracDisease_ddg = numDisease_ddg / numDisease_ddg_considered
    fracNatural_ddg_error = sderror_on_fraction(numNatural_ddg,
                                                numNatural_ddg_considered)
    fracDisease_ddg_error = sderror_on_fraction(numDisease_ddg,
                                                numDisease_ddg_considered)

    print('\n' +
          'Fraction of mutation-targeted PPIs with ∆∆G > %.1f kcal/mol:' %
          ddgCutoff)
    print('Non-disease: %.3f (SE = %g, ntot = %d)' %
          (fracNatural_ddg, fracNatural_ddg_error, numNatural_ddg_considered))

    print('Disease: %.3f (SE = %g, ntot = %d)' %
          (fracDisease_ddg, fracDisease_ddg_error, numDisease_ddg_considered))

    # Statistical significance of difference in fractions
    fisher_test([numNatural_ddg, numNatural_ddg_considered - numNatural_ddg],
                [numDisease_ddg, numDisease_ddg_considered - numDisease_ddg])

    bar_plot([fracNatural_ddg, fracDisease_ddg],
             error=[fracNatural_ddg_error, fracDisease_ddg_error],
             xlabels=('%s PPIs\nwith non-disease\nmutations\nat interface' %
                      struc_name[interactome_name],
                      '%s PPIs\nwith disease\nmutations\nat interface' %
                      struc_name[interactome_name]),
             ylabel=('Fraction with ∆∆G > %.1f kcal / mol' % ddgCutoff),
             ylabels=[0, 0.2, 0.4, 0.6, 0.8],
             ylim=[0, 0.8],
             colors=['turquoise', 'magenta'],
             edgecolor='black',
             ewidth=2.5,
             barwidth=0.6,
             fontsize=24,
             capsize=10,
             msize=26,
             show=showFigs,
             figdir=figDir,
             figname='mut_ddg_frac_>%.1f_%s' % (ddgCutoff, ddg_method))

    #------------------------------------------------------------------------------------
    # predict PPI perturbations
    #------------------------------------------------------------------------------------

    # the physics-based prediction refines the geometry-based one, so there is
    # nothing further to do without that file
    if not geometryPerturbsFile.is_file():
        print('\n' +
              'Geometry-based PPI perturbation prediction file not found')
        return

    print('\n' + 'Loading geometry-based PPI perturbation predictions')
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this pipeline.
    with open(geometryPerturbsFile, 'rb') as f:
        naturalPerturbs, diseasePerturbs = pickle.load(f)

    # energy_based_perturbation returns three values; only the first (the
    # updated perturbations) is used here, the other two are discarded.
    print(
        '\n' +
        'Performing physics-based edgotype prediction for non-disease mutations'
    )
    naturalPerturbs["perturbations"], _, _ = energy_based_perturbation(
        naturalPerturbs, naturalMutationsDDG, ddgCutoff)
    print('\n' +
          'Performing physics-based edgotype prediction for disease mutations')
    diseasePerturbs["perturbations"], _, _ = energy_based_perturbation(
        diseasePerturbs, diseaseMutationsDDG, ddgCutoff)
    with open(physicsPerturbsFile, 'wb') as fOut:
        pickle.dump([naturalPerturbs, diseasePerturbs], fOut)

    #------------------------------------------------------------------------------------
    # plot network perturbations
    #------------------------------------------------------------------------------------

    if plot_perturbations:
        structuralInteractome = read_interface_annotated_interactome(
            interfaceAnnotatedInteractomeFile)

        print('\n' + 'Creating network perturbed by non-disease mutations')
        nodes, edges, nodeColors, edgeColors = create_perturbed_network(
            structuralInteractome, naturalPerturbs, naturalMutEdgeFile,
            naturalMutNodeFile)
        network_plot(
            edges,
            nodes=nodes,
            nodeSizes=[20] * len(nodes),
            edgeWidth=1,
            nodeColors=nodeColors,
            edgeColors=edgeColors,
            show=showFigs,
            figdir=figDir,
            figname='nondisease_mut_perturbed_interactome_physics_%s' %
            ddg_method)

        print('\n' + 'Creating network perturbed by disease mutations')
        nodes, edges, nodeColors, edgeColors = create_perturbed_network(
            structuralInteractome, diseasePerturbs, diseaseMutEdgeFile,
            diseaseMutNodeFile)
        network_plot(edges,
                     nodes=nodes,
                     nodeSizes=[20] * len(nodes),
                     edgeWidth=1,
                     nodeColors=nodeColors,
                     edgeColors=edgeColors,
                     show=showFigs,
                     figdir=figDir,
                     figname='disease_mut_perturbed_interactome_physics_%s' %
                     ddg_method)