Beispiel #1
0
def evaluate(activity_arr):
    """Print AUC, enrichment factors, RIE and BEDROC for a ranking.

    Each metric is computed with the ``Scoring`` module and printed right
    after it is computed.

    :param activity_arr: ranking passed unchanged to every ``Scoring``
        function together with column index 0 (presumably rows sorted by
        score whose first element is the activity flag -- TODO confirm)
    :return: None
    """
    column = 0
    print("AUC: ", Scoring.CalcAUC(activity_arr, column))
    # One enrichment call per fraction, each reported on its own line.
    for label, fraction in (("EF for 1%: ", 0.01), ("EF for 5%: ", 0.05)):
        enrichment = Scoring.CalcEnrichment(activity_arr, column, [fraction])
        print(label, enrichment[0])
    print("RIE for 100: ", Scoring.CalcRIE(activity_arr, column, 100))
    print("BEDROC for 100: ", Scoring.CalcBEDROC(activity_arr, column, 100))
def evaluation(activity_arr: list, output_file: str):
    """Compute ranking metrics and store them in a JSON file.

    The parent directory of ``output_file`` is created through
    ``inputoutput_utils.create_parent_directory`` before anything is computed.
    The written object has the keys ``AUC``, ``EF1``, ``EF5``, ``RIE`` and
    ``BEDROC``.

    :param activity_arr: ranking passed to the ``Scoring`` functions together
        with column index 0
    :param output_file: path of the JSON file to write (UTF-8)
    :return: None
    """
    inputoutput_utils.create_parent_directory(output_file)
    column = 0
    metrics = {"AUC": Scoring.CalcAUC(activity_arr, column)}
    # Enrichment factors at 1% and 5% of the ranking.
    for key, fraction in (("EF1", 0.01), ("EF5", 0.05)):
        metrics[key] = Scoring.CalcEnrichment(
            activity_arr, column, [fraction])[0]
    metrics["RIE"] = Scoring.CalcRIE(activity_arr, column, 100)
    metrics["BEDROC"] = Scoring.CalcBEDROC(activity_arr, column, 100)
    with open(output_file, "w", encoding="utf-8") as stream:
        json.dump(metrics, stream)
def evaluation(activity_arr: list, output_file: str):
    """Compute ranking metrics and write them to a plain-text report.

    The file is opened first and each metric is written as soon as it has
    been computed, one ``label: value`` entry per line.

    :param activity_arr: ranking passed to the ``Scoring`` functions together
        with column index 0
    :param output_file: path of the text file to write
    :return: None
    """
    column = 0
    with open(output_file, "w") as report:
        auc_value = Scoring.CalcAUC(activity_arr, column)
        report.write("AUC: ")
        report.write(str(auc_value))
        # Enrichment factors at 1% and 5% of the ranking.
        for label, fraction in (("\nEF for 1%: ", 0.01),
                                ("\nEF for 5%: ", 0.05)):
            enrichment = Scoring.CalcEnrichment(
                activity_arr, column, [fraction])
            report.write(label)
            report.write(str(enrichment[0]))
        rie_value = Scoring.CalcRIE(activity_arr, column, 100)
        report.write("\nRIE for 100: ")
        report.write(str(rie_value))
        bedroc_value = Scoring.CalcBEDROC(activity_arr, column, 100)
        report.write("\nBEDROC for 100: ")
        report.write(str(bedroc_value))
Beispiel #4
0
def metrics_for_target(pred, actual, mask):
    """Compute ranking metrics for the samples selected by ``mask``.

    The selected rows of ``actual`` are reordered by descending prediction
    and handed to the ``Scoring`` functions with column index 0.

    :param pred: predictions; ``squeeze()`` is applied before masking
        (presumably shape (n,) or (n, 1) -- TODO confirm)
    :param actual: array indexed with the boolean mask and then with the
        sort order (presumably activity labels in column 0 -- TODO confirm)
    :param mask: per-sample flags, converted to a boolean array
    :return: the result of ``CalcEnrichment`` for the fractions
        0.001, 0.005, 0.01 and 0.05, concatenated with
        ``[AUC, BEDROC]`` (BEDROC called with 20 as its third argument)
    """
    # ``np.bool`` was removed in NumPy 1.24; the builtin ``bool`` is the
    # supported spelling and yields the same dtype.
    mask = np.array(mask, dtype=bool)
    masked_preds = pred.squeeze()[mask]
    # argsort is ascending, so flip it to rank the highest prediction first.
    order = np.flipud(np.argsort(masked_preds))
    masked_ordered_actual = actual[mask][order]
    enrichments = Scoring.CalcEnrichment(
        masked_ordered_actual, 0, [.001, .005, .01, .05])
    return enrichments + [
        Scoring.CalcAUC(masked_ordered_actual, 0),
        Scoring.CalcBEDROC(masked_ordered_actual, 0, 20),
    ]
Beispiel #5
0
 def test1(self):
     """Check Scoring.CalcEnrichment on the fixture rankings and on invalid fractions."""

     def first_ef(ranking, fractions):
         # Enrichment value returned for the first requested fraction.
         return Scoring.CalcEnrichment(ranking, self.index, fractions)[0]

     # best case
     self.assertAlmostEqual(first_ef(self.scoreBestCase, self.fractions),
                            float(self.numActives), self.acc)
     # worst case
     self.assertAlmostEqual(first_ef(self.scoreWorstCase, self.fractions),
                            0.0, self.acc)
     # empty list
     with self.assertRaises(ValueError):
         Scoring.CalcEnrichment(self.scoreEmptyList, self.index, self.fractions)
     # all actives
     self.assertAlmostEqual(first_ef(self.scoreAllActives, self.fractions),
                            1.0, self.acc)
     # all decoys
     self.assertEqual(first_ef(self.scoreAllDecoys, self.fractions), 0.0)
     # fraction * numMol is smaller than 1
     self.assertAlmostEqual(first_ef(self.scoreBestCase, self.fracSmall),
                            float(self.numActives), self.acc)
     # fraction list is empty
     with self.assertRaises(ValueError):
         Scoring.CalcEnrichment(self.scoreBestCase, self.index, [])
     # fraction == 0.0
     self.assertAlmostEqual(first_ef(self.scoreBestCase, [0.0]),
                            float(self.numActives), self.acc)
     # fractions outside [0, 1] are rejected: first < 0, then > 1
     for invalid_fraction in (-0.05, 1.5):
         with self.assertRaises(ValueError):
             Scoring.CalcEnrichment(self.scoreBestCase, self.index,
                                    [invalid_fraction])
 def calculate(self, score, index):
     """Return ``Scoring.CalcEnrichment`` of ``score`` for the fractions in ``self.params``.

     :param score: ranking forwarded unchanged to ``CalcEnrichment``
     :param index: column index forwarded unchanged to ``CalcEnrichment``
     :return: whatever ``CalcEnrichment`` returns for these arguments
     """
     fractions = self.params
     return Scoring.CalcEnrichment(score, index, fractions)
Beispiel #7
0
    # NOTE(review): this is the tail of a loop body whose header is not
    # visible here; ``i`` is presumably the loop variable -- confirm.
    # Append the inactive training fingerprints to the training set.
    train_fps += [fps_inact[j] for j in train_indices_inact]
    # Labels: 1 for every active training index, 0 for every inactive one.
    ys_fit = [1] * len(train_indices_act) + [0] * len(train_indices_inact)
    # train the model
    ml = BernoulliNB()
    ml.fit(train_fps, ys_fit)

    # chemical similarity
    # NOTE(review): unpickling executes arbitrary code from the file; only
    # load ``infile`` if it comes from a trusted source.
    simil = cPickle.load(infile)

    # ranking: test fingerprints, actives first, then inactives
    test_fps = [fps_act[j] for j in test_indices_act[i]]
    test_fps += [fps_inact[j] for j in test_indices_inact[i]]
    # One row per test molecule: second column of predict_proba followed by
    # the first two entries of the matching ``simil`` item (presumably the
    # similarity and the activity info -- TODO confirm).
    scores = [[pp[1], s[0], s[1]]
              for pp, s in zip(ml.predict_proba(test_fps), simil)]

    # write ranks for actives
    cf.writeActiveRanks(scores, rankfile, num_actives)

    # Descending sort; rows are lists, so they compare element by element
    # starting with the predicted probability.
    scores.sort(reverse=True)

    # evaluation: column -1 (the last entry of each row) is handed to the
    # Scoring functions as the activity column
    auc = Scoring.CalcAUC(scores, -1)
    ef = Scoring.CalcEnrichment(scores, -1, [0.05])

    # write out: index, AUC and the enrichment for fraction 0.05, tab separated
    outfile.write("%i\t%.10f\t%.10f\n" % (i, auc, ef[0]))

# Release the file handles once the loop has finished.
infile.close()
rankfile.close()
outfile.close()
def screening(input_path, input_directory, ged_results_file, output_path=None):
    """Perform a virtual screening ranked by GED-based similarity.

    Every test molecule is scored with its highest ``_ged_similarity`` to any
    training ligand. The ranking is evaluated with AUC and enrichment factors,
    the results are printed and, when ``output_path`` is given, written to a
    JSON file.

    The input JSON is read through the keys ``files``, ``data.train.ligands``,
    ``data.test`` and ``info``.

    :param input_path: input .json file with basic screening params
    :param input_directory: directory with .sdf files; it is concatenated
        directly with the file name, so it must end with a path separator
    :param ged_results_file: .json file with GED results and parameters
    :param output_path: path of the .json report; nothing is written when it
        is None or empty
    :return: None
    :raises Exception: if one of the listed .sdf files does not exist
    """
    with open(input_path) as input_stream:
        input_data = json.load(input_stream)
    # Load molecules.
    logging.info('Loading molecules ...')
    molecules = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        if not os.path.exists(path):
            logging.error('Missing file: %s', path)
            raise Exception('Missing file.')
        molecules.update(_load_molecules(path))
    # Parse ged results file
    with open(ged_results_file) as ged_stream:
        ged_data = json.load(ged_stream)
    # Create representation of active molecules; ligands that were not
    # loaded from any .sdf file are skipped.
    actives = [
        molecules[active['name']]
        for active in input_data['data']['train']['ligands']
        if active['name'] in molecules
    ]
    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    time_begin = time.perf_counter()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        # Find the most similar active molecule. When no active scores above
        # zero the query itself is reported as its "most similar" molecule.
        similarity = 0
        similar_mol = query
        for active in actives:
            current_similarity = _ged_similarity(query, active, ged_data)
            if current_similarity > similarity:
                similarity = current_similarity
                similar_mol = active
        scores.append({
            'name': item['name'],
            'similarity': similarity,
            'activity': item['activity'],
            'most-similar-active': similar_mol.GetProp("_Name")
        })
        # counter only advances for molecules that were actually scored,
        # while counter_max counts every test item.
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
        counter += 1
        logging.debug('counter: %d', counter)
    time_end = time.perf_counter()
    # Evaluate screening.
    scores = sorted(scores, key=lambda m: m['similarity'], reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])
    # Print results.
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    # The GED time is divided by 1000 before the screening time in seconds is
    # added (presumably it is stored in milliseconds -- TODO confirm).
    total_time = float(ged_data["properties"]["time"]) / 1000
    total_time += (time_end - time_begin)
    print('Execution time : %.2fs' % total_time)
    # Write result to a file.
    if output_path:
        output_dir = os.path.dirname(output_path)
        # dirname is '' for a bare file name and os.makedirs('') raises, so
        # only create the directory when there is one.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w') as output_stream:
            json.dump(
                {
                    'properties': ged_data["properties"],
                    'data': scores,
                    'metadata': {
                        'auc': auc,
                        'ef': {
                            '0.005': ef[0],
                            '0.01': ef[1],
                            '0.02': ef[2],
                            '0.05': ef[3]
                        },
                        'fileName': os.path.basename(__file__),
                        'executionTime': total_time,
                        'definition': {
                            'selection': input_data['info']['selection'],
                            'molecules': input_data['info']['molecules'],
                            'index': input_data['info']['index'],
                            'dataset': input_data['info']['dataset'],
                            'method': input_data['info']['method'],
                            # Literal placeholder kept from the original.
                            'config': 'config_file'
                        }
                    }
                },
                output_stream,
                indent=2)
def screening(input_dir, input_directory, config_file, output_path=None):
    """Perform a virtual screening ranked by MCS-based similarity.

    Every test molecule is scored with its highest ``mcsutils._similarity``
    to any training ligand. The ranking is evaluated with AUC and enrichment
    factors, the results are printed and, when ``output_path`` is given,
    written to a JSON file.

    The input JSON is read through the keys ``files``, ``data.train.ligands``,
    ``data.test`` and ``info``.

    :param input_dir: path to input data (training and test in .json)
    :param input_directory: path to sdf files; it is concatenated directly
        with the file name, so it must end with a path separator
    :param config_file: configuration file of mcs
    :param output_path: path of the .json report; nothing is written when it
        is None or empty
    :return: None
    :raises Exception: if one of the listed .sdf files does not exist
    :raises ValueError: from ``max`` if a test molecule is scored while no
        training ligand was loaded
    """
    with open(input_dir) as input_stream:
        input_data = json.load(input_stream)
    # Load molecules.
    logging.info('Loading molecules ...')
    molecules = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        if not os.path.exists(path):
            logging.error('Missing file: %s', file_item)
            raise Exception('Missing file.')
        molecules.update(_load_molecules(path))
    # Create representation of active molecules; ligands that were not
    # loaded from any .sdf file are skipped.
    actives = [
        molecules[active['name']]
        for active in input_data['data']['train']['ligands']
        if active['name'] in molecules
    ]
    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    # NOTE(review): ``inexact`` is an int passed by value and never
    # reassigned here, so 'inexactMolecules' is always reported as 0.
    inexact = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    params = mcsutils._parse_config(config_file)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    time_begin = time.perf_counter()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        similarity = max(
            mcsutils._similarity(query, active, inexact,
                                 input_data['info'], params)
            for active in actives)
        scores.append({
            'name': item['name'],
            'similarity': similarity,
            'activity': item['activity']
        })
        # counter only advances for molecules that were actually scored,
        # while counter_max counts every test item.
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
        counter += 1
    time_end = time.perf_counter()
    elapsed = time_end - time_begin
    # Evaluate screening.
    scores = sorted(scores,
                    key=lambda m: m['similarity'],
                    reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])
    # Print results.
    print('Input file: ', input_dir)
    print('Difficulty: ', input_directory)
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    print('Execution time : %.2fs' % elapsed)
    # Write result to a file.
    if output_path:
        output_dir = os.path.dirname(output_path)
        # dirname is '' for a bare file name and os.makedirs('') raises, so
        # only create the directory when there is one.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w') as output_stream:
            json.dump({
                'data': scores,
                'metadata': {
                    'auc': auc,
                    'ef': {
                        '0.005': ef[0],
                        '0.01': ef[1],
                        '0.02': ef[2],
                        '0.05': ef[3]
                    },
                    'fileName': os.path.basename(__file__),
                    'executionTime': elapsed,
                    'inexactMolecules': inexact,
                    'definition': {
                        'selection': input_data['info']['selection'],
                        'molecules': input_data['info']['molecules'],
                        'index': input_data['info']['index'],
                        'dataset': input_data['info']['dataset'],
                        'method': input_data['info']['method'],
                        # NOTE(review): this writes the literal string, not
                        # the ``config_file`` argument -- confirm the intent.
                        'config': 'config_file'
                    }
                }
            }, output_stream, indent=2)
Beispiel #10
0
    # NOTE(review): this is the tail of a loop body whose header is not
    # visible here; ``i`` is presumably the loop variable -- confirm.
    # Test fingerprints, actives first, then inactives.
    test_fps = [fps_act_morgan2[j] for j in test_indices_act[i]]
    test_fps += [fps_inact_morgan2[j] for j in test_indices_inact[i]]
    # One row per test molecule: second column of predict_proba followed by
    # the first two entries of the matching ``simil`` item.
    scores_rf_morgan2 = [[
        pp[1], s[0], s[1]
    ] for pp, s in zip(rf_morgan2.predict_proba(test_fps), simil)]

    # assign ranks (rows afterwards are read as [rank, proba, simil, info],
    # judging by how they are indexed below -- confirm against cf)
    scores_rf_rdk5 = cf.assignRanksWithInfo(scores_rf_rdk5)
    scores_lr_rdk5 = cf.assignRanksWithInfo(scores_lr_rdk5)
    scores_rf_morgan2 = cf.assignRanksWithInfo(scores_rf_morgan2)

    # fusion: combine the three models row by row
    fusion_scores = []
    for m1, m2, m3 in zip(scores_rf_rdk5, scores_lr_rdk5, scores_rf_morgan2):
        rank = max([m1[0], m2[0], m3[0]])  # max. rank
        proba = max([m1[1], m2[1], m3[1]])  # max. probability
        # store: [max rank, max proba, simil, info]; the last two entries are
        # taken from the first model's row only
        fusion_scores.append([rank, proba, m1[2], m1[3]])
    # Descending sort; rows are lists, so they compare by rank first and by
    # probability on ties.
    fusion_scores.sort(reverse=True)

    # evaluation: column -1 (the last entry of each row) is handed to the
    # Scoring functions as the activity column
    auc = Scoring.CalcAUC(fusion_scores, -1)
    ef = Scoring.CalcEnrichment(fusion_scores, -1, [0.05])

    # write out: index, AUC and the enrichment for fraction 0.05, tab separated
    outfile.write("%i\t%.10f\t%.10f\n" % (i, auc, ef[0]))

# Release the file handles once the loop has finished.
infile1.close()
infile2.close()
outfile.close()
Beispiel #11
0
def run_ted(input_path, input_directory, prop, output_path):
    """Run a TED-based virtual screening and save the evaluation.

    Loads the .sdf files through ``_load_molecules``, scores every test
    molecule with its highest TED-derived similarity to any training ligand,
    evaluates the ranking with AUC and enrichment factors, prints the results
    and writes them to a JSON file.

    The similarity of a pair is ``1 - ted / (sum of both molecules' sizes and
    bond sizes)``.

    :param input_path: input .json file; read through the keys ``files``,
        ``data.train.ligands``, ``data.test`` and ``info``
    :param input_directory: directory with .sdf files; it is concatenated
        directly with the file name, so it must end with a path separator
    :param prop: value forwarded unchanged to ``_load_molecules`` and ``_ted``
    :param output_path: path of the .json report; also handed to
        ``_flush_results`` during screening. The final report is skipped when
        it is None or empty
    :return: None
    :raises Exception: if one of the listed .sdf files does not exist
    """
    with open(input_path) as input_stream:
        input_data = json.load(input_stream)

    # Load molecules and convert them to tree graphs.
    logging.info('Loading molecules ...')
    molecules = {}
    sizes = {}
    bond_sizes = {}
    for file_item in input_data['files']:
        path = input_directory + file_item + '.sdf'
        logging.debug(path)
        if not os.path.exists(path):
            logging.error('Missing file: %s', file_item)
            raise Exception('Missing file.')
        # ``sizes`` and ``bond_sizes`` are passed in so the loader can fill
        # them; they are read per molecule name below.
        molecules.update(_load_molecules(path, sizes, bond_sizes, prop))

    # Screening.
    logging.info('Screening ...')
    scores = []
    counter = 0
    counter_max = len(input_data['data']['test'])
    counter_step = math.floor(counter_max / 100.0) + 1
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    time_begin = time.perf_counter()
    for item in input_data['data']['test']:
        if item['name'] not in molecules:
            continue
        query = molecules[item['name']]
        query_size = sizes[item['name']]
        query_bonds = bond_sizes[item['name']]
        # Count pairwise similarity with all actives and choose the maximum.
        maxsim = 0
        # Reset per query: previously this name was unbound (or still held
        # the previous query's value) when no active reached sim > 0.
        minted = None
        for active in input_data['data']['train']['ligands']:
            if active['name'] not in molecules:
                continue
            active_graph = molecules[active['name']]
            active_size = sizes[active['name']]
            active_bonds = bond_sizes[active['name']]
            ted = _ted(query, active_graph, prop)
            sim = 1.00 - ted / float(query_size + active_size + query_bonds +
                                     active_bonds)
            if sim > maxsim:
                maxsim = sim
                # TED of the most similar active found so far.
                minted = ted
        scores.append({
            'name': item['name'],
            'similarity': maxsim,
            'activity': item['activity'],
            'ted': minted
        })
        # counter only advances for molecules that were actually scored,
        # while counter_max counts every test item.
        if counter % counter_step == 0:
            logging.debug('%d/%d', counter, counter_max)
            _flush_results(output_path, scores)
        counter += 1
        logging.debug('counter: %d', counter)
    time_end = time.perf_counter()
    elapsed = time_end - time_begin
    logging.debug("Reached the end.")

    # Evaluate screening.
    scores = sorted(scores, key=lambda m: m['similarity'], reverse=True)
    auc = Scoring.CalcAUC(scores, 'activity')
    ef = Scoring.CalcEnrichment(scores, 'activity', [0.005, 0.01, 0.02, 0.05])

    # Print results.
    print('AUC : ', auc)
    print('EF (0.5%, 1.0%, 2.0%, 5.0%) : ', ef)
    print('Execution time : %.2fs' % elapsed)
    # Write result to a file.
    if output_path:
        output_dir = os.path.dirname(output_path)
        # dirname is '' for a bare file name and os.makedirs('') raises, so
        # only create the directory when there is one.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w') as output_stream:
            json.dump(
                {
                    'data': scores,
                    'metadata': {
                        'auc': auc,
                        'ef': {
                            '0.005': ef[0],
                            '0.01': ef[1],
                            '0.02': ef[2],
                            '0.05': ef[3]
                        },
                        'fileName': os.path.basename(__file__),
                        'executionTime': elapsed,
                        'definition': {
                            'selection': input_data['info']['selection'],
                            'molecules': input_data['info']['molecules'],
                            'index': input_data['info']['index'],
                            'dataset': input_data['info']['dataset'],
                            'method': input_data['info']['method'],
                            # Literal placeholder kept from the original.
                            'config': 'config_file'
                        }
                    }
                },
                output_stream,
                indent=2)