Code example #1
def load_fps(strucfile):
    fp_fname = os.path.splitext(strucfile)[0] + "_%s_new.fps" % fpname
    if os.path.isfile(fp_fname):
        #print "%s exists!" % fp_fname
        return chemfp.load_fingerprints(fp_fname)
    else:
        fparena = chemfp.load_fingerprints(
            chemfp.read_structure_fingerprints(fptype,
                                               source=strucfile))
        fparena.save(fp_fname)
        print "%s saved!" % fp_fname
        return fparena
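In the original project, fpname and fptype are module-level globals. A minimal driving sketch with hypothetical values, assuming chemfp 1.x with Open Babel installed:

import os
import chemfp

fpname = "FP2"              # hypothetical: used in the cache file name
fptype = "OpenBabel-FP2"    # hypothetical: any chemfp structure fingerprint type

arena = load_fps("compounds.sdf")   # computes and saves compounds_FP2_new.fps
arena = load_fps("compounds.sdf")   # second call loads the cached .fps file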
Code example #2
def test_arena_copy(self):
    data = "ABCD\t0\n" * 200
    from cStringIO import StringIO
    arena = chemfp.load_fingerprints(StringIO(data))
    def make_subarena_copy():
        arena[1:].copy()
    memory_growth(make_subarena_copy)
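memory_growth is a test helper that is not shown here. A hypothetical stand-in that fails when repeated calls keep allocating, assuming the Unix resource module (note ru_maxrss is kilobytes on Linux, bytes on macOS):

import resource

def memory_growth(func, n=1000, allowed_bytes=1000000):
    func()  # warm-up allocation
    before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024
    for _ in range(n):
        func()
    after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024
    assert after - before < allowed_bytes, "possible memory leak"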
Code example #3
def test_1_alignment(self):
    a = chemfp.load_fingerprints(StringIO(zeros),
                                 reorder=True,
                                 alignment=1)
    self.assertEquals(a.start_padding, 0)
    self.assertEquals(a.end_padding, 0)
    self.assertEquals(a.arena, "\x00\x00\x00\x00\x00\x10")
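The zeros test data is not included in these alignment snippets (it is a different zeros from the six-fingerprint arena in example #21). The expected arena bytes imply three 2-byte fingerprints with a single bit set in one of them, so a plausible reconstruction is:

# Inferred from the expected bytes: reorder=False keeps the input order
# (\x00\x00, \x00\x10, \x00\x00), while reorder=True sorts by popcount
# and moves the one-bit fingerprint to the end.
zeros = "0000\tA\n0010\tB\n0000\tC\n"

# ordered_zeros (used by the 4- and 16-byte alignment tests) is presumably
# the same three fingerprints already in popcount order.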
Code example #4
File: models.py Project: mshbeab/Chemvis3D
    def distance_matrix(self, t):

        fingerprints_file = self.compound_group.fp_file
        arena = chemfp.load_fingerprints(fingerprints_file)

        n = len(arena)

        # The Tanimoto search computes all of the scores when threshold=0.0.
        # The SearchResult contains sparse data, so I set all values
        # now to 1.0 so you can experiment with higher thresholds.
        distances = numpy.ones((n, n), numpy.float64)
        # Keep track of where the query subarena is in the query
        query_row = 0
        for query_arena in arena.iter_arenas():
            results = arena.threshold_tanimoto_search_arena(query_arena,
                                                            threshold=t)
            for q_i, hits in enumerate(results.iter_indices_and_scores()):
                query_idx = query_row + q_i
                for target_idx, score in hits:
                    distances[query_idx, target_idx] = 1 - score
            query_row += len(query_arena)

        #############################
        self.data.new_file()
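        # numpy's tostring() returns the raw matrix bytes (renamed tobytes() in newer numpy).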
        self.data.write(distances.tostring())
        self.data.close()
        self.save(validate=False, cascade=False)

        self.calc_knn_fp(arena)
Code example #5
def test_2_alignment(self):
    a = chemfp.load_fingerprints(StringIO(zeros),
                                 reorder=True,
                                 alignment=2)
    self.assertEquals(a.start_padding, 0)
    self.assertEquals(a.end_padding, 1)
    # The code overallocates one byte
    self.assertEquals(a.arena, "\x00\x00\x00\x00\x00\x10\x00")
Code example #6
def test_2_alignment(self):
    a = chemfp.load_fingerprints(StringIO(zeros),
                                 reorder=False,
                                 alignment=2)
    self.assertEquals(a.start_padding, 0)
    self.assertEquals(a.end_padding, 0)
    self.assertEquals(a.arena, "\x00\x00\x00\x10\x00\x00")
    self.assertEquals(a.storage_size, 2)
Code example #7
def _load(fingerprints, reorder):
    if len(fingerprints) == 0:
        num_bits = 16
    else:
        num_bits = len(fingerprints[0])*8
    id_fps = ((str(i), fp) for (i, fp) in enumerate(fingerprints))
    return chemfp.load_fingerprints(id_fps,
                                    metadata=chemfp.Metadata(num_bits=num_bits),
                                    reorder=reorder, alignment=1)
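A hypothetical call, showing that the ids are simply the list positions and that an empty input falls back to 16-bit metadata:

arena = _load(["\x00\x01", "\x00\x03", "\xff\xff"], reorder=True)
print(arena.ids)   # ['0', '1', '2']: already in popcount order here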
Code example #8
def get_coordinate(request):

    if request.is_ajax():
        # pdb.set_trace()
        mdsres = MDSRes.objects().with_id(request.session.get("mds_id"))
        Clusters = []
        compound_group = CompoundCollection.objects().with_id(request.session.get("collection_id"))
        if mdsres is not None:
            data = numpy.fromstring(mdsres.data.read())
            d = len(data)
            datamd = numpy.reshape(data, (d / 3, 3))
            compounds = Compound.objects(compound_group=request.session.get("collection_id"))
            if mdsres.simmatrix.method == "FP":
                arena = chemfp.load_fingerprints(mdsres.simmatrix.compound_group.fp_file)
                compounds = arena.ids
            elif mdsres.simmatrix.method == "eden":
                compounds = [compound.name for compound in compounds]
            elif mdsres.simmatrix.method in ("pre_cluster_eden", "pre_cluster_eden_pca"):
                Clusters = Cluster.objects(collection=request.session.get("collection_id"))
                Clusters = [{"nodes": [node.name for node in cluster.nodes],
                             "id": str(cluster.id),
                             "centriod": cluster.centriod.name,
                             "density": len(cluster.nodes)} for cluster in Clusters]
                compounds = Cluster.get_clusters_centriod(compound_group)
                compounds = [compound.name for compound in compounds]
        else:
            pcares = PCARes.objects().with_id(request.session.get("mds_id"))
            data = numpy.fromstring(pcares.data.read())
            d = len(data)
            datamd = numpy.reshape(data, (d / 3, 3))
            Clusters = Cluster.objects(collection=request.session.get("collection_id"))
            Clusters = [{"nodes": [node.name for node in cluster.nodes],
                         "id": str(cluster.id),
                         "centriod": cluster.centriod.name,
                         "density": len(cluster.nodes)} for cluster in Clusters]
            compounds = Cluster.get_clusters_centriod(compound_group)
            compounds = [compound.name for compound in compounds]
        # The order of the compounds is not correct!!! Make sure to fix
        # (arena.ids[idx] from the fingerprint file).

        response_data = {}
        response_data['result'] = 'Success'
        response_data['message'] = 'Compounds Coordinate'
        response_data['coord'] = datamd.tolist()
        response_data["clusters"] = Clusters
        response_data['comps'] = []
        for compound in compounds:
            response_data['comps'].append({"name": compound})

        return HttpResponse(json.dumps(response_data), mimetype="application/json")

    return HttpResponse({}, mimetype="application/json")
Code example #9
def main(args=None):
    args = parser.parse_args(args)

    if args.profile and psutil is None:
        sys.stderr.write(
            "WARNING: Must install the 'psutil' module to see memory statistics.\n"
        )

    # Load the fingerprints
    start_stats = get_profile_stats()
    try:
        arena = chemfp.load_fingerprints(args.fingerprint_filename)
    except IOError as err:
        sys.stderr.write("Cannot open fingerprint file: %s\n" % (err,))
        raise SystemExit(2)

    # Make sure I can generate output before doing the heavy calculations
    outfile, outfile_close = open_output(parser, args.output)

    try:
        load_stats = get_profile_stats()

        # Generate the NxN similarity matrix for the given threshold
        similarity_table = search.threshold_tanimoto_search_symmetric(
            arena, threshold=args.threshold)
        similarity_stats = get_profile_stats()

        # Do the clustering
        cluster_results = taylor_butina_cluster(similarity_table)
        cluster_stats = get_profile_stats()

        # Report the results
        report_cluster_results(cluster_results, arena, outfile)

        # Report the time and memory use.
        if args.profile:
            print("#fingerprints:",
                  len(arena),
                  "#bits/fp:",
                  arena.num_bits,
                  "threshold:",
                  args.threshold,
                  "#matches:",
                  similarity_table.count_all(),
                  file=sys.stderr)
            profile_report("Load", start_stats, load_stats)
            profile_report("Similarity", load_stats, similarity_stats)
            profile_report("Clustering", similarity_stats, cluster_stats)
            profile_report("Total", start_stats, get_profile_time())
    finally:
        outfile_close()
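get_profile_stats, get_profile_time, and profile_report are helpers from the same script and are not shown. Minimal stand-ins, assuming psutil is the optional module imported above:

import time

def get_profile_stats():
    # (wall-clock seconds, resident set size in bytes; 0 without psutil)
    rss = psutil.Process().memory_info().rss if psutil else 0
    return (time.time(), rss)

def get_profile_time():
    return get_profile_stats()

def profile_report(label, start, end):
    print("%s: %.2f s, %+d bytes RSS"
          % (label, end[0] - start[0], end[1] - start[1]), file=sys.stderr)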
Code example #10
    def test_16_alignment(self):
        arenas = [chemfp.load_fingerprints(StringIO(ordered_zeros), reorder=True, alignment=16)
                  for i in range(10)]
        for a in arenas:
            if a.start_padding == 0 and a.end_padding == 0:
                s = a.arena
            else:
                self.assertEquals(a.start_padding + a.end_padding + 1, 16)
                self.assertEquals(a.arena[:a.start_padding], "\x00" * a.start_padding)
                self.assertEquals(a.arena[-a.end_padding:], "\x00" * a.end_padding)
                s = a.arena[a.start_padding:-a.end_padding]

            self.assertEquals(s, ("\x00"*16 +
                                  "\x00"*16 +
                                  "\x00\x10" + "\x00"*14))
Code example #11
def view_mds_result(request):
    comps = Compound.objects
    mdsres = MDSRes.objects()
    data = numpy.fromstring(mdsres[0].data.read())
    d = len(data)
    datamd = numpy.reshape(data, (d / 3, 3))
    arena = chemfp.load_fingerprints(mdsres[0].simmatrix.fp)
    alldata = zip(datamd.tolist(), arena.ids)

    return render_to_response('view_mds.html', {
        "view_titel": "View MDS Result",
        "data": alldata,
        "Compounds": comps
    },
                              context_instance=RequestContext(request))
Code example #12
def test_8_alignment(self):
    arenas = [chemfp.load_fingerprints(StringIO(zeros), reorder=False, alignment=8)
              for i in range(10)]
    for a in arenas:
        if a.start_padding == a.end_padding == 0:
            s = a.arena
        else:
            self.assertEquals(a.arena[:a.start_padding], "\x00" * a.start_padding)
            self.assertEquals(a.arena[-a.end_padding:], "\x00" * a.end_padding)
            s = a.arena[a.start_padding:-a.end_padding]

        self.assertEquals(s,
                          "\x00\x00\x00\x00\x00\x00\x00\x00"
                          "\x00\x10\x00\x00\x00\x00\x00\x00"
                          "\x00\x00\x00\x00\x00\x00\x00\x00")
        self.assertEquals(a.storage_size, 8)
Code example #13
def make_fingerprint_arena(
    mols: Union[Molmap, Mapping[str, str]],
    fingerprint_type: str = "morgan",
    fingerprint_args: Mapping[str, Any] = {},
) -> chemfp.arena.FingerprintArena:
    fp_maker = chemfp_fingerprint_functions[fingerprint_type](
        fingerprint_args).make_fingerprinter()
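    # If the mapping values are RDKit Mol objects, fingerprint them here;
    # otherwise treat them as already-computed fingerprint strings to encode.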
    if isinstance(next(iter(mols.values())), Mol):
        fp_generator = ((str(n), fp_maker(m)) for n, m in mols.items())
        fp = fp_maker(next(iter(mols.values())))
    else:
        fp_generator = ((str(n), fp.encode()) for n, fp in mols.items())
        fp = next(iter(mols.values()))
    arena = chemfp.load_fingerprints(
        fp_generator, metadata=chemfp.Metadata(num_bits=len(fp) * 8))
    return arena
Code example #14
def search_pubchem(MACCS_bit, thr):
    """
    search
    """
    out = []
    converted = chemfp.encodings.from_binary_lsb(MACCS_bit)
    for i in range(5323):
        a = "./fps/Compound_" + str(i * 25000 + 1).zfill(9) + '_' + str(
            (i + 1) * 25000).zfill(9) + ".fps"
        try:
            arena = chemfp.load_fingerprints(a, reorder=False, format="fps")
            out.extend(
                chemfp.search.threshold_tanimoto_search_fp(
                    converted[1], arena, thr).get_ids_and_scores())
        except IOError:
            # No such file or directory: skip this chunk
            pass
    return out
Code example #15
def smidf2arena(smidf, reorder=True):

    # Write df of smiles, id
    smidf.to_csv('smidf.smi', header=False, sep=' ', index=False)

    # Generate fps file
    sp.call(['rdkit2fps', './smidf.smi', '-o', 'smidf.fps'])

    ## Load the FPs into an arena
    try:
        arena = chemfp.load_fingerprints('./smidf.fps', reorder=reorder)
    except IOError as err:
        sys.stderr.write("Cannot open fingerprint file: %s\n" % (err,))
        raise SystemExit(2)

    # Remove files
    sp.call(['rm', './smidf.smi', './smidf.fps'])

    # Return arena
    return arena
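A hypothetical call, assuming a pandas DataFrame whose first column holds the SMILES and the second the identifier (the column order is what to_csv writes), and that the rdkit2fps command is on the PATH:

import pandas as pd

smidf = pd.DataFrame({"smiles": ["c1ccccc1", "CCO"],
                      "id": ["benzene", "ethanol"]})
arena = smidf2arena(smidf)
print(arena.ids)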
Code example #16
File: test_methods.py Project: flc/chem-fingerprints
from __future__ import absolute_import, with_statement
import unittest2

from support import fullpath

import chemfp
import chemfp.bitops
import _chemfp

set_alignment_method = chemfp.bitops.set_alignment_method
get_alignment_method = chemfp.bitops.get_alignment_method

CHEBI_TARGETS = fullpath("chebi_rdmaccs.fps")
CHEBI_QUERIES = fullpath("chebi_queries.fps.gz")

targets = chemfp.load_fingerprints(CHEBI_TARGETS, alignment=8)
targets_64 = chemfp.load_fingerprints(CHEBI_TARGETS, alignment=64)

available_methods = chemfp.bitops.get_methods()
alignment_methods = chemfp.bitops.get_alignment_methods()


all_methods = dict.fromkeys("LUT8-1 LUT8-4 LUT16-4 Lauradoux POPCNT Gillies ssse3".split())

class TestMethods(unittest2.TestCase):
    def test_no_duplicates(self):
        methods = chemfp.bitops.get_methods()
        self.assertEquals(len(methods), len(set(methods)))

    def test_for_unknown_methods(self):
        for method in chemfp.bitops.get_methods():
Code example #17
def read_chemfp(input_file):
    reader = chemfp.read_molecule_fingerprints("RDKit-Morgan fpSize=1024", input_file)
    arena_all = chemfp.load_fingerprints(reader, reorder=False)
    return arena_all
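A usage sketch: "RDKit-Morgan fpSize=1024" is a chemfp fingerprint type string, and reorder=False keeps the arena in input-file order:

arena = read_chemfp("compounds.sdf")
print(len(arena), "fingerprints,", arena.metadata.num_bits, "bits each")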
Code example #18
    parser.add_argument("-c", "--cluster", dest="cluster_image",
                        help="Path to the output cluster image.")

    parser.add_argument("-s", "--smatrix", dest="similarity_matrix",
                        help="Path to the similarity matrix output file.")

    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold",
                        type=float, default=0.0,
                        help="Tanimoto threshold [0.0]")

    parser.add_argument("--oformat", default='png', help="Output format (png, svg)")

    parser.add_argument('-p', '--processors', type=int, default=4)

    args = parser.parse_args()

    targets = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(targets)
    distances = distance_matrix(arena, args.tanimoto_threshold)

    if args.similarity_matrix:
        numpy.savetxt(args.similarity_matrix, distances)

    if args.cluster_image:
        linkage = hcluster.linkage(distances, method="single", metric="euclidean")
        hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.)
        pylab.savefig(args.cluster_image, format=args.oformat)
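The distance_matrix helper called above is not included in the snippet; a minimal version following the same pattern as example #4 could be:

import numpy

def distance_matrix(arena, threshold=0.0):
    # All pairs start at distance 1.0; hits above the threshold are
    # overwritten with 1 - Tanimoto score.
    n = len(arena)
    distances = numpy.ones((n, n), numpy.float64)
    row = 0
    for query_arena in arena.iter_arenas():
        results = arena.threshold_tanimoto_search_arena(query_arena,
                                                        threshold=threshold)
        for q_i, hits in enumerate(results.iter_indices_and_scores()):
            for target_idx, score in hits:
                distances[row + q_i, target_idx] = 1 - score
        row += len(query_arena)
    return distances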

Code example #19
def butina(args):
    """
        Taylor-Butina clustering from the chemfp help.
    """
    out = args.output_path
    targets = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(targets)

    chemfp.set_num_threads(args.processors)
    results = search.threshold_tanimoto_search_symmetric(
        arena, threshold=args.tanimoto_threshold)
    results.reorder_all("move-closest-first")

    sorted_ids = unix_sort(results)

    # Determine the true/false singletons and the clusters
    true_singletons = []
    false_singletons = []
    clusters = []

    seen = set()
    #for (size, fp_idx, members) in results:
    for (size, fp_idx) in sorted_ids:
        members = results[fp_idx].get_indices()
        #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members]
        if fp_idx in seen:
            # Can't use a centroid which is already assigned
            continue
        seen.add(fp_idx)

        if size == 0:
            # The only fingerprint in the exclusion sphere is itself
            true_singletons.append(fp_idx)
            continue

        # Figure out which ones haven't yet been assigned
        unassigned = set(members) - seen

        if not unassigned:
            false_singletons.append(fp_idx)
            continue

        # this is a new cluster
        clusters.append((fp_idx, unassigned))
        seen.update(unassigned)

    len_cluster = len(clusters)
    #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) )
    #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) )

    out.write("#%s true singletons\n" % len(true_singletons))
    out.write("#%s false singletons\n" % len(false_singletons))
    out.write("#clusters: %s\n" % len_cluster)

    # Sort so the cluster with the most compounds comes first,
    # then by alphabetically smallest id
    def cluster_sort_key(cluster):
        centroid_idx, members = cluster
        return -len(members), arena.ids[centroid_idx]

    clusters.sort(key=cluster_sort_key)

    for centroid_idx, members in clusters:
        centroid_name = arena.ids[centroid_idx]
        out.write("%s\t%s\t%s\n" %
                  (centroid_name, len(members), " ".join(arena.ids[idx]
                                                         for idx in members)))
        #ToDo: len(members) need to be some biggest top 90% or something ...

    for idx in true_singletons:
        out.write("%s\t%s\n" % (arena.ids[idx], 0))

    out.close()
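unix_sort is a project helper not shown here (the name suggests it shells out to sort(1) to keep memory use down). A pure-Python stand-in with the same contract, hit counts paired with fingerprint indices, largest first:

def unix_sort(results):
    counts = [(len(hits), fp_idx) for fp_idx, hits in enumerate(results)]
    counts.sort(reverse=True)
    return counts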
Code example #20
File: test_api.py Project: flc/chem-fingerprints
def _open(self, name):
    return chemfp.load_fingerprints(name, reorder=True)
Code example #21
# Test the symmetric code

import unittest2
from cStringIO import StringIO
import array

import chemfp
from chemfp import search, bitops

from support import fullpath, PUBCHEM_SDF, PUBCHEM_SDF_GZ

fps = chemfp.load_fingerprints(fullpath("queries.fps"))

zeros = chemfp.load_fingerprints(StringIO("""\
0000\tA
0000\tB
0001\tC
0002\tD
FFFE\tE
FFFF\tF
"""))

def slow_counts(counts, fps, threshold,
                query_start, query_end,
                target_start, target_end):
    N = len(fps)
    query_end = min(N, query_end)
    target_end = min(N, target_end)

    for row in range(query_start, query_end):
        row_fp = fps[row][1]
Code example #22
def test_4_alignment(self):
    a = chemfp.load_fingerprints(StringIO(ordered_zeros), reorder=True, alignment=4)
    self.assertEquals(a.start_padding, 0)
    self.assertEquals(a.end_padding, 0)
    self.assertEquals(a.arena, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00")
Code example #23
# An implementation of Taylor-Butina clustering

# See http://www.chemomine.co.uk/dbclus-paper.pdf
# and http://www.redbrick.dcu.ie/~noel/R_clustering.html

import chemfp

THRESHOLD = 0.80

dataset = chemfp.load_fingerprints("docs/pubchem_targets.fps")
print "Clustering", len(dataset), "fingerprints"

# I'll make a list with tuples containing:
#   - the number of hits
#   - an arbitrary and not very good tie-breaker value (larger values go first)
#   - the fingerprint index
#   - the list of fingerprint indices within THRESHOLD of that fingerprint

def tie_breaker_value(hits):
    # This is pretty arbitrary; it's the largest non-1.0 score; or 1.0
    # Noel references better work on tie breaking by John MacCuish at Mesa Analytics
    try:
        tie_breaker = max(score for (idx, score) in hits if score != 1.0)
    except ValueError:
        tie_breaker = 1.0
    return tie_breaker

def hit_members(hits):
    return [idx for (idx, score) in hits]

# Assign the compound index to its hits
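The snippet ends just as the real work begins. Based on the comment above, the candidate list is plausibly built like this (a sketch, not the original code):

results = dataset.threshold_tanimoto_search_arena(dataset, threshold=THRESHOLD)

candidates = sorted(
    ((len(hits), tie_breaker_value(hits), idx, hit_members(hits))
     for idx, hits in enumerate(results.iter_indices_and_scores())),
    reverse=True)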
Code example #24
            raise SystemExit(1)

    batch_size = args.batch_size
    query_arena_iter = queries.iter_arenas(batch_size)
    
    t1 = time.time()

    first_query_arena = None
    for first_query_arena in query_arena_iter:
        break

    if args.scan:
        # Leave the targets as-is
        pass
    elif args.memory:
        targets = chemfp.load_fingerprints(targets)
    if not first_query_arena:
        # No input. Leave as-is
        pass
    elif len(first_query_arena) < min(10, batch_size):
        # Figure out the optimal search. If there is a
        # small number of inputs (< ~10) then a scan
        # of the FPS file is faster than an arena search.
        pass
    else:
        targets = chemfp.load_fingerprints(targets)

    problems = chemfp.check_metadata_problems(queries.metadata, targets.metadata)
    for (severity, error, msg_template) in problems:
        msg = msg_template % dict(metadata1="queries", metadata2="targets")
        if severity == "error":
Code example #25
temp_link = "%s.%s" % (temp_file.name, 'fps')
temp_file.close()
os.system('ln -s %s %s' % (os.path.realpath(sys.argv[1]), temp_link))

chemfp_fingerprint_file = temp_link
tanimoto_threshold = float(sys.argv[2])
outfile = sys.argv[3]
processors = int(sys.argv[4])


def get_hit_indicies(hits):
    return [id for (id, score) in hits]


out = open(outfile, 'w')
dataset = chemfp.load_fingerprints(chemfp_fingerprint_file)

chemfp.set_num_threads(processors)
search = dataset.threshold_tanimoto_search_arena(dataset,
                                                 threshold=tanimoto_threshold)
#search = chemfp.search.threshold_tanimoto_search_symmetric (dataset, threshold = tanimoto_threshold)

# Reorder so the centroid with the most hits comes first.
# (That's why I do a reverse search.)
# Ignore the arbitrariness of breaking ties by fingerprint index
results = sorted(
    ((len(hits), i, hits)
     for (i, hits) in enumerate(search.iter_indices_and_scores())),
    reverse=True)


# Determine the true/false singletons and the clusters
true_singletons = []
false_singletons = []
clusters = []
Code example #26
def do_NxN_searches(args, k, threshold, target_filename):
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount
    # I want the output to be in the same order as the input.
    # This means I need to do some reordering. Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # I use this to generate:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}
    #   current_index_to_original_index = {0:2, 1:0, 2:1}
    
    original_ids = []
    fps = chemfp.open(target_filename)
    def get_index_to_id(fps):
        for i, (id, fp) in enumerate(fps):
            original_ids.append(id)
            yield i, fp
    
    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    original_index_to_current_index = dict(zip(targets.ids, xrange(len(targets))))
    current_index_to_original_id = dict((i, original_ids[original_index])
                                            for i, original_index in enumerate(targets.ids))
    
    t2 = time.time()
    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold, max_score=1.0)

        if args.count:
            type = "Count threshold=%(threshold)s NxN=full" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": type,
            "targets": target_filename,
            "target_sources": targets.metadata.sources})

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(targets, threshold,
                                                          batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                count = counts[current_index]
                outfile.write("%d\t%s\n" % (count, original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(targets, threshold,
                                                                     batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(targets, k, threshold,
                                                                    batch_size=args.batch_size)

            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                new_indices_and_scores = results[current_index].get_ids_and_scores()
                outfile.write("%d\t%s" % (len(new_indices_and_scores), original_id))
                for (new_index, score) in new_indices_and_scores:
                    original_id = original_ids[new_index]
                    outfile.write(hit_formatter % (original_id, score))
                outfile.write("\n") # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" % (t2-t1, t3-t2, t3-t1))