Esempio n. 1
0
def cluster_domains(domain_occurrences,node_types,subsmat,args):
    '''
    A class that finds pairwise edit distances between domains and clusters them, does so for each domain length.
    Returns a domain-seed keyed dictionary of domain lists (clusters).
    '''
    # Set the minimum size at which domains should be clustered
    min_domain_size = 7 # maybe should be 8
    print("Clustering domains")
    # Set up pairwise alignment args
    domain_args = {'f':None,'f2':None,'a':None,'subsmat':subsmat,'gap':-1,'gapopen':0,'matrix':None,'custom':None,'o':None,'n':args['n'],'node_types':None}
    input_state = InputWrapperState(domain_args)
    input_state.subsmat = subsmat

    domains = list([domain for domain in domain_occurrences.keys()])

    # Cluster results via average distance criterion
    #linkages = cluster.hierarchy.average(distance_mat)

    ### Split domains up by size first, then cluster those that are large enough
    # Determine max length
    print("Domains: "+str(domains))
    min_length = min(len(domain) for domain in domains)
    max_length = max(len(domain) for domain in domains)

    clusters_by_length = {}

    # Move short domains into their own clusters
    for length in range(min_length,min_domain_size):
        domains_sub = list([domain for domain in domains if len(domain) == length])
        clusters = {}
        for domain in domains_sub:
            clusters[domain] = [domain]
        clusters_by_length[length] = clusters

    # Cluster domains that are long enough for each size
    for length in range(min_domain_size,max_length+1):
        domains_sub = list([domain for domain in domains if len(domain) == length])

        #domains_ns = list([NeuriteSequence("D"+str(i),domains[i]) for i in range(len(domains))])
        domains_ns = list([NeuriteSequence(domains_sub[i],domains_sub[i]) for i in range(len(domains_sub))])
        domain_id_map = {domains_sub[i]:i for i in range(len(domains_sub))}
        driver = PairwiseDriver(domains_ns, domains_ns, input_state, store_pairwise=True, score_type='num_gaps')
        driver.start()
        distance_mat = driver.get_score_matrix()

        '''
        Determine clusters from hierarchy ???
        Use abundance sort and then UCLUST 
        - First and subsequent sequences are seeds unless they are close enough to an existing seed, 
          in which case they are merged with the seed's cluster.
        - Domains shorter than min_domain_size go in their own separate clusters.
        - Domains that are exact sub- or super-sequences of a domain already in a cluster are excluded
        '''
        # First sort domains by occurrence frequency, requires creating tuples from domain and occurrence count
        domain_tuples = []
        for domain in domains_sub:
            domain_tuples.append((domain,domain_occurrences[domain]))
        sorted_tuples = sorted(domain_tuples, key=lambda domain_tup: (domain_tup[1],len(domain_tup[0])), reverse=True)
        sorted_domains = list([tup[0] for tup in sorted_tuples])
    
        if len(sorted_domains) > 0:
            # UCLUST implementation
            separates = {}
            clusters = {sorted_domains[0]:list([sorted_domains[0]])}
            for domain in sorted_domains[1:]:
                domain_pos = domain_id_map[domain]
                closest_cluster = -1,1000
                if len(domain) < min_domain_size:
                    separates[domain] = list([domain])
                else:
                    for seed in clusters:
                        seed_pos = domain_id_map[seed]
                        dist = max(distance_mat[domain_pos][seed_pos],distance_mat[seed_pos][domain_pos])
                        if dist <= max_cluster_dist and dist < closest_cluster[1]:
                            closest_cluster = seed,dist

                    if closest_cluster[0] == -1:
                        # If not close enough to any existing seed, add new cluster
                        clusters[domain] = list([domain])
                    else:
                        # Otherwise add it to the cluster it's closest to
                        clusters[closest_cluster[0]].append(domain)
           
            clusters_by_length[length] = clusters

    return clusters_by_length
Esempio n. 2
0
def run_local(targets, queries, input_state):
    driver = PairwiseDriver(targets, queries, input_state)
    driver.start() # start only pairwise alignment