Example #1
 def get_decomp(self, method='MDS', **kwargs):
     optioncheck(method, ['MDS', 'spectral'])
     cl = Clustering(self.dm)
     if method == 'MDS':
         return cl.MDS_decomp()
     if method == 'spectral':
         return cl.spectral_decomp(**kwargs)
Example #2
    def __init__(self):
        self.nwalkers = 32
        self.ndim = 7
        filename = "tutorial.h5"
        backend = emcee.backends.HDFBackend(filename)
        backend.reset(self.nwalkers, self.ndim)

        hod_params = {"M_min": 0, "galaxy_density": 0.00057, "boxsize": 1000, "log_halo_mass_bins": np.arange(10,15,0.1), \
          "halo_histo": np.loadtxt("../data/halo_central_histo.dat")}
        halofile = "../../ELG_HOD_optimization/data/halo_M200b_0.54980_for_mock.dat"
        self.mockfactory = MockFactory(halofile,
                                       boxsize=1000,
                                       cvir_fac=1,
                                       hod_parameters=hod_params)

        # clustering calculator
        rbins = np.logspace(np.log10(0.1), np.log10(70), 21)
        self.cluster = Clustering(rbins)

        # read xi and wp from data, read cov matrix
        self.clustering_data = np.loadtxt("../data/clustering_data.dat")
        self.scaled_cov = np.loadtxt("../data/scaled_cov.dat")

        #with Pool(10) as pool:
        #	self.sampler = emcee.EnsembleSampler(self.nwalkers, self.ndim, self.log_prob, backend=backend, pool = pool)
        #	self.run()

        self.sampler = emcee.EnsembleSampler(self.nwalkers,
                                             self.ndim,
                                             self.log_prob,
                                             backend=backend)
        self.run()
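Note: the class above wires an HDF5 backend into emcee but does not show log_prob or run. A minimal, hedged sketch of how those pieces usually fit together with the emcee 3.x API (the Gaussian log-posterior below is a placeholder, not the xi/wp clustering likelihood used here):

import numpy as np
import emcee

def log_prob(theta):
    # placeholder log-posterior; the class above would compare model xi/wp
    # against self.clustering_data using self.scaled_cov
    return -0.5 * np.sum(theta ** 2)

nwalkers, ndim = 32, 7
backend = emcee.backends.HDFBackend("tutorial.h5")
backend.reset(nwalkers, ndim)
sampler = emcee.EnsembleSampler(nwalkers, ndim, log_prob, backend=backend)
p0 = np.random.randn(nwalkers, ndim)
sampler.run_mcmc(p0, 1000, progress=True)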
Example #3
 def get_decomp(self, method='MDS', **kwargs):
     optioncheck(method, ['MDS', 'spectral'])
     cl = Clustering(self.dm)
     if method == 'MDS':
         return cl.MDS_decomp()
     if method == 'spectral':
         return cl.spectral_decomp(**kwargs)
Example #4
def cluster_data(data, ov_min=10, ov_min1=0.2):
    cl = Clustering(ov_min, ov_min1)
    cl.fill_clusters(data['x1'], data['x2'])

    c_labels = cl.clabels
    nhits = len(c_labels)
    n_clust = cl.ncl
    x1cl = cl.cluster_x1
    x2cl = cl.cluster_x2
    ycl = []
    pcentcl = []
    namecl = []
    detailcl = []
    x1tcl = []
    x2tcl = []
    dxtcl = []
    for i in range(0,n_clust):
        ycl.append(float(i+1)/2.)
        for j in range(0,nhits):
            if c_labels[j]==i:
                pcentcl.append(data['pcent'][j])
                namecl.append(data['name'][j])
                detailcl.append(data['detail'][j])
                x1tcl.append(data['x1t'][j])
                x2tcl.append(data['x2t'][j])
                dxtcl.append(data['dxt'][j])
                break

    new_data = dict(x1=x1cl, x2=x2cl, dx=[end-beg for beg,end in zip(x1cl,x2cl)],
                    xm=[(beg+end)/2 for beg,end in zip(x1cl,x2cl)],
                    x1t=x1tcl, x2t=x2tcl, dxt=dxtcl,
                    y=ycl, nhits=cl.nhits, name=namecl, pcent=pcentcl, detail=detailcl)

    return new_data, n_clust
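Note: the inner loop above breaks after the first matching hit, so each cluster's pcent/name/detail fields come from the first hit carrying that label. An equivalent hedged sketch of that selection:

first_hit = {}
for j, lab in enumerate(c_labels):
    first_hit.setdefault(lab, j)   # index of the first hit seen for each cluster label
# first_hit[i] is the row used to fill pcentcl, namecl, detailcl, ... for cluster i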
Example #5
def main(args):
    #-----------------------------------------------------#
    #             2D/3D Convolutional Autoencoder         #
    #-----------------------------------------------------#
    if args.program == 'CAE':
        cae = CAE(input_dir=args.data_dir,
                  patch_size=ast.literal_eval(args.patch_size),
                  batch_size=args.batch_size,
                  test_size=args.test_size,
                  prepare_batches=args.prepare_batches)

        cae.prepare_data(args.sampler_type, args.max_patches, args.resample,
                         ast.literal_eval(args.patch_overlap),
                         args.min_lab_vox, args.label_prob, args.load_data)
        if args.model_dir is None:
            cae.train(args.epochs)
        cae.predict(args.model_dir)

    #-----------------------------------------------------#
    #               Patient classification                #
    #-----------------------------------------------------#
    """
    if args.program=='AutSeg':
        asg = AutomaticSegmentation(    model_name=args.model_name,
                                        patch_size=args.patch_size,
                                        patch_overlap=args.patch_overlap,
                                        input_dir=args.data_dir, 
                                        model_dir=args.model_dir   )
        asg.run()
        asg.run_postprocessing()

"""
    if args.program == 'CLUS':
        clustering = Clustering(num_iters=args.iterations,
                                num_clusters=args.num_clusters,
                                input_dir=args.data_dir)
        clustering.run()

    if args.program == 'FeEx':
        fe = FeatureExtraction(model_name=args.model_name,
                               patch_size=ast.literal_eval(args.patch_size),
                               patch_overlap=ast.literal_eval(
                                   args.patch_overlap),
                               num_clusters=args.num_clusters,
                               cluster_selection=args.cluster_selection,
                               resample=args.resample,
                               encoded_layer_num=args.encoded_layer_num,
                               model_dir=args.model_dir,
                               input_dir=args.data_dir)
        fe.run(batch_size=20)

    if args.program == 'SVM':
        svm = SvmClassifier(feature_dir=args.feature_dir,
                            ffr_dir=args.ffr_dir,
                            ffr_filename=args.ffr_filename,
                            input_dir=args.data_dir,
                            ffr_cut_off=args.ffr_cut_off,
                            test_size=args.test_size)
        svm.train()
        svm.predict()
Example #6
    def generate(self, keys, url):
        json_work("other_files/work_file.json", "w", [])  # обнуляем work

        print(f'Keys received: {len(keys)}')

        if len(keys) > 0:
            self.generate_pretmp(
                keys
            )  # generate pre-templates from the keys with unique stemming
            print(f'Keys after removing duplicates: {len(self.work_file)}')
            time.sleep(2)
            if len(self.work_file) > 0:
                with ThreadPoolExecutor(5) as executor:
                    for _ in executor.map(self.template_generated,
                                          self.work_file):
                        pass
                work = json_work("other_files/work_file.json", "r")
                if len(work) > 0:
                    gen_data = sorted(work,
                                      key=lambda x: x["frequency"]["basic"],
                                      reverse=True)
                    json_work("other_files/work_file.json", "w", gen_data)
                    gen_data += json_work("other_files/main.json", "r")
                    gen_data = sorted(gen_data,
                                      key=lambda x: x["frequency"]["basic"],
                                      reverse=True)
                    json_work("other_files/main.json", "w", gen_data)
                    print(f"url {url} обработан")
                    clustering = Clustering(
                        json_work("other_files/work_file.json", "r"), url)
                    clustering.run()
            else:
                print("Перехожу к следующему url")
        return
Example #7
def perform_clustering(
        term_ids_to_embs: Dict[int, List[float]]) -> Dict[int, Set[int]]:
    """Cluster the given terms into 5 clusters.

    Args:
        term_ids_to_embs: A dictionary mapping term-ids to their
            embeddings.
    Return:
        A dictionary mapping each cluster label to its cluster.
        Each cluster is a set of term-ids.
    """
    # Case less than 5 terms to cluster.
    num_terms = len(term_ids_to_embs)
    if num_terms < 5:
        clusters = {}
        for i, tid in enumerate(term_ids_to_embs):
            clusters[i] = {tid}
        return clusters

    # Case 5 or more terms to cluster.
    c = Clustering()
    term_ids_embs_items = [(k, v) for k, v in term_ids_to_embs.items()]
    results = c.fit([it[1] for it in term_ids_embs_items])
    labels = results['labels']
    print('  Density:', results['density'])
    clusters = defaultdict(set)
    for i in range(len(term_ids_embs_items)):
        term_id = term_ids_embs_items[i][0]
        label = labels[i]
        clusters[label].add(term_id)
    return clusters
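Hedged usage sketch of the fallback path above: with fewer than five terms the function never instantiates Clustering and simply returns one singleton cluster per term-id.

term_ids_to_embs = {10: [0.1, 0.2], 11: [0.3, 0.4], 12: [0.5, 0.6]}
perform_clustering(term_ids_to_embs)
# -> {0: {10}, 1: {11}, 2: {12}}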
Example #8
 def cluster(self, shapelets):
     """
     Uses a clustering algorithm to reduce the number of shapelets.
     :param shapelets: list of shapelet candidates
     :type shapelets: np.array, shape = (len(shapelets), len(s), len(dim(s)))
     :return: list of remaining shapelet candidates
     :rtype: np.array, shape = (|remaining candidates|, len(s), len(dim(s)))
     """
     clustering = Clustering(self.d_max)
     clustering.fit(shapelets)
     return clustering.nn_centers()
Example #9
    def test_pairs(self):
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in test_docs]
        c = Clustering(docs)

        self.assertEqual((1, 0), c.closest_pair([0, 1, 2]))
        self.assertEqual((5, 3), c.closest_pair([3, 4, 5]))
        self.assertEqual((7, 6), c.closest_pair([6, 7]))

        self.assertEqual((2, 0), c.farthest_pair([0, 1, 2]))
        self.assertEqual((5, 4), c.farthest_pair([3, 4, 5]))
        self.assertEqual((7, 6), c.farthest_pair([6, 7]))
Example #10
def clustering(x, df, n_clusters=10, distance='angular', method='K-medians'):
    """
  Do the clustering, based on the 91 features.
  Args:
	  x: array of features
	  df: dataframe of features
	  n_clusters: number of clusters
	  distance: could be 'angular' or 'euclidean';
      method: could be 'K-medians', 'K-means', 'Hierarchical'
  Output:
	  new_df: the labeled dataframe, according to the clustering algorithm
	  relevant_features_cs: a list with the relevant features (angles of the consecutive limbs) of the centroids
	  cs: dictionary with the centroid features 
  """

    relevant_features_id = [
        0, 3, 5, 13, 15, 17, 25, 46, 47, 56, 64, 65, 76, 77, 83, 85, 90
    ]
    keys_dict = [
        '0-1', '0-4', '0-6', '1-2', '1-4', '1-6', '2-3', '4-5', '4-6', '5-7',
        '6-8', '6-9', '8-9', '8-10', '9-12', '10-11', '12-13'
    ]

    clustering_ = Clustering(k=n_clusters, distance=distance, method=method)
    cs, cls = clustering_.fit(x)

    assert len(list(cls.keys())) == n_clusters

    d = pd.DataFrame()
    l = []
    for i in range(n_clusters):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        l += [i] * len(cls[i])

    d.columns = df.columns
    d.insert(91, 'label', l)

    new_df = df.reset_index().merge(d).set_index('index')

    relevant_features_cs = []
    if method == 'Hierarchical':
        pass
    else:
        for i in range(len(cs)):
            d = {}
            cs_rf = cs[i][relevant_features_id]
            for k in range(len(keys_dict)):
                d[keys_dict[k]] = cs_rf[k]
            relevant_features_cs.append(d)

    return new_df, relevant_features_cs, cs
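Note: the reset_index/merge/set_index chain above is what carries the cluster labels back onto the original rows. A minimal sketch of that trick with 2 features instead of 91 (hypothetical columns f0/f1; assumes feature rows are unique):

import pandas as pd

df = pd.DataFrame({'f0': [1.0, 2.0, 3.0], 'f1': [4.0, 5.0, 6.0]})
d = pd.DataFrame({'f0': [3.0, 1.0, 2.0], 'f1': [6.0, 4.0, 5.0]})
d.insert(2, 'label', [0, 0, 1])
new_df = df.reset_index().merge(d).set_index('index')   # labels matched on all shared feature columns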
Example #11
def cluster_data(data):
    etl = Etl()
    df = etl.process_data(data)
    df = etl.generate_rfm(df)
    df = etl.normalize_df(df)
    clustering = Clustering()
    [metrics, clusters] = clustering.generate_cluster(df)
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    try:
        requests.post(server_url + '/metrics', headers=headers, json=metrics)
        requests.post(server_url + '/clusters', headers=headers, json=clusters)
    except Exception as e:
        print('Error', e)
Example #12
def main(args, config):
    wDir = os.getcwd()
    #Instance Preprocessing class
    window = Preprocessing(args.fasta_file, config['win_length'], config['win_step'])
    window.output_window()
    print >> sys.stderr, "Creating windows_sequence.fasta"
    
    #Instance Similarity and Composition class
    sim = Similarity(args.fasta_file, config['score_adj'],wDir)
    sim_matrix = sim.mcl_perform() 
    comp_results = Composition(config['kmer_len'])
    comp_matrix = comp_results.joined()
    #Join similarity and composition matrix for PCA
    join = pd.concat([comp_matrix, sim_matrix], axis= 1, join='inner')
    print >> sys.stderr, "Calculating similarity and composition matrix"
    
    #Instance Reduction class
    pca = Reduction(join, config['pca_comp'])
    pca_data = pca.perform_pca()
    print >> sys.stderr, "Performing PCA"
    
    #Instance Clustering class
    cluster = Clustering(pca_data)
    clust_obj = cluster.plot()
    print >> sys.stderr, "Performing clustering plot"
    
    #Instance ClusterReport class
    report = ClusterReport(clust_obj)
    file_name, querySeq = report.output_queryseq()
    print >> sys.stderr, "Doing report of clusters"

    #Instance Validate class
    valid = Validate(file_name, args.fasta_file,wDir)
    jfileComp, jfileMinus = valid.roundTwo()
    print >> sys.stderr, "Validation of results"
    
    #Instance ParseJplace Class
    parsing = ParseJplace(jfileComp, jfileMinus)
    corrMat = parsing.correlation()
    print >> sys.stderr, "Doing profiles"
    
    #Instance Profile Class
    ttest = Profiles(corrMat, querySeq)
    bestWin = ttest.windowsAssigment()
    print >>sys.stderr, "Doing permutations"
    
    #Instance StatsBinom
    finalResult = StatsBinom(args.fasta_file, config['win_length'],bestWin)
    finalResult.binomial()
    
    cleaning(file_name)
Example #13
    def test_nearest_neighbors(self):
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in test_docs]
        c = Clustering(docs)

        c.pp_distance(range(0, len(test_docs)))

        self.assertEqual([1], c.closest_neighbors([0], 1))
        self.assertEqual([1, 2], c.closest_neighbors([0], 2))
        self.assertEqual([1, 2, 3], c.closest_neighbors([0], 3))
        self.assertEqual([1, 2, 3, 5], c.closest_neighbors([0], 4))

        self.assertEqual([5], c.closest_neighbors([3, 4], 1))
        self.assertEqual([5, 1], c.closest_neighbors([3, 4], 2))
Example #14
def main():
    #X = [[1, 1], [1, 2],[1,3], [4, 4],[4, 5], [5, 4], [5, 5], [10, 9], [10,10], [20,19], [20, 20]]
    X, Y = make_blobs(n_samples=5000,
                      centers=10,
                      cluster_std=0.60,
                      random_state=0)
    cluster = Clustering(X.tolist())
    cluster.buildTree(cluster.root)

    cluster.createLevelMatrix(cluster.root)
    cluster.createDistanceMatrix(numberOfCluster, numberOfLevels)

    query = [0, 0]

    start = timeit.default_timer()
    # Your statements here
    print("aug", aug_mmr(cluster, 0.5, query, X, 15))
    stop = timeit.default_timer()
    print('Time for aug mmr: ', stop - start)

    start = timeit.default_timer()

    # Your statements here
    print("mmr", _mmr(0.5, query, X, 15))

    stop = timeit.default_timer()

    print('Time for mmr: ', stop - start)
Example #15
def main(fn, clusters_no):
    geo_locs = []
    #read location data from csv file and store each location as a Point(latit,longit) object
    df = pd.read_csv(fn)
    for index, row in df.iterrows():
        loc_ = Point(float(row['LAT']), float(row['LON']))  #tuples for location
        geo_locs.append(loc_)
    #run k_means clustering
    cluster = Clustering(geo_locs, clusters_no)
    flag = cluster.k_means(False)
    if flag == -1:
        print("Error in arguments!")
    else:
        #the clustering result is a list of lists where each list represents one cluster
        print("Clustering results:")
        cluster.print_clusters(cluster.clusters)
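Hedged usage note: the CSV passed as fn is expected to contain LAT and LON columns, e.g. (hypothetical file name):

main("locations.csv", 5)   # cluster the points from locations.csv into 5 groups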
Example #16
 def init_run(self, run_params):
     if self.experiment_type == self.CLASSIFICATION:
         return MultiClassClassification(
             method_name=self.method_name,
             dataset_name=self.dataset_name,
             performance_function=self.performance_function,
             embeddings=self.node_embeddings,
             **run_params,
             node_labels=self.node_labels,
             node2id_filepath=self.node2id_filepath)
     elif self.experiment_type == self.CLUSTERING:
         return Clustering(method_name=self.method_name,
                           dataset_name=self.dataset_name,
                           embeddings=self.node_embeddings,
                           **run_params,
                           node_labels=self.node_labels,
                           performance_function=self.performance_function,
                           node2id_filepath=self.node2id_filepath)
     elif self.experiment_type == self.MULTI_LABEL_CLASSIFICATION:
         return MultiLabelClassification(
             method_name=self.method_name,
             dataset_name=self.dataset_name,
             node_labels=self.node_labels,
             **run_params,
             performance_function=self.performance_function,
             embeddings=self.node_embeddings,
             node2id_filepath=self.node2id_filepath)
     elif self.experiment_type == self.LINK_PREDICTION:
         return LinkPrediction(
             method_name=self.method_name,
             dataset_name=self.dataset_name,
             node_embeddings=self.node_embeddings,
             **run_params,
             performance_function=self.performance_function,
             node2id_filepath=self.node2id_filepath)
Example #17
def dump_clusters():

    args = get_args()
    if args['-train'] == '':
        args['-train'] = 'src/resources/output' + args['-k']
    w2vobj = W2V(args['-input'], args['-train'], args['-k'])

    news = News()
    articles = news.get_articles()
    w2vobj.train()
    # Sentence vectorization by averaging
    article_vecs = [w2vobj.get_sentence_vector_avg(article['cleaned_title']) for article in articles]

    # Sentence vectorization by "newtonian" method
    '''article_vecs = []
    for article in articles:
        newtonian_vec = w2vobj.get_sentence_vector_newtonian(article['cleaned_title'])
        if newtonian_vec is not None:
            article_vecs.append(newtonian_vec)'''

    cluster_obj = Clustering(article_vecs, w2vobj)
    r_conn = redis.from_url(os.getenv('REDIS_URL',"redis://localhost:6379/"))

    if args['-cluster'] == 'agg':
        if args['-prune'] == 'true' or args['-prune'] == 'True':
            utilities.redis_kmeans_clusters(cluster_obj, articles, True, int(args['-limit']), r_conn)
            print("redis dump complete")
        else:
            utilities.redis_kmeans_clusters(cluster_obj, articles, False, int(args['-limit']), r_conn)
            print("redis dump complete")
    else:
        #TODO dump to redis
        utilities.print_ann_clusters(cluster_obj, articles)
Example #18
def printClusters(reduced_data,algo="kmean"):
	# Plot the data with matplotlib
	clust = Clustering(reduced_data,5)
	if(algo == "ga"):
		clust.GA(10)
	else:
		clust.kMeans()


	centroids, clusterAssment = clust.centroids, clust.clusterAssment

	cluster1X = []
	cluster1Y = []
	cluster2X = []
	cluster2Y = []
	cluster3X = []
	cluster3Y = []
	cluster4X = []
	cluster4Y = []
	cluster5X = []
	cluster5Y = []

	for i in range(len(reduced_data)):

		if(clusterAssment[i][0,0]==0):
			cluster1X.append(reduced_data[i,0])
			cluster1Y.append(reduced_data[i,1])
		if(clusterAssment[i][0,0]==1):
			cluster2X.append(reduced_data[i,0])
			cluster2Y.append(reduced_data[i,1])
		if(clusterAssment[i][0,0]==2):
			cluster3X.append(reduced_data[i,0])
			cluster3Y.append(reduced_data[i,1])
		if(clusterAssment[i][0,0]==3):
			cluster4X.append(reduced_data[i,0])
			cluster4Y.append(reduced_data[i,1])
		if(clusterAssment[i][0,0]==4):
			cluster5X.append(reduced_data[i,0])
			cluster5Y.append(reduced_data[i,1])

	plot(cluster1X,cluster1Y,'sg')
	plot(cluster2X,cluster2Y,'ob')
	plot(cluster3X,cluster3Y,'or')
	plot(cluster4X,cluster4Y,'mo')
	plot(cluster5X,cluster5Y,'ys')

	show()
Example #19
def filter_repeated_hits(data):
    cl = Clustering(10, 0.1) # cluster with default values
    cl.fill_clusters(data['x1'], data['x2'])

    toDelete = [False]*len(cl.clabels)
    for icl in range(cl.ncl): # loop over clusters
        if cl.nhits[icl]>50:
            counter = 0
            for i in range(len(cl.clabels)): # loop over hits
                if cl.clabels[i]==icl:
                    counter = counter + 1
                    if counter>50:
                        toDelete[i] = True

    for key in data.keys():
        data[key][:] = [value for value,flag in zip(data[key],toDelete) if not flag]
    return data
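An equivalent hedged sketch of the capping logic above, keeping at most 50 hits per cluster by counting labels as they stream by:

from collections import defaultdict

def keep_mask(labels, max_hits=50):
    seen = defaultdict(int)
    keep = []
    for lab in labels:
        seen[lab] += 1
        keep.append(seen[lab] <= max_hits)   # True -> keep, False -> drop
    return keep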
Example #20
def printClusters(reduced_data, algo="kmean"):
    # Plot the data with matplotlib
    clust = Clustering(reduced_data, 5)
    if (algo == "ga"):
        clust.GA(10)
    else:
        clust.kMeans()

    centroids, clusterAssment = clust.centroids, clust.clusterAssment

    cluster1X = []
    cluster1Y = []
    cluster2X = []
    cluster2Y = []
    cluster3X = []
    cluster3Y = []
    cluster4X = []
    cluster4Y = []
    cluster5X = []
    cluster5Y = []

    for i in range(len(reduced_data)):

        if (clusterAssment[i][0, 0] == 0):
            cluster1X.append(reduced_data[i, 0])
            cluster1Y.append(reduced_data[i, 1])
        if (clusterAssment[i][0, 0] == 1):
            cluster2X.append(reduced_data[i, 0])
            cluster2Y.append(reduced_data[i, 1])
        if (clusterAssment[i][0, 0] == 2):
            cluster3X.append(reduced_data[i, 0])
            cluster3Y.append(reduced_data[i, 1])
        if (clusterAssment[i][0, 0] == 3):
            cluster4X.append(reduced_data[i, 0])
            cluster4Y.append(reduced_data[i, 1])
        if (clusterAssment[i][0, 0] == 4):
            cluster5X.append(reduced_data[i, 0])
            cluster5Y.append(reduced_data[i, 1])

    plot(cluster1X, cluster1Y, 'sg')
    plot(cluster2X, cluster2Y, 'ob')
    plot(cluster3X, cluster3Y, 'or')
    plot(cluster4X, cluster4Y, 'mo')
    plot(cluster5X, cluster5Y, 'ys')

    show()
Example #21
def full_realtime(precompute_fraction=.4,
                  nqueries=50000,
                  ndataunits=100000,
                  nmachines=50,
                  r=3,
                  np=.995,
                  min_q_len=6,
                  max_q_len=15,
                  ctype='fast',
                  gcpatype='better'):
    g = Graph.Erdos_Renyi(n=ndataunits, p=np / ndataunits)
    queries = []
    q = 0
    while q < nqueries:
        node = random.randint(0, ndataunits - 1)
        line = iterative_dfs(g, node, path=[])
        if len(line) >= min_q_len:
            queries.append(line)
            q += 1

    graphfile = 'n' + str(
        len(queries) / 1000) + 'np' + str(np) + ctype + gcpatype + 'test'
    with open(graphfile + '.csv', 'wb') as f:
        w = csv.writer(f)
        for line in queries:
            w.writerow(line)

    print 'Queries generated', len(queries)
    infile = graphfile
    max_to_process = min(nqueries, len(queries))
    queries = queries[:max_to_process]

    pre_computed = queries[:int(precompute_fraction * len(queries))]
    machines = generate(range(ndataunits), nmachines)
    dataunit_in_machine = generate_hash(machines, ndataunits)
    clustering = Clustering(pre_computed)

    rt_queries = queries[len(pre_computed):]

    if gcpatype == 'linear':
        gcpa_data = GCPA(clustering, ndataunits)
    elif gcpatype == 'better':
        gcpa_data = GCPA_better(clustering, ndataunits)

    gcpa_data.process(machines, dataunit_in_machine)

    rt_covers = []

    for idx, query in enumerate(rt_queries):
        oldlen = len(query)
        if (idx % 1000) == 0:
            print 'Query: ', idx
        cover, gcpa_dt = rt_query_process(query, clustering, gcpa_data,
                                          machines, dataunit_in_machine, ctype)

        rt_covers.append(cover)

    return gcpa_data.covers, rt_covers
Example #22
    def test_nearest_neighbors(self):
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in test_docs]
        c = Clustering(docs)

        c.pp_distance(range(0, len(test_docs)))

        self.assertEqual([1], c.closest_neighbors([0], 1))
        self.assertEqual([1, 2], c.closest_neighbors([0], 2))
        self.assertEqual([1, 2, 3], c.closest_neighbors([0], 3))
        self.assertEqual([1, 2, 3, 5], c.closest_neighbors([0], 4))

        self.assertEqual([5], c.closest_neighbors([3, 4], 1))
        self.assertEqual([5, 1], c.closest_neighbors([3, 4], 2))
Example #23
def kga(data, k, random_state=None):
    rand.seed(random_state)
    problem = Clustering(data)
    centroids, _, _ = genetic(problem,
                              k,
                              t_pop=10,
                              taxa_cross=0.95,
                              taxa_mutacao=0.2)
    return centroids
Example #24
def setup(source, pdf_path):
    ngrams = NGramSpace(4)
    print "parsing documents at %s..." % source
    docs = [
        extract_row(row, pdf_path, ngrams)
        for row in csv.DictReader(open(source, 'r'))
    ]
    print "clustering %d documents..." % len(docs)
    clustering = Clustering([doc.parsed for doc in docs])
    return (clustering, docs)
Example #25
def start():
    # Set up logger
    logger = logging.getLogger('decoder')
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setLevel(logging.ERROR)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # Set up components
    preprocessor = Preprocessor()
    clustering = Clustering(5)
    decoder = Decoder(logger)

    _input = Input()
    filename = _input.read_file('audio/testfile3.wav')
    # filename = _input.read_file('audio/testfile4.wav')

    preprocessor.read_csv(filename)
    # preprocessor.read_csv('simulation_2018-09-27_17-13-19.csv')
    preprocessor.plot()
    preprocessor.plot(True)

    preprocessor.process_loudness()
    preprocessor.plot()
    preprocessor.plot(True)

    training_batch = preprocessor.get_batch()
    labels = clustering.train(training_batch)
    mapping = clustering.get_label_mapping()
    signals = list()

    for label in labels:
        signals.append(mapping.get(label))

    for signal in signals:
        decoder.decode(signal)

    print(decoder.message)
Example #26
def do_compute(reference_txt, pre_clustering_txt, groundtruth_npy):

    # load reference clusters
    reference = Clustering.load(reference_txt)

    # load hypothesis clusters
    hypothesis = Clustering.load(pre_clustering_txt)

    # number of hypothesis clusters
    nPreClusters = len(hypothesis.clusters)
    preClusters = sorted(hypothesis.clusters)

    # groundtruth[i, j] contains
    # 1 if all elements in clusters i and j are in the same cluster
    # 0 if elements in clusters i and j are not in the same cluster
    # -1 if either cluster i or j is not pure
    groundtruth = np.empty((nPreClusters, nPreClusters), dtype=int)

    # clustersRef[c] contains reference cluster for pure hypothesis cluster c
    # in case c is not pure, clustersRef[c] is None
    clustersRef = {}
    for c in preClusters:
        r = set([reference[i] for i in hypothesis.clusters[c]])
        if len(r) == 1:
            clustersRef[c] = r.pop()
        else:
            clustersRef[c] = None

    for k, ci in enumerate(preClusters):
        if clustersRef[ci] is None:
            groundtruth[ci, :] = -1
            groundtruth[:, ci] = -1
            continue
        for cj in preClusters[k:]:
            if clustersRef[cj] is not None:
                groundtruth[ci, cj] = clustersRef[ci] == clustersRef[cj]
                groundtruth[cj, ci] = groundtruth[ci, cj]

    # save groundtruth matrix
    np.save(groundtruth_npy, groundtruth)
Example #27
def clustering(x, n_clusters):
    """
  Do the clustering, based on the 91 features. 
  We compute the reconstructed poses only with the following default parameters:
    method: 'K-Medians'
    distance: 'angular'
  Args:
    x: array of features
    n_clusters: number of clusters
  Output:
    new_df: the labeled dataframe, according to the clustering algorithm
    relevant_features_cs: a list with the relevant features (angles of the consecutive limbs) of the centroids
    cs: dictionary with the centroid features 
  """

    clustering_ = Clustering(k=n_clusters)
    cs, cls = clustering_.fit(x)
    d = pd.DataFrame()
    l = []
    for i in range(len(cs)):
        df1 = pd.DataFrame(cls[i])
        d = pd.concat([d, df1], sort=False)
        l += [i] * len(cls[i])

    d.columns = df.columns
    d.insert(91, 'label', l)

    new_df = df.reset_index().merge(d).set_index('index')

    assert len(cs) == n_clusters

    relevant_features_cs = []
    for i in range(len(cs)):
        d = {}
        cs_rf = cs[i][relevant_features_id]
        for k in range(len(keys_dict)):
            d[keys_dict[k]] = cs_rf[k]
        relevant_features_cs.append(d)

    return new_df, relevant_features_cs, cs
Example #28
def get_cv_cpv(x: str, percent: float) -> float:
    global model_goal
    # Get dataset number
    dataset_num = get_dataset_num(x)

    # Get number of pcs for CPV > 0.8 and CPV > 0.99
    if percent == 0.99:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.99)"]
    else:
        pcs_cpv = df_selection.loc[dataset_num, "Cum. Perc. Var. (0.8)"]

    # Get df_results
    df = pd.read_csv(x)
    idx = df.features_kept == pcs_cpv
    try:
        return df.loc[idx].cv.values[0]
    except:
        inputs = Inputs(paths)
        inputs.random_seed = 1969
        inputs.get_df_split(dataset_num)

        pca_model = get_pca_model(inputs)

        cluster_model = Clustering(inputs.num_cluster, 100, inputs.random_seed)
        cluster_model.fit(pca_model.pcs_train.loc[:, :pcs_cpv - 1])
        cluster_prediction = cluster_model.predict(
            pca_model.pcs_test.loc[:, :pcs_cpv - 1])
        cluster_performances = cluster_model.get_cluster_performances(
            inputs.df_test.copy(),
            cluster_prediction,
            pcs_cpv,
            inputs.num_cluster,
            model_goal=model_goal)
        return variation(cluster_performances)
Example #29
    def calculate_ref_wk(self, method, k):
        self.wk_refs = []

        for ref in range(self.refs.shape[2]):
            ref_clustering = Clustering(self.refs[:, :, ref], k)
            model, document_topic, word_topic = getattr(
                ref_clustering, method)()
            clusters = ref_clustering.document_topic.argmax(axis=1)
            wk_ref = self.calculate_wk(self.refs[:, :, ref], clusters)
            log_wk_ref = np.log(wk_ref)
            self.wk_refs.append(log_wk_ref)

        return self.wk_refs
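Hedged note: reference dispersions like these log(W_k) values are typically combined with the observed dispersion into the gap statistic (Tibshirani et al.), roughly:

import numpy as np

# wk_refs: log(W_k) of the reference datasets, as returned above
# log_wk_obs: log of the within-cluster dispersion of the real data (assumed available)
gap_k = np.mean(wk_refs) - log_wk_obs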
Example #30
    def test_distance(self):
        raw_docs = ['a b c', 'b c d', 'd e f']
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in raw_docs]

        c = Clustering(docs)

        self.assertEqual(0, c.distance[0, 0])
        self.assertEqual(0.5, c.distance[1, 0])
        self.assertEqual(0, c.distance[1, 1])
        self.assertEqual(1.0, c.distance[2, 0])
        self.assertEqual(0.8, c.distance[2, 1])
        self.assertEqual(0, c.distance[2, 2])
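The expected values follow if Clustering uses Jaccard distance over the unigram sets; a hedged sketch of the arithmetic:

def jaccard_distance(a, b):
    a, b = set(a.split()), set(b.split())
    return 1.0 - len(a & b) / len(a | b)

jaccard_distance('a b c', 'b c d')   # 1 - 2/4 = 0.5
jaccard_distance('a b c', 'd e f')   # 1 - 0/6 = 1.0
jaccard_distance('b c d', 'd e f')   # 1 - 1/5 = 0.8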
Example #31
    def test_clustering(self):
        raw_docs = ['a b c', 'b c d', 'd e f']
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in raw_docs]

        c = Clustering(docs)

        self.assertEqual((1, 0), c.min_link())

        c.merge(1, 0)
        self.assertEqual([1, 1, 2], c.assignments)

        self.assertEqual((2, 1), c.min_link())

        c.merge(2, 0)
        self.assertEqual([2, 2, 2], c.assignments)
Example #32
def precompute_clustering(pre_computed, machines, dataunit_in_machine):

    clustering = Clustering(pre_computed)
    # Indexed with the clusters. This array will store the necessary G-part information for each of the clusters
    parts_data = []
    ctr = 0

    for cluster in clustering.clusters: 
        print '%d out of %d'  % (ctr, len(clustering.clusters))
        ctr += 1
        part_covers, dataunit_in_parts = gcpa_precompute_rt(cluster, machines, dataunit_in_machine)
        parts_data.append((part_covers, dataunit_in_parts))

    return clustering, parts_data
Example #33
    def clusterize(self):
        print("\nclusterize")

        self.process_clustering_data()

        c = Clustering(self.clustering_df)
        c.k_means(2)
        c.k_means(3)
        c.k_means(4)
Example #34
def do_it(image_txt, features_npy, clustering_txt, output_npy):

    # load image list
    with open(image_txt, 'r') as f:
        images = [int(line.strip()) for line in f.readlines()]
        image2index = {image: index for index, image in enumerate(images)}

    # load hypothesis clusters
    clustering = Clustering.load(clustering_txt)
    clusters = sorted(clustering.clusters)

    # load features
    features = np.load(features_npy)

    # L2 normalization (for later dot product)
    features = (features.T / np.sqrt(np.sum((features**2), axis=1))).T

    # find centroid image for every cluster
    centroid = {}
    for c, cluster in enumerate(clusters):

        # list of images in current cluster
        _images = clustering.clusters[cluster]

        # corresponding indices in features matrix
        _indices = [image2index[image] for image in _images]

        # compute distance matrix between
        # all images of current cluster
        _features = features[_indices, :]
        _distance = 1. - np.dot(_features, _features.T)

        # find centroid image
        i = np.argmin(np.sum(_distance, axis=0))
        centroid[cluster] = _images[i]

        print 'image %s is centroid of cluster %s' % (centroid[cluster],
                                                      cluster)

    # centroid indices in features matrix
    _indices = [image2index[centroid[cluster]] for cluster in clusters]

    # compute distance matrix between all centroids
    _features = features[_indices, :]
    _distance = 1. - np.dot(_features, _features.T)

    # save distance matrix
    with open(output_npy, 'wb') as f:
        np.save(f, _distance)
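Hedged sketch of the geometry above: after row-wise L2 normalization, 1 - X.dot(X.T) is a cosine-distance matrix, and the row with the smallest distance sum is the medoid used as the cluster's centroid image.

import numpy as np

X = np.random.rand(4, 8)
X = (X.T / np.sqrt(np.sum(X ** 2, axis=1))).T   # same L2 normalization as above
D = 1.0 - np.dot(X, X.T)                        # cosine distances
medoid = int(np.argmin(D.sum(axis=0)))          # index of the centroid image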
Example #35
def do_it(image_txt, features_npy, clustering_txt, output_npy):

    # load image list
    with open(image_txt, 'r') as f:
        images = [int(line.strip()) for line in f.readlines()]
        image2index = {image: index for index, image in enumerate(images)}

    # load hypothesis clusters
    clustering = Clustering.load(clustering_txt)
    clusters = sorted(clustering.clusters)

    # load features
    features = np.load(features_npy)

    # L2 normalization (for later dot product)
    features = (features.T / np.sqrt(np.sum((features ** 2), axis=1))).T

    # find centroid image for every cluster
    centroid = {}
    for c, cluster in enumerate(clusters):

        # list of images in current cluster
        _images = clustering.clusters[cluster]

        # corresponding indices in features matrix
        _indices = [image2index[image] for image in _images]

        # compute distance matrix between
        # all images of current cluster
        _features = features[_indices, :]
        _distance = 1. - np.dot(_features, _features.T)

        # find centroid image
        i = np.argmin(np.sum(_distance, axis=0))
        centroid[cluster] = _images[i]

        print 'image %s is centroid of cluster %s' % (centroid[cluster], cluster)

    # centroid indices in features matrix
    _indices = [image2index[centroid[cluster]] for cluster in clusters]

    # compute distance matrix between all centroids
    _features = features[_indices, :]
    _distance = 1. - np.dot(_features, _features.T)

    # save distance matrix
    with open(output_npy, 'wb') as f:
        np.save(f, _distance)
Example #36
    def test_pairs(self):
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in test_docs]
        c = Clustering(docs)

        self.assertEqual((1, 0), c.closest_pair([0, 1, 2]))
        self.assertEqual((5, 3), c.closest_pair([3, 4, 5]))
        self.assertEqual((7, 6), c.closest_pair([6, 7]))

        self.assertEqual((2, 0), c.farthest_pair([0, 1, 2]))
        self.assertEqual((5, 4), c.farthest_pair([3, 4, 5]))
        self.assertEqual((7, 6), c.farthest_pair([6, 7]))
Example #37
    def test_clustering(self):
        raw_docs = ["a b c", "b c d", "d e f"]
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in raw_docs]

        c = Clustering(docs)

        self.assertEqual((1, 0), c.min_link())

        c.merge(1, 0)
        self.assertEqual([1, 1, 2], c.assignments)

        self.assertEqual((2, 1), c.min_link())

        c.merge(2, 0)
        self.assertEqual([2, 2, 2], c.assignments)
Example #38
    def test_nonseeded_clustering(self):
        ngrams = NGramSpace(1)
        docs = [ngrams.parse(raw) for raw in test_docs]
        c = Clustering(docs)

        self.assertEqual((1, 0), c.min_link())
        c.merge(1, 0)
        self.assertEqual((2, 1), c.min_link())
        c.merge(2, 1)
        self.assertTrue(c.min_link() in [(4, 3), (5, 3)])
        c.merge(3, 4)
        c.merge(3, 5)
        self.assertEqual((7, 6), c.min_link())
Example #39
def detect_meteors(rf_dir, id_dir, noise_dir, output_dir,
        t0=None, t1=None, rxch='zenith-l', txch='tx-h',
        snr_thresh=1, rmin_km=70, rmax_km=140, vmin_kps=7, vmax_kps=72,
        eps=0.5, min_samples=5, tscale=1, rscale=1, vscale=1,
        debug=False,
    ):
    """Function to detect and summarize meteor head echoes.


    Arguments
    ---------

    rf_dir : string or list
        RF data directory or directories.

    id_dir : string
        ID code metadata directory.

    noise_dir : string
        RX noise metadata directory.

    output_dir : string
        Meteor data output directory.

    t0 : float, optional
        Start time, seconds since epoch. If None, start at beginning of data.

    t1 : float, optional
        End time, seconds since epoch. If None, end at end of data.

    rxch : string, optional
        Receiver channel to process.

    txch : string, optional
        Transmitter channel.

    """
    # set up reader objects for data and metadata
    rfo = drf.DigitalRFReader(rf_dir)
    ido = drf.DigitalMetadataReader(id_dir)
    no = drf.DigitalMetadataReader(noise_dir)

    # infer time window to process based on bounds of data and metadata
    if t0 is None or t1 is None:
        bounds = []
        bounds.append(rfo.get_bounds(rxch))
        bounds.append(rfo.get_bounds(txch))
        bounds.append(ido.get_bounds())
        bounds.append(no.get_bounds())
        bounds = np.asarray(bounds)

        ss = np.max(bounds[:, 0])
        se = np.min(bounds[:, 1])

        fs = rfo.get_digital_rf_metadata(rxch)['samples_per_second']

    if t0 is None:
        s0 = ss
    else:
        s0 = int(np.uint64(t0*fs))

    if t1 is None:
        s1 = se
    else:
        s1 = int(np.uint64(t1*fs))

    # load pulse/coding information
    tmm = TimingModeManager.TimingModeManager()
    if os.path.exists('/tmp/tmm.hdf5'):
        tmm.loadFromHdf5('/tmp/tmm.hdf5', skip_lowlevel=True)
    else:
        tmm.loadFromHdf5(skip_lowlevel=True)

    # initialize generator that steps through data pulse by pulse
    pulse_data = data_generator(rfo, ido, no, tmm, s0, s1, rxch, txch)

    # initialize clustering object for grouping detections
    clustering = Clustering(eps, min_samples, tscale, rscale, vscale)

    # initialize CSV file for saving meteor clusters
    csvpath = os.path.join(output_dir, 'cluster_summaries.txt')
    csvfile = open(csvpath, "wb", 1) # 1 => use line buffering
    cols = mp.summarize_meteor(None)
    csvwriter = csv.DictWriter(csvfile, cols)
    csvwriter.writeheader()

    # loop that steps through data one pulse at a time
    for k, (tx, rx) in enumerate(pulse_data):
        # marching periods as status update
        if (k % 100) == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

        # matched filter
        mf_rx = mp.matched_filter(tx, rx, rmin_km, rmax_km)

        # meteor signal detection
        meteors = mp.detect_meteors(mf_rx, snr_thresh, vmin_kps, vmax_kps)

        # clustering of detections into single meteor head echoes
        for meteor in meteors:
            sys.stdout.write('*')
            sys.stdout.flush()
            new_clusters = clustering.addnext(pulse_num=k, **meteor)
            for c in new_clusters:
                sys.stdout.write('{0}'.format(c.cluster.values[0]))
                # summarize head echo and save to a data file
                cluster_summary = mp.summarize_meteor(c, debug=debug)
                csvwriter.writerow(cluster_summary)

    # tell clustering object that data is exhausted and to return any final clusters
    new_clusters = clustering.finish()
    for c in new_clusters:
        # summarize head echo and save to a data file
        cluster_summary = mp.summarize_meteor(c)
        csvwriter.writerow(cluster_summary)

    csvfile.close()
Example #40
class SequenceCollection(object):

    """
    Orchestrating class that should:
    a) work as a central repository for the information generated by the
       subordinate classes, and
    b) be the only class directly interacted with by the user

    TO DO:
    implement consistent naming of methods (where appropriate)
    Prefixes:
    get_[something]  - returns the object implied by something
    put_[something]  - puts something in the class data structure
    show_[something] - prints something to screen
    plot_[something] - displays a plot of something
    _[something]     - private method
    """

    def __init__(
        self,
        input_dir=None,
        records=None,
        file_format="fasta",
        datatype="protein",
        helper="./class_files/DV_wrapper.drw",
        tmpdir="/tmp",
        get_distances=False,
        parallel_load=False,
        overwrite=True,
    ):

        # Unset Variables

        # Store some mappings for data retrieval

        self.records_to_keys = {}
        self.keys_to_records = {}
        self.clusters_to_partitions = {}
        self.partitions = {}
        self.distance_matrices = {}
        self.concats = {}
        self.inferred_trees = {}
        self.Clustering = Clustering()

        # Store some data

        self.files = None
        self.file_format = file_format
        self.datatype = datatype
        self.records = []
        self.length = 0
        self.helper = helper

        # Set Variables

        self.tmpdir = tmpdir

        # Lambda for sorting by name and number

        sort_key = lambda item: tuple((int(num) if num else alpha) for (num, alpha) in re.findall(r"(\d+)|(\D+)", item))

        # Can give an input directory as optional argument
        # If given:
        #    read the alignment files
        #    optionally calculate pairwise distances
        #    store the sequence data

        if input_dir:

            files = self.get_files(input_dir, file_format)

            # file checks

            if files == 0:
                print "!!!"
                print "There was a problem reading files from {0}".format(input_dir)
                print "!!!"
                sys.exit()

            if get_distances and not os.path.isfile(helper):
                print "!!!"
                print "There was a problem finding the darwin helper at {0}".format(helper)
                print "!!!"
                sys.exit()

            # done

            files.sort(key=sort_key)
            self.put_records(files=files, record_list=None, file_format=file_format, datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()
            if not os.path.isdir(tmpdir):
                os.mkdir(tmpdir)
        elif records:

            # Can optionally give record objects directly if no input dir specified

            self.put_records(files=None, record_list=records, file_format=file_format, datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()

        # Optionally use Darwin to calculate pairwise distances

        if get_distances and self.records:
            if parallel_load:
                self.put_dv_matrices_parallel(helper=helper, tmpdir=tmpdir, overwrite=overwrite)
            else:
                self.put_dv_matrices(helper=helper, tmpdir=tmpdir, overwrite=overwrite)

    def __str__(self):
        s = "SequenceCollection object:\n"
        s += "Contains {0} alignments\n".format(self.length)
        return s

    def __len__(self):
        return self.length

    def get_files(self, input_dir, file_format="fasta"):
        """
        Get list of alignment files from an input directory
        *.fa, *.fas and *.phy files only
        Stores in self.files
        """

        if file_format == "fasta":
            files = glob.glob("{0}/*.fa".format(input_dir))
            if len(files) == 0:
                files = glob.glob("{0}/*.fas".format(input_dir))
        elif file_format == "phylip":
            files = glob.glob("{0}/*.phy".format(input_dir))
        else:
            print "Unrecognised file format %s" % file_format
            files = None
        if not files:
            print "No sequence files found in {0}".format(input_dir)
            return 0
        return sorted(files)

    def dump_records(self, output_dir, records=None, file_format="phylip", use_hashname=True):
        """
        Dumps all sequence alignment records to an output directory
        Files are dumped in sequential phylip format; by default the
        names are hashed
        """

        directorycheck_and_make(output_dir)

        hash_translation = {}

        if not records:
            records = self.get_records()

        for rec in records:
            filename = rec._write_temp_phylip(output_dir, use_hashname=use_hashname)
            try:
                hash_translation[str(rec.name)] = filename
            except TypeError:
                print type(rec.name), rec.name, type(filename), filename
        cPickle.dump(hash_translation, open("{0}/hash_translation.pkl".format(output_dir), "w"))

    def hash(self, string):
        H = hashlib.sha1(string)
        return H.hexdigest()

    def gzip(self, filename):

        if not filename.endswith(".gz"):
            filename += ".gz"

        cPickle.dump(self, file=gz.open(filename, "wb"), protocol=-1)

    @classmethod
    def gunzip(cls, filename):

        return cPickle.load(gz.open(filename, "rb"))

    def put_records(self, files=None, record_list=None, file_format="fasta", datatype="protein"):
        """
        Reads sequence files from the list generated by
        get_files and stores in self.records
        """

        get_name = lambda i: i[i.rindex("/") + 1 : i.rindex(".")]

        if files and not record_list:
            record_list = [TCSeqRec(f, file_format=file_format, name=get_name(f), datatype=datatype) for f in files]
        elif not files and not record_list:

            print "Can't load records - no records or alignment files given"
            return

        records_to_keys = dict([(record.name, number) for (number, record) in enumerate(record_list)])
        keys_to_records = dict(enumerate(record_list))
        self.records = record_list
        self.length = len(record_list)
        self.records_to_keys = records_to_keys
        self.keys_to_records = keys_to_records

    def load_phyml_results(self, input_dir, records=None, use_hashname=False, program="phyml"):

        if not records:
            records = self.get_records()
        failures = []
        for rec in records:
            if use_hashname:
                name = rec.hashname()
            else:
                name = rec.name
            tree_file = "{0}/{1}.phy_phyml_tree.txt".format(input_dir, name)
            stats_file = "{0}/{1}.phy_phyml_stats.txt".format(input_dir, name)

            try:
                rec.tree.load_phyml_results(tree_file, stats_file, name=rec.name, program=program)
            except FileError:
                failures.append(rec.name)

        if failures:
            print "Couldn't load results for the following records:"
            for f in failures:
                print "   ", f

    def sanitise_records(self):
        """
        Sorts records alphabetically, trims whitespace from beginning
        of record headers, removes '/' characters from headers,
        replaces spaces with underscores, puts sequences into upper case
        """

        for rec in self.get_records():
            rec.sanitise()

    def put_dv_matrices(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):

        for rec in self.get_records():
            rec.dv = [rec.get_dv_matrix(tmpdir=tmpdir, helper=helper, overwrite=overwrite)]

    def put_trees(
        self,
        rec_list=None,
        program="treecollection",
        model=None,
        datatype=None,
        ncat=4,
        optimise="n",
        tmpdir=None,
        overwrite=True,
        verbose=False,
    ):

        if tmpdir is None:
            tmpdir = self.tmpdir
        if not program in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if not rec_list:
            rec_list = self.records
        for rec in rec_list:
            if overwrite is False:
                if rec.name in self.inferred_trees:
                    continue
            if program == "treecollection":
                tree = rec.get_TC_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == "raxml":
                tree = rec.get_raxml_tree(tmpdir=tmpdir, overwrite=overwrite)
            elif program == "phyml":
                tree = rec.get_phyml_tree(
                    model=model, datatype=datatype, tmpdir=tmpdir, ncat=ncat, overwrite=overwrite, verbose=verbose
                )
            elif program == "bionj":
                tree = rec.get_bionj_tree(
                    model=model,
                    datatype=datatype,
                    tmpdir=tmpdir,
                    ncat=ncat,
                    optimise=optimise,
                    overwrite=overwrite,
                    verbose=verbose,
                )
            self.inferred_trees[rec.name] = tree

    def put_distance_matrices(self, metrics, tmpdir="/tmp", normalise=False):
        """
        Pass this function a list of metrics
        valid kwargs - invert (bool), normalise (bool)
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        trees = [rec.tree for rec in self.get_records()]
        for metric in metrics:
            dm = DistanceMatrix(trees, tmpdir=tmpdir)
            dm.get_distance_matrix(metric, normalise=normalise)
            self.distance_matrices[metric] = dm

    def put_partition(self, metric, cluster_method, nclusters, prune=True, tmpdir=None, recalculate=False):

        if not tmpdir:
            tmpdir = self.tmpdir
        if not metric in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        partition_vector = self.Clustering.run_clustering(
            self.distance_matrices[metric], cluster_method, nclusters, prune=prune, recalculate=recalculate
        )

        self.clusters_to_partitions[(metric, cluster_method, nclusters)] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return partition_vector

    def put_partition_vector(self, partition_vector, name):
        """
        Given a partition vector (i.e. a tuple containing the class-
        membership for each gene alignment), inserts the relevant data
        structures into the SequenceCollection object.
        NEXT: run concatenate_records(), put_cluster_trees()
        """

        self.clusters_to_partitions[name] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)

    def put_partitions(self, metrics, cluster_methods, nclusters, prune=True, tmpdir=None, recalculate=False):
        """
        metrics, linkages and nclasses are given as lists, or coerced into
        lists
        """

        if not isinstance(metrics, list):
            metrics = [metrics]
        if not isinstance(cluster_methods, list):
            cluster_methods = [cluster_methods]
        if not isinstance(nclusters, list):
            nclusters = [nclusters]
        if tmpdir is None:
            tmpdir = self.tmpdir
        else:
            nclusters = sorted(nclusters, reverse=True)

        # names = [rec.name for rec in self.get_records()]

        for metric in metrics:
            print "Clustering {0} data".format(metric)
            self.Clustering.clear_cache()
            for cluster_method in cluster_methods:
                print " ", cluster_method
                for n in nclusters:
                    key = (metric, cluster_method, n)
                    if key in self.clusters_to_partitions:
                        continue
                    else:
                        self.put_partition(
                            metric, cluster_method, n, prune=prune, tmpdir=tmpdir, recalculate=recalculate
                        )

    def concatenate_records(self):
        for p in self.partitions.values():
            p.concatenate_records(self.keys_to_records)
            for concat in p.concats:
                if not concat[0].name in self.concats:
                    self.concats[concat[0].name] = concat

    def autotune(
        self,
        metric,
        prune=True,
        KMeans=True,
        recalculate=True,
        tmpdir=None,
        max_groups=None,
        min_groups=2,
        check_single=True,
    ):
        """
        Uses Perona and Zelnik-Manor's spectral rotation method to determine
        the number of clusters present in the data
        """

        if not tmpdir:
            tmpdir = self.tmpdir
        if not metric in self.get_distance_matrices():
            self.put_distance_matrices(metric, tmpdir=tmpdir)
        dm = self.get_distance_matrices()[metric]

        if check_single and min_groups > 1:
            print "Checking for single cluster..."
            (partition_vector, nclusters, quality_scores) = self.Clustering.run_spectral_rotate(
                dm, prune=prune, KMeans=KMeans, recalculate=recalculate, max_groups=6, min_groups=1, verbose=False
            )
            if nclusters == 1:
                print "Single cluster found."
                print "Quality Scores: {0}".format(quality_scores)

                self.clusters_to_partitions[(metric, "rotate", nclusters)] = partition_vector
                self.partitions[partition_vector] = Partition(partition_vector)
                return (partition_vector, quality_scores)
            else:
                print ">1 clusters found."
                print "Quality Scores: {0}".format(quality_scores)
                recalculate = False

        (partition_vector, nclusters, quality_scores) = self.Clustering.run_spectral_rotate(
            dm, prune=prune, KMeans=KMeans, recalculate=recalculate, max_groups=max_groups, min_groups=min_groups
        )

        self.clusters_to_partitions[(metric, "rotate", nclusters)] = partition_vector
        self.partitions[partition_vector] = Partition(partition_vector)
        return (partition_vector, quality_scores)

    def put_cluster_trees(
        self,
        program="treecollection",
        model=None,
        datatype=None,
        ncat=4,
        optimise="n",
        tmpdir="/tmp",
        overwrite=True,
        max_guide_trees=True,
    ):

        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if program == "treecollection":
            return self._put_best_TC_trees(tmpdir=tmpdir, overwrite=overwrite, max_guide_trees=max_guide_trees)
        rec_list = self.get_cluster_records()
        print "Inferring {0} cluster trees".format(len(rec_list))
        self.put_trees(
            rec_list=rec_list,
            program=program,
            model=model,
            ncat=ncat,
            optimise=optimise,
            datatype=datatype,
            tmpdir=tmpdir,
            overwrite=overwrite,
        )
        self.update_scores()

    def _put_best_TC_trees(self, tmpdir="/tmp", overwrite=True, max_guide_trees=-1):
        rec_list = self.get_cluster_records_with_memberships()
        for (rec, members) in rec_list:
            print "Calculating treecollection tree for {0}".format(rec.name),
            if rec.name in self.inferred_trees and not overwrite:
                print "Skipping - already calculated (overwrite set to False)"
                continue
            guidetrees = [self.keys_to_records[member].tree for member in members]
            if max_guide_trees > 0:
                guidetrees = guidetrees[:max_guide_trees]
            TCtrees = []
            pref = rec._write_temp_tc(make_guide_tree=False, tmpdir=tmpdir)
            pref = "{0}/{1}".format(tmpdir, pref)
            dv_file = pref + "_dv.txt"
            labels_file = pref + "_labels.txt"
            map_file = pref + "_map.txt"
            if len(guidetrees) > 1:
                print "(using best of {0} guidetrees)".format(len(guidetrees))
            else:
                print "(using single guidetree)"
            for t in guidetrees:
                guidetree_file = "{0}/{1}.nwk".format(tmpdir, t.name)
                n = t.reroot_newick()
                with open(guidetree_file, "w") as writer:
                    writer.write(n)
                TCtrees.append(Tree.new_treecollection_tree(dv_file, map_file, labels_file, guidetree_file, rec.name))
            best = min(TCtrees, key=lambda x: x.score)
            rec.tree = best
            self.inferred_trees[rec.name] = best
        self.update_scores()
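
    # Note on the selection above: each cluster member's previously inferred
    # tree is rerooted, written out as <tmpdir>/<name>.nwk and used as a
    # TreeCollection guide tree; the candidate tree with the lowest score is
    # kept. Capping max_guide_trees trades some thoroughness for speed on
    # large clusters.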

    def update_scores(self):
        for partition in self.partitions.values():
            partition.update_score(self.concats)

    @staticmethod
    def _pivot(lst):
        new_lst = zip(*lst)
        return ["".join(x) for x in new_lst]

    def concatenate_list_of_records(self, records=None):
        if not records:
            records = self.get_records()
        concat = copy.deepcopy(records[0])
        for rec in records[1:]:
            concat += rec
        return concat

    def make_randomised_copy(self, tmpdir=None, get_distances=False, parallel_load=False, overwrite=True):

        shuffled_records = self.get_randomised_alignments()
        if not tmpdir:
            tmpdir = self.tmpdir
        randomised_copy = SequenceCollection(
            input_dir=None,
            records=shuffled_records,
            file_format=self.file_format,
            datatype=self.datatype,
            helper=self.helper,
            tmpdir=tmpdir,
            get_distances=get_distances,
            parallel_load=parallel_load,
            overwrite=overwrite,
        )
        return randomised_copy

    def show_memberships(self):

        for (compound_key, partition) in self.get_partitions():
            print " ".join(str(x) for x in compound_key)
            print partition
            print self.Clustering.get_memberships(partition)

    def simulate_from_record(
        self, record, output_dir, name, tmpdir, datatype=None, allow_nonsense=False, split_lengths=None, gene_names=None
    ):

        if not datatype:
            datatype = self.datatype
        if datatype == "protein":
            SeqSim.simulate_from_record_WAG(record, output_dir, name, tmpdir, allow_nonsense, split_lengths, gene_names)
        elif datatype == "dna":
            SeqSim.simulate_from_record_GTR(record, output_dir, name, tmpdir, allow_nonsense, split_lengths, gene_names)
        else:
            print "datatype {0} is not recognised".format(datatype)

    def simulate_from_result(self, key, output_dir, name, tmpdir, datatype=None, allow_nonsense=False):

        if not datatype:
            datatype = self.datatype
        p = self.get_partition(key)
        for c in p.concats:
            # bug: records in Partition objects aren't linked to trees
            updated_record = self.concats[c.name][0]

            members = c.name.split("-")
            lengths = [self.keys_to_records[int(x)].seqlength for x in members]
            names = ["sim" + self.keys_to_records[int(x)].name for x in members]
            self.simulate_from_record(
                updated_record,
                output_dir,
                name=name,
                tmpdir=tmpdir,
                allow_nonsense=allow_nonsense,
                split_lengths=lengths,
                gene_names=names,
            )
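
    # Hypothetical usage (the key follows the clusters_to_partitions format,
    # i.e. (metric, method, nclusters); all names are illustrative):
    #
    #   collection.simulate_from_result(("euc", "rotate", 3),
    #                                   output_dir="sims", name="sim1",
    #                                   tmpdir="/tmp")
    #
    # Each concatenated cluster record is simulated as one alignment and then
    # split back into gene-sized pieces via split_lengths and gene_names.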

    #######################
    # Getters
    #######################

    def get_trees(self):
        return [rec.tree for rec in self.get_records()]

    def get_cluster_records(self):
        """
        Returns all concatenated records from cluster analysis
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha) for (num, alpha) in re.findall(r"(\d+)|(\D+)", item[0].name)
        )
        return [rec for (rec, _) in sorted(self.concats.values(), key=sort_key)]
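
    # The sort key above performs a natural (alphanumeric) sort, so a record
    # named "cluster10" sorts after "cluster2". Illustration:
    #
    #   re.findall(r"(\d+)|(\D+)", "cluster10")  # -> [('', 'cluster'), ('10', '')]
    #   # which the key turns into the tuple ('cluster', 10)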

    def get_cluster_records_with_memberships(self):
        """
        Returns all concatenated records from cluster analysis
        """

        sort_key = lambda item: tuple(
            (int(num) if num else alpha) for (num, alpha) in re.findall(r"(\d+)|(\D+)", item[0].name)
        )
        return sorted(self.concats.values(), key=sort_key)

    def get_cluster_trees(self):
        records = self.get_cluster_records()
        trees = [rec.tree for rec in records]
        return trees

    def get_score(self, key):
        return self.get_partition(key).score

    def get_partition(self, key):
        partition_vector = self.clusters_to_partitions[key]
        return self.partitions[partition_vector]

    def get_membership(self, key, flatten=False):
        return self.get_partition(key).get_membership(flatten=flatten)

    def get_partitions(self):
        return [(k, self.partitions[v]) for (k, v) in self.clusters_to_partitions.items()]

    def get_memberships(self, flatten=False):
        return [
            (k, self.partitions[v].get_membership(flatten=flatten)) for (k, v) in self.clusters_to_partitions.items()
        ]

    def get_scores(self):
        return [(k, self.partitions[v].score) for (k, v) in self.clusters_to_partitions.items()]

    def get_randomised_alignments(self):
        lengths = [rec.seqlength for rec in self.get_records()]
        names = self.get_names()
        datatype = self.records[0].datatype
        concat = self.concatenate_list_of_records()
        concat.shuffle()
        newrecs = concat.split_by_lengths(lengths, names)
        return newrecs
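
    # The randomisation above concatenates every record, calls shuffle() on the
    # concatenation (presumably permuting alignment columns) and re-splits it
    # into pieces with the original lengths and names, yielding a null data set
    # in which any genuine clustering signal should be destroyed.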

    def get_records(self):
        """
        Returns list of stored sequence records
        """

        return [self.keys_to_records[i] for i in range(self.length)]

    def get_names(self):
        """
        Returns a list of the names of the stored records
        """

        return [rec.name for rec in self.get_records()]

    def get_seqlengths(self):
        """
        Returns a list of the sequence lengths of the stored records
        """

        return [rec.seqlength for rec in self.get_records()]

    def get_distance_matrices(self):
        return self.distance_matrices

    def get_dv_matrices(self):
        dvs = {}
        for rec in self.get_records():
            dvs[rec.name] = rec.dv
        return dvs

    #########################
    # Plotters
    #########################

    def plot_dendrogram(self, metric, link, nclasses, show=True):

        plot_object = self.Clustering.plot_dendrogram((metric, link, nclasses))
        if show:
            plot_object.show()
        return plot_object

    def plot_heatmap(self, distance_matrix, partition, outfile=None):

        sort_partition = partition.get_membership(flatten=True)
        fig = distance_matrix.plot_heatmap(sort_partition=sort_partition)
        if outfile:
            fig.savefig("{0}.pdf".format(outfile))
        return fig

    def plot_embedding(
        self,
        partition_vector,
        distance_matrix,
        embedding="MDS",
        prune=True,
        dimensions=3,
        centre_of_mass=False,
        outfile=None,
        standardize=False,
        normalise=False,
        annotate=False,
    ):
        """
        Plots an embedding of the trees in a Principal Coordinate space,
        and saves as pdf.
        """

        dm = distance_matrix.matrix
        partition_vector = np.array(partition_vector)
        labels = self.get_names()
        if embedding == "MDS":
            dbc = self.Clustering.get_double_centre(dm)
            (vals, vecs, var_exp) = self.Clustering.get_eigen(dbc, standardize=standardize)
            (coords, _) = self.Clustering.get_coords_by_dimension(vals, vecs, var_exp, 3, normalise=normalise)
        elif embedding == "spectral":
            laplacian = self.Clustering.spectral(dm, prune=prune)

            (vals, vecs, var_exp) = self.Clustering.get_eigen(laplacian, standardize=standardize)
            (coords, _) = self.Clustering.get_coords_by_dimension(vals, vecs, var_exp, 3, normalise=normalise)
        else:
            print "embedding should be one of 'MDS' or 'spectral'"
            print "value given was:", embedding
            return
        min_Z = min([z for (x, y, z) in coords])
        # get the indices of the partition vector for each group and store them in P
        P = []

        max_groups = max(partition_vector)
        for i in range(1, max_groups + 1):
            partition = np.where(partition_vector == i)
            P.append(partition)

        colors = "bgrcmyk"
        coldict = {"b": "blue", "g": "green", "r": "red", "c": "cyan", "m": "magenta", "y": "yellow", "k": "black"}
        fig2d = plt.figure()
        fig3d = plt.figure()
        ax2d = fig2d.add_subplot(111)
        ax3d = fig3d.add_subplot(111, projection="3d")

        for (pos, partition) in enumerate(P):
            for i in partition[0]:
                ax2d.scatter(color=colors[pos % len(colors)], *(coords[i])[:2])
                ax3d.scatter(color=colors[pos % len(colors)], *coords[i])
                ax3d.plot(
                    [coords[i][0], coords[i][0]],
                    [coords[i][1], coords[i][1]],
                    [min_Z, coords[i][2]],
                    color="grey",
                    linewidth=0.2,
                )

                if annotate:
                    ax2d.annotate(
                        labels[i],
                        xy=(coords[i][0], coords[i][1]),
                        xytext=(-20, 20),
                        textcoords="offset points",
                        fontsize="x-small",
                        ha="right",
                        va="bottom",
                        bbox=dict(boxstyle="round,pad=0.5", fc="yellow", alpha=0.5),
                        arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0"),
                    )

            if centre_of_mass:
                com = np.mean(coords[partition], axis=0)
                ax2d.scatter(color="k", marker="x", s=2, *com[:2])
                ax3d.scatter(color="k", marker="x", s=2, *com)
        if embedding == "spectral" and normalise:
            (u, v) = np.mgrid[0 : 2 * np.pi : 20j, 0 : np.pi : 10j]
            x = np.cos(u) * np.sin(v)
            y = np.sin(u) * np.sin(v)
            z = np.cos(v)
            ax3d.plot_wireframe(x, y, z, color="grey", linewidth=0.2)

        ax2d.set_xlabel("PCo1")
        ax2d.set_ylabel("PCo2")
        ax2d.set_title("Trees embedded in dimension-reduced space")
        ax3d.set_xlabel("PCo1")
        ax3d.set_ylabel("PCo2")
        ax3d.set_zlabel("PCo3")
        ax3d.set_title("Trees embedded in dimension-reduced space")
        if outfile:
            fig2d.savefig("{0}-2d.pdf".format(outfile))
            fig3d.savefig("{0}-3d.pdf".format(outfile))
        return (fig2d, fig3d)
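
    # Hypothetical usage (assumes a distance matrix and partition already
    # exist; metric and key names are illustrative):
    #
    #   dm = collection.get_distance_matrices()["euc"]
    #   vec = collection.clusters_to_partitions[("euc", "rotate", 3)]
    #   fig2d, fig3d = collection.plot_embedding(vec, dm, embedding="spectral",
    #                                            normalise=True, outfile="embed")
    #
    # With embedding="spectral" and normalise=True the points lie on the unit
    # sphere, which is why the wireframe sphere is drawn for reference.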

    #########################
    # Parallelisers
    #########################

    def _unpack_dv(self, packed_args):
        return packed_args[0].get_dv_matrix(*packed_args[1:])

    def _dv_parallel_call(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):

        nprocesses = min(self.length, multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, self.length)
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in self.get_records():
            new_dir = tmpdir + "/" + rec.name
            if not os.path.isdir(new_dir):
                os.mkdir(new_dir)
            args.append((rec, tmpdir + "/" + rec.name, helper, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_dv, args, callback=results.append)
        r.wait()
        for (w, x, y, z) in args:
            if os.path.isdir(x):
                os.rmdir(x)
        results = results[0]
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results))
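
    # The pattern above (repeated in the *_parallel_call helpers below) is:
    # build one argument tuple per record, fan the tuples out with
    # Pool.map_async through a small unpacking shim (_unpack_dv), wait on the
    # async result, then zip record names to the collected outputs.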

    def put_dv_matrices_parallel(self, tmpdir="/tmp", helper="./class_files/DV_wrapper.drw", overwrite=True):

        dv_matrices_dict = self._dv_parallel_call(tmpdir, helper, overwrite=overwrite)
        for rec in self.get_records():
            rec.dv = [dv_matrices_dict[rec.name]]

    def _unpack_bionj(self, packed_args):
        return packed_args[0].get_bionj_tree(*packed_args[1:])

    def _bionj_parallel_call(self, model=None, datatype=None, rec_list=None, ncat=1, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, model, datatype, ncat, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_bionj, args, callback=results.append)
        r.wait()
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results[0]))

    def _unpack_phyml(self, packed_args):
        return packed_args[0].get_phyml_tree(*packed_args[1:])

    def _phyml_parallel_call(self, model=None, datatype=None, rec_list=None, ncat=4, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = min(len(rec_list), multiprocessing.cpu_count() - 1)
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, model, datatype, ncat, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_phyml, args, callback=results.append)
        r.wait()
        print "Results obtained, closing pool..."
        pool.close()
        pool.join()
        print "Pool closed"
        return dict(zip(names, results[0]))

    def _unpack_raxml(self, packed_args):
        return packed_args[0].get_raxml_tree(*packed_args[1:])

    def _raxml_parallel_call(self, rec_list=None, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = multiprocessing.cpu_count() - 1
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_raxml, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def _unpack_TC(self, packed_args):
        return packed_args[0].get_TC_tree(*packed_args[1:])

    def _TC_parallel_call(self, rec_list=None, tmpdir="/tmp", overwrite=True):

        if not rec_list:
            rec_list = self.records
        nprocesses = multiprocessing.cpu_count() - 1
        print "Initialising a pool of {0} processes running {1} jobs...".format(nprocesses, len(rec_list))
        pool = multiprocessing.Pool(nprocesses)
        results = []
        args = []
        names = []
        for rec in rec_list:
            args.append((rec, tmpdir, overwrite))
            names.append(rec.name)
        r = pool.map_async(self._unpack_TC, args, callback=results.append)
        r.wait()
        pool.close()
        pool.join()
        return dict(zip(names, results[0]))

    def put_trees_parallel(
        self, rec_list=None, program="treecollection", model=None, datatype=None, ncat=4, tmpdir="/tmp", overwrite=True
    ):

        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        if not rec_list:
            rec_list = self.records
        if program == "treecollection":
            trees_dict = self._TC_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "raxml":
            trees_dict = self._raxml_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "phyml":
            trees_dict = self._phyml_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, tmpdir=tmpdir, ncat=ncat, overwrite=overwrite
            )
        elif program == "bionj":
            trees_dict = self._bionj_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, tmpdir=tmpdir, ncat=ncat, overwrite=overwrite
            )
        for rec in rec_list:
            rec.tree = trees_dict[rec.name]
            self.inferred_trees[rec.name] = trees_dict[rec.name]
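
    # Hypothetical usage, mirroring the serial put_cluster_trees call above
    # (the model name is an assumption):
    #
    #   collection.put_trees_parallel(program="phyml", model="WAG",
    #                                 datatype="protein", ncat=4)
    #   trees = collection.get_trees()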

    def put_cluster_trees_parallel(
        self, program="treecollection", model=None, datatype=None, ncat=4, tmpdir="/tmp", overwrite=True
    ):

        if program not in ["treecollection", "raxml", "phyml", "bionj"]:
            print "unrecognised program {0}".format(program)
            return
        rec_list = self.get_cluster_records()
        print "Inferring {0} cluster trees".format(len(rec_list))
        if program == "treecollection":
            cluster_trees_dict = self._TC_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "raxml":
            cluster_trees_dict = self._raxml_parallel_call(rec_list=rec_list, tmpdir=tmpdir, overwrite=overwrite)
        elif program == "phyml":
            cluster_trees_dict = self._phyml_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir, overwrite=overwrite
            )
        elif program == "bionj":
            cluster_trees_dict = self._bionj_parallel_call(
                rec_list=rec_list, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir, overwrite=overwrite
            )
        for rec in rec_list:
            rec.tree = cluster_trees_dict[rec.name]
        self.update_scores()
Example #41
0
def print_result_file(cluster, filename):
	out = open(filename, "w")

	for k, v in cluster.iteritems():
		for photo in v:
			out.write("%d\t%d\n" % (photo, k))

	out.close()

print "Loading json into memory..."
dictionary = readjson("/vol/corpora4/mediaeval/2014/SED_2014_Dev_Metadata.json")
print "...Done !"

clusterU = clusterUser(dictionary, fileID)
clusterD = clusterDate(dictionary, fileID, clusterU)

print_result_file(clusterD, fileOUT)

reference = Clustering.load(fileREF)
hypothesis = Clustering.load(fileOUT)

images = []
for c in clusterD.values():
	images.extend(c)

h = homogeneity(reference, hypothesis, images)
print h
c = completeness(reference, hypothesis, images)
print c
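
# Interpretation sketch: homogeneity approaches 1 when each hypothesis cluster
# contains photos from only one reference event, and completeness approaches 1
# when all photos of a reference event land in the same hypothesis cluster;
# the two are normally reported together (or combined into a V-measure).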

    def __init__(
        self,
        input_dir=None,
        records=None,
        file_format="fasta",
        datatype="protein",
        helper="./class_files/DV_wrapper.drw",
        tmpdir="/tmp",
        get_distances=False,
        parallel_load=False,
        overwrite=True,
    ):

        # Unset Variables

        # Store some mappings for data retrieval

        self.records_to_keys = {}
        self.keys_to_records = {}
        self.clusters_to_partitions = {}
        self.partitions = {}
        self.distance_matrices = {}
        self.concats = {}
        self.inferred_trees = {}
        self.Clustering = Clustering()

        # Store some data

        self.files = None
        self.file_format = file_format
        self.datatype = datatype
        self.records = []
        self.length = 0
        self.helper = helper

        # Set Variables

        self.tmpdir = tmpdir

        # Lambda for sorting by name and number

        sort_key = lambda item: tuple((int(num) if num else alpha) for (num, alpha) in re.findall(r"(\d+)|(\D+)", item))

        # Can give an input directory as optional argument
        # If given:
        #    read the alignment files
        #    optionally calculate pairwise distances
        #    store the sequence data

        if input_dir:

            files = self.get_files(input_dir, file_format)

            # file checks

            if files == 0:
                print "!!!"
                print "There was a problem reading files from {0}".format(input_dir)
                print "!!!"
                sys.exit()

            if get_distances and not os.path.isfile(helper):
                print "!!!"
                print "There was a problem finding the darwin helper at {0}".format(helper)
                print "!!!"
                sys.exit()

            # done

            files.sort(key=sort_key)
            self.put_records(files=files, record_list=None, file_format=file_format, datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()
            if not os.path.isdir(tmpdir):
                os.mkdir(tmpdir)
        elif records:

            # Can optionally give record objects directly if no input dir specified

            self.put_records(files=None, record_list=records, file_format=file_format, datatype=datatype)

            # takes care of self.length for us

            self.sanitise_records()

        # Optionally use Darwin to calculate pairwise distances

        if get_distances and self.records:
            if parallel_load:
                self.put_dv_matrices_parallel(helper=helper, tmpdir=tmpdir, overwrite=overwrite)
            else:
                self.put_dv_matrices(helper=helper, tmpdir=tmpdir, overwrite=overwrite)