def addPoint(self, point):
     if len(self.clusters) == 0:
         cl = cluster()
         cl.addPoint(point)
         self.clusters.append(cl)
     else:
         neighbouringClusters = self.numberOfNeighbouingClusters(point)
         if neighbouringClusters == 0:
             cl = cluster()
             cl.addPoint(point)
             self.clusters.append(cl)
             return
         elif neighbouringClusters == 1:
             for cl in self.clusters:
                 if cl.isInside(point):
                     cl.addPoint(point)
                     return
         else:
             newCluster = cluster()
             newCluster.addPoint(point)
             for cl in self.clusters[:]:
                 if cl.isInside(point):
                     for pnt in cl.points:
                         newCluster.addCheckedPoint(pnt)
                     self.clusters.remove(cl)
             self.clusters.append(newCluster)
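
The addPoint method above assumes a cluster helper class with addPoint, addCheckedPoint, isInside, and a points list, none of which are shown. A minimal sketch of what such a helper could look like, assuming 2-D tuple points and a purely illustrative fixed neighbourhood radius:

import math

class cluster(object):
    RADIUS = 1.0  # hypothetical neighbourhood radius

    def __init__(self):
        self.points = []

    def addPoint(self, point):
        # the caller has already decided this point belongs here
        self.points.append(point)

    def addCheckedPoint(self, point):
        # used when merging clusters; skip duplicates
        if point not in self.points:
            self.points.append(point)

    def isInside(self, point):
        # a point is "inside" if it lies within RADIUS of any member
        return any(math.hypot(p[0] - point[0], p[1] - point[1]) <= self.RADIUS
                   for p in self.points)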
Example #2
    def dbscan(self, D, eps, MinPts):
        self.dataSet = D

        title(r'dbscan_demo Algorithm', fontsize=18)
        xlabel(r'Dim 1', fontsize=17)
        ylabel(r'Dim 2', fontsize=17)

        C = -1
        Noise = cluster('Noise')

        for point in D:
            if point not in self.visited:
                self.visited.append(point)
                NeighbourPoints = self.regionQuery(point, eps)

                if len(NeighbourPoints) < MinPts:
                    Noise.addPoint(point)
                else:
                    name = 'Cluster' + str(self.count)
                    C = cluster(name)
                    self.count += 1
                    self.expandCluster(point, NeighbourPoints, C, eps, MinPts)

                    plot(C.getX(), C.getY(), 'o', label=name)
                    hold(True)

        if len(Noise.getPoints()) != 0:
            plot(Noise.getX(), Noise.getY(), 'x', label='Noise')

        hold(False)
        legend(loc='lower left')
        grid(True)
        show()
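
The dbscan method above depends on regionQuery and expandCluster helpers that are not part of the snippet. A rough sketch of the neighbourhood query, written here as a free function over an explicit data set and assuming each point is a 2-D (x, y) sequence; the original class version works on self.dataSet:

import math

def regionQuery(dataSet, point, eps):
    # all points of the data set within eps of the query point (including itself)
    return [p for p in dataSet
            if math.hypot(p[0] - point[0], p[1] - point[1]) <= eps]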
Example #3
    def dbscan(self, MinPts, eps):
        '''

        :param data: The input data points or features
        :param MinPts:
        :param eps:
        :return:
        '''
        self.cluster_count=0
        Noise=cluster.cluster("Noise")
        for point in self.dataset:
            if point not in self.visited_points:
                self.visited_points.append(point)
            else:
                continue
            neighbour_points=self.region_query(point, eps)
            if len(neighbour_points) < MinPts:
                Noise.add(point)
            else:
                cluster_name="cluster "+ str(self.cluster_count)
                C=cluster.cluster(cluster_name)
                self.cluster_count+=1
                self.expand_cluster(point, neighbour_points, C, eps, MinPts)
                plt.plot(C.get_syllable(), C.get_usage(), 'o', label=cluster_name, hold = True)

        if len(Noise.getPoints()) != 0:
            plt.plot(Noise.get_syllable(), Noise.get_usage(), 'x', label='Noise', hold = False)
        plt.show()
Example #4
def ellipseDetect(input):
    img, data = input
    params = cv.SimpleBlobDetector_Params()

    area = img.shape[0] * img.shape[1]

    minArea = area * 0.000125
    maxArea = area * 0.002
    print(area, minArea, maxArea)

    bestcount = 0

    # use varying maximum threshold, keep the largest number
    # of blobs found (if it's less than 8)

    for maxthresh in range(20, 225, 10):
        params.thresholdStep = 10.0
        params.minThreshold = 10
        params.maxThreshold = maxthresh  #220.0

        params.filterByArea = True
        params.minArea = minArea
        params.maxArea = maxArea

        params.filterByColor = False

        params.filterByCircularity = True
        params.minCircularity = 0.7

        params.filterByConvexity = True
        params.minConvexity = 0.8

        params.filterByInertia = True
        params.minInertiaRatio = 0.5

        params.minRepeatability = 2
        params.minDistBetweenBlobs = 10.0

        detector = cv.SimpleBlobDetector_create(params)
        keypoints = detector.detect(img)

        keypoints = cluster.cluster(keypoints, 5)
        if keypoints is not None:
            count = len(keypoints)
            for p in keypoints:
                print(p.pt, p.size)
            if count <= 8 and count > bestcount:
                bestcount = count
                bestpoints = keypoints

    if bestcount > 0:
        keypoints = bestpoints
        print(keypoints)
        img = cv.drawKeypoints(img, keypoints, None, (255, 0, 0),
                               cv.DrawMatchesFlags_DRAW_RICH_KEYPOINTS)
    else:
        keypoints = list()
        print("No ellipses found")
    return (img, keypoints, True, ("blob ellipses",
                                   "%d keypoints" % bestcount))
Example #5
    def operate(self, routes, address_field, city_field, province_field,
                zip_field, start, end):
        done = 0
        err = 0

        for i, route in enumerate(routes):

            columns = self.__read_columns(
                route, [address_field, city_field, province_field, zip_field],
                start, end)

            if columns.shape[0] < 1:
                err += 1
                continue

            addresses = [
                Address(columns.iloc[i, 0], columns.iloc[i, 1],
                        columns.iloc[i, 2], columns.iloc[i, 3]).return_val()
                for i in range(columns.shape[0])
            ]

            labels, self.km[route] = cluster(addresses)

            columns["Label"] = [f"L{label}" for label in labels]

            self.__write_df(columns,
                            f"{route}_sheet",
                            close=i + 1 == len(routes))

            done += 1

        return done, err
Example #6
    def optimal_ponctual_charge(self, atomindex):
        atom, index = atomindex
        nb1 = self.cluster_1.get_global_atom_index(atom, index)
        pop = self.cluster_1.lowdin_charge(nb1) - self.cluster_1[atomindex][0]
        coords = self.cluster_1[atomindex][1]

        project_temp = cluster.cluster(self.cluster_1.wfn)

        project_temp.atoms_coords = self.cluster_1.atoms_coords
        project_temp.atoms_charges = self.cluster_1.atoms_charges
        project_temp.atoms_names = self.cluster_1.atoms_names
        project_temp.atoms_indexes = self.cluster_1.atoms_indexes
        project_temp.atoms_nb = self.cluster_1.atoms_nb

        project_temp['BQ', None] = (-self.cluster_1[atomindex][0], coords)
        del project_temp[atomindex]

        g = [pop * (1 / 3)]
        d = [pop * (2 / 3)]

        project_temp['MP', None] = (g[0], coords)
        nb2 = len(np.where(self.cluster_1.env_names == 'BQ')[0])
        g.append(pop - project_temp.lowdin_charge(nb1 + nb2))
        index_mp = len(np.where(self.cluster_1.env_names == 'MP')[0])
        print(index_mp)
        project_temp['MP', index_mp] = (d[0], coords)
        d.append(pop - project_temp.lowdin_charge(nb2))
        a = (d[1] - g[1]) / (d[0] - g[0])
        b = d[1] - d[0] * a
        print(nb1)
        print(coords)
        print(pop)
        print(nb2)
        print(-b / a)
        return -b / a
Example #7
    def eval_func_confidences(self, feature_weights):
        weights_sum = float(sum(feature_weights))
        # "normalize" (I don't know if that's the right word) the weights, and make sure none are equal to 0
        feature_weights = [max(0.00001, x/weights_sum) for x in feature_weights]
        IU = IntrinsicUtility()
        all_test_files = IU.get_n_training_files(n=self.num_documents, first_doc_num=self.first_doc_num, min_len=35000, pct_plag=1)
        reduced_docs = _get_reduced_docs(self.atom_type, all_test_files, session)

        actuals = []
        confidences = []

        confidence_vectors = []
        for feature, weight in zip(self.features, feature_weights):
            vi = 0
            for doc in reduced_docs:
                feature_vectors = doc.get_feature_vectors([feature], session)
                confs = cluster(self.cluster_type, 2, feature_vectors)
                for i, confidence in enumerate(confs, 0):
                    if len(confidence_vectors) <= vi:
                        confidence_vectors.append([])
                    confidence_vectors[vi].append(confidence * weight)
                    vi += 1
                    
        for doc in reduced_docs:
            for span in doc._spans:
                actual = 1 if doc.span_is_plagiarized(span) else 0
                actuals.append(actual)

        for vec in confidence_vectors:
            confidences.append(min(1, sum(vec)))

        fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
        roc_auc = sklearn.metrics.auc(fpr, tpr)
        print 'evaluated:', roc_auc, [w for w in feature_weights]
        return roc_auc
Example #8
def simulate(N, M, D, S, G, dt):
    """Simulate function from Computation book modified to take in
    user specified variables N, D, S, G, dt and output a user specified
    file name.

    :param N: The number of particles.
    :param M: The number of clusters.
    :param D: The number of dimensions.
    :param S: The number of time steps.
    :param G: Gravitational constant.
    :param dt: The time step.
    :return: Simulation complete message.
    """
    x = cl.cluster(
        N, M, D, 500
    )  # create cluster object. 500 defines the maximum position coordinate of a cluster.
    x0, v0, m = x.cluster(
    )  # x0, v0, m = book.initial_cond(N, D) <-- old way to initialize
    for s in range(S):
        with open("clusterdata" + str(s + 1) + ".dat", "w") as myfile:
            for i in range(M):
                x1, v1 = book.timestep(x0[i], v0[i], G, m[i], dt)
                x0[i], v0[i] = x1, v1
                myfile.write(
                    str(x0[i]).replace('[', '').replace(']', '') + "\n")
            myfile.flush()
    return '\nSimulation complete. Your data has been saved as clusterdata*.dat\n'
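
An illustrative call of simulate with small, arbitrary values; the cl and book modules it uses internally come from the original project and are assumed to be importable:

# arbitrary example values for N, M, D, S, G, dt
message = simulate(N=100, M=4, D=3, S=10, G=6.674e-11, dt=0.01)
print(message)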
Example #9
def ksplit2(data_list,ks_crit):
    
    centroids,labels_init = kmeans2(data_list,2,minit='points')
    p = 0
    l = 2
    
    while p < l+1:
        r = data_list[labels_init == p]        
        ks,centroids,labels = cluster(r)
                
        if ks > ks_crit:
            # This is the condition for a good cluster
            p += 1
            continue
        elif ks == -1:
            break
        else:
            m = where(labels_init == p)
            for i in range(2):
                n = where(labels == i)  
                labels[n[0]] = l
                l += 1
            p += 1
            labels_init[m[0]] = labels    
                
    return centroids,labels_init,ks_crit
Example #10
def run_each_frame(file, topo):
    alld, pbc = sca.get_coord(file, topo)
    res_d = {}  # contains information for each frame

    avg_dict = {}
    count = 0

    for frame in alld:
        a, b, c = pbc[frame]
        d, links_h, links_o, hbonds, obonds = data.data(alld[frame], a, b, c)

        coord = np.array([d[i][1:] for i in d])

        temp_d = cluster.cluster(coord, file)

        for i in temp_d:
            temp_d[i] = temp_d[i]
            if i not in avg_dict:
                avg_dict[i] = temp_d[i]
            else:
                avg_dict[i] += temp_d[i]
        count += 1

        res_d[frame] = temp_d

    print file, 'Number of frames', len(res_d)
    for i in avg_dict:
        avg_dict[i] = avg_dict[i] / count

    np.save(file[:-6] + '_CMS_perFrame.npy', res_d)
    np.save(file[:-6] + '_CMS_avg.npy', avg_dict)
Example #11
    def filterClusters(self):
        if len(self.pairwiseDict) == 0:
            self.pairwise()
        amap = AAmap()

        for i in xrange(0, len(self.atoms)):
            c = cluster(self.pdb, self.top, self.pfam, '', '', self.seqheader,
                        '', '', self.center, self.cutoff, self.scutoff,
                        self.flag, 1.0, self.desc)
            c.addNeighbor(amap, self.atoms[i], i)  # put itself in first
            nbnum = 0
            for j in xrange(0, len(self.atoms)):
                key = "%d-%d" % (i, j)
                if (self.pairwiseDict[key] <= self.cutoff) and (abs(i - j) >=
                                                                self.scutoff):
                    c.addNeighbor(amap, self.atoms[j], j)
                    nbnum = nbnum + 1
                    c.thetaPhi.append(
                        self.calculateThetaPhi(self.atoms[i], self.atoms[j]))
            if nbnum < self.nbcutoff:
                continue

            c.pdbidx = c.pdbidx.lstrip()  # will change meanDist
            c.pdbResSeq = c.pdbResSeq.lstrip()
            meanDist = self.clusterMeanDist(c)
            if meanDist < 5.8:
                print('%s,%0.2f,%s,%s,%s,%s') % (self.pdb, meanDist, ''.join(
                    sorted(c.str)), ''.join(sorted(
                        c.typeStr)), c.pdbResSeq, self.getSphericalStr(c))
                self.clusters.append(c)
Example #12
def kMean(data):
    global globalSse
    global globalCs
    # Step 1 - Pick K random points as cluster centers called centroids.
    cs = []
    idx = np.random.randint(data.shape[0], size=K)
    centers = np.take(X, idx, axis=0)
    count = 0
    lastScore = 0
    for center in centers:
        cs.append(cluster(center))
    while True:
        # Step 2 - Assign each xi to nearest cluster by calculating its distance to each centroid.
        for c in cs:
            c.clearData()

        for idx, x in enumerate(X):
            cIndex = minDistCentroid(cs, x)
            cs[cIndex].data.append(idx)

        # Step 3 - Find new centroid by the new clusters
        for c in cs:
            newCentroid = np.mean(np.take(X, c.data, 0), axis=0)
            c.updateCenter(newCentroid)

        sse = str(wc(cs, data))
        # Step 4 - break if certain iteration met
        count += 1
        if count == 10000 or lastScore == sse:
            printResult(cs, sse)
            globalSse = sse
            globalCs = cs
            break
        lastScore = sse
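
kMean above leans on a minDistCentroid helper for Step 2 that is not shown. A minimal sketch, assuming each cluster object keeps its current centre in a .center attribute (the attribute name in the original cluster class may differ):

import numpy as np

def minDistCentroid(cs, x):
    # index of the cluster whose centre is closest to the sample x
    dists = [np.linalg.norm(np.asarray(x) - np.asarray(c.center)) for c in cs]
    return int(np.argmin(dists))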
Example #13
def getRecmmendListController(uid):
  # read user features from the database
  users = getAllUserInfo()
  # vectorize the features
  userData = userFeatureVectorize(users)
  # cluster the users
  labels, centers = cluster(userData,4)
  #printUserLabel(users,labels)
  #return
  # find the cluster label that uid belongs to
  pos = findUserPos(uid,users)
  if pos == -1:
    return 'uid is not in db'
  userLabel = labels[pos]
  # get the ids of all users in the same group
  ids = getGroup(labels,userLabel,users)
  # get keyword search counts for all users
  usersSearchInfo = getAllUserSearchInfo()
  usersInfo = geneUserModel(users,usersSearchInfo,userData,labels)
  # get keyword search counts for every user in the group
  group = getGroupInfo(ids,usersInfo)
  # collaborative filtering to estimate the user's keyword search counts
  recList = collFilter(uid,group,userData)
  # assemble the result for the user
  allKey = getAllKeyInfo()
  jsonStr = geneResult(recList,allKey)
  return jsonStr
Example #14
def identify_cluster(X_data_df,
                     threshold=0.7,
                     correlation_id_method='pearson'):
    cor = X_data_df.corr(method=correlation_id_method)

    clusters = []
    for j, col in enumerate(cor.columns):
        for i, row in enumerate(cor.columns[0:j]):
            if abs(cor.iloc[i, j]) > threshold:
                current_pair = (col, row, cor.iloc[i, j])
                current_pair_added = False
                for _c in clusters:
                    if _c.can_accept(current_pair):
                        _c.update_with(current_pair)
                        current_pair_added = True
                if current_pair_added == False:
                    clusters.append(cluster(pairs=[current_pair]))
    final_clusters = []

    # It is possible to have clusters with shared nodes, which is not desirable. Here we merge the clusters that share nodes.
    for _cluster in clusters:
        added_to_final = False
        for final_c in final_clusters:
            if _cluster.nodes.intersection(final_c.nodes) != set():
                final_c.merge_with_cluster(_cluster)
                added_to_final = True
        if added_to_final == False:
            final_clusters.append(_cluster)
    for i, _cluster in enumerate(final_clusters):
        _cluster.name = f'cluster_{i}'
    return final_clusters
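
identify_cluster above relies on a cluster object with can_accept, update_with, merge_with_cluster, and a nodes set; those are not shown. A minimal sketch of what such an object could look like, with method names mirroring the calls above and the internals guessed:

class cluster(object):
    def __init__(self, pairs):
        self.name = None
        self.nodes = set()      # column names covered by this cluster
        self.pairs = list(pairs)
        for col, row, _ in pairs:
            self.nodes.update((col, row))

    def can_accept(self, pair):
        # a correlated pair fits if the cluster already contains either column
        col, row, _ = pair
        return col in self.nodes or row in self.nodes

    def update_with(self, pair):
        self.nodes.update(pair[:2])
        self.pairs.append(pair)

    def merge_with_cluster(self, other):
        self.nodes |= other.nodes
        self.pairs.extend(other.pairs)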
Example #15
 def __init__(self, width, height):
     self.width = width
     self.height = height
     self.origin = np.array([self.width / 2, self.height / 2])
     self.root = entity(
         point(self.origin, np.array([0., 0.]), np.array([0., 0.])),
         np.array([0., 0., 0.]))
     self.cluster = cluster(self.root)
Example #16
 def index():
     global clusters
     if request.method == 'POST':
         eps = float(request.form.get('eps', 20))
         min_samples = int(request.form.get('min_samples', 2))
         clusters = cluster(mat, fnames, eps, min_samples)
         print(clusters)
     return render_template('index.html', clusters=clusters)
Example #17
def agglomerative(data):
    # first, every single point is a cluster
    global globalSse
    global globalCs
    global distDictionary
    cs = []
    heap = distHeap()
    for idx, x in enumerate(X):
        c = cluster(x)
        c.data.append(idx)
        cs.append(c)
    for i, x in enumerate(cs):
        for j in range(i + 1, len(cs)):
            dist = cluster_distance(cs[i], cs[j])
            clusters = (i, j)
            distDictionary[clusters] = dist
            # print "dist" + str(dist) + " " + str(i) + " " + str(j)
            heap.add_clusters(cs[i], cs[j], dist)

    while True:
        # print len(cs)
        if len(cs) <= K:
            sse = str(wc(cs, data))
            printResult(cs, sse)
            globalSse = sse
            globalCs = cs
            break
        c1, c2 = heap.min_dist_clusters()

        # merge c1 and c2
        newData = np.concatenate((c1.data, c2.data))
        # Append the new cluster into the new cs
        c = cluster(np.mean(np.take(X, newData, 0), axis=0))
        c.data = newData
        # remove associate c1 and c2
        heap.remove_cluster(c1, c2)
        cs.remove(c1)
        cs.remove(c2)
        # add new
        for c_ind in cs:
            # print 'add new dist'
            dist = cluster_distance_cached(c, c_ind)
            heap.add_clusters(c, c_ind, dist)
        # print c1.data
        # print c2.data
        cs.append(c)
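
A possible cluster_distance for the agglomerative example above: Euclidean distance between cluster centres, assuming the same hypothetical .center attribute as in kMean (the original code may use a different linkage, plus a cached variant cluster_distance_cached):

import numpy as np

def cluster_distance(c1, c2):
    # centroid (mean-link) distance between two clusters
    return float(np.linalg.norm(np.asarray(c1.center) - np.asarray(c2.center)))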
Example #18
def cluster_recs():
    native_lang = request.json['native_lang']
    uid = request.json['uid']
    clust_num = request.json['clust_num']
    percent = request.json['percent']

    pop_clusters = cluster(native_lang, uid, clust_num, percent)

    return json.dumps(pop_clusters)
Example #19
 def run(
     self,
     num_processes,
     sample=None
 ):  #assigns a same cluster # to records that are the same business entity
     return cluster.cluster(self.df,
                            cluster='cluster',
                            to='to',
                            match='match').run(
                                self.get_matches(num_processes, sample))
Example #20
def main():
  hotelStarDict = {
    '五星级':5,
    '五星型':5,
    '四星级':4,
    '四星型':4,
    '三星级':3,
    '三星型':3,
    '二星级':2,
    '二星型':2,
    '经济级':1,
    '经济型':1
  }

  sexDict = {
    '男':0,
    '女':1
  }

  hobbyDict = {}

  ff = open('userInfo.txt')
  lines = ff.readlines()
  count = 0
  for line in lines:
    id, userName, age, sex, hobby, consumLevel, hotelStar = line.strip()[1:-1].split(',')
    if hobby not in hobbyDict:
      hobbyDict[hobby] = count
      count += 1
  hobbyFeat = [0]*len(hobbyDict)
  userData = []
  userIds = []
  for line in lines:
    id, userName, age, sex, hobby, consumLevel, hotelStar = line.strip()[1:-1].split(',')
    age = int(age)
    sex = sexDict[sex]
    hobbyFeature = hobbyFeat[:]
    hobbyFeature[hobbyDict[hobby]] = 1
    hobby = hobbyFeature
    consumLevel = int(consumLevel)
    hotelStar = hotelStarDict[hotelStar]
    feature = [age, sex, consumLevel, hotelStar]
    feature.extend(hobby)
    userData.append(feature)
    userIds.append([id,userName])
  ff.close()
  userData = utils.matNorm(userData)
  '''for data in userData:
    print data
  return'''

  labels, centers = cluster.cluster(userData,4)
  for i,id in enumerate(userIds):
    print id[1],labels[i]
Example #21
def run(filename, min_size, max_size, dist):
    noun_dict = noun_extractor.get_nouns(filename)
    (synset_list, synset_dict) = cluster.get_synset_list(noun_dict)
    matrix = cluster.gen_sim_matrix(synset_list)
    clustering = cluster.format_clustering(cluster.cluster(matrix), synset_list)
    clusters = cluster.get_clusters(clustering, synset_list, min_size, max_size, dist=dist)
    clusters = filter(lambda x: x[0] is not None, clusters)
    cluster_counts = cluster.get_cluster_counts(clusters, synset_dict)
    # sort clusters by noun counts, most frequent first
    sorted_clusters = [x[1] for x in sorted(enumerate(clusters), key=lambda x: cluster_counts[x[0]], reverse=True)]
    hypernyms = filter(lambda x: x is not None, map(lambda x: lca(x), sorted_clusters))
    return Classification(noun_dict, synset_list, synset_dict, matrix, clustering, sorted_clusters, hypernyms)
Example #22
def main():
    
    calib_path = raw_input('Enter folder path for calibration parameters:\n')
    if not os.path.isdir(calib_path):
        print('Invalid path!')
        return
        
    in_path = raw_input('Enter folder path for input images:\n')
    if not os.path.isdir(in_path):
        print('Invalid path!')
        return
            
    out_path = raw_input('Enter folder path for output images:\n')
    if not os.path.isdir(out_path):
        os.mkdir(out_path)
    
    undistort.undistort(calib_path, in_path, out_path)

    cluster.cluster(os.path.join(out_path, undistort), out_path)
    
    find_contour.find_contour(os.path.join(out_path, cluster), out_path)
Example #23
 def categorize(self, topics):
     self.seen.append(' '.join(topics))
     if len(self.seen) > 100:
         self.seen = self.seen[1:]
     cluster.updateLanguage(self.seen[-1].split(' '))
     if len(self.seen) < 10:
         return self.boot.categorize(topics)
     cats = getLabels(cluster.cluster(self.seen))
     for cat in cats:
         if self.seen[-1] in cats[cat]:
             return cat
     return 'I dunno'
Example #24
    def top_local_stop_structure_gt(self, threshold):
        global_threshold = len(self.pages) * threshold
        gt_clusters = []
        for item in set(self.ground_truth):
            gt_clusters.append(cluster())
            for i in range(len(self.ground_truth)):
                if self.ground_truth[i] == item:
                    gt_clusters[item].addPage(self.pages[i])
            print str(item) + "\t" + str(len(gt_clusters[item].pages))

        print "number of gt cluster is " + str(len(gt_clusters))
        print "number of cluster 5 is " + str(len(gt_clusters[4].pages))
        gt_clusters[4].find_local_stop_structure(self.df, global_threshold)
Example #25
    def top_local_stop_structure_gt(self,threshold):
        global_threshold = len(self.pages) * threshold
        gt_clusters = []
        for item in set(self.ground_truth):
            gt_clusters.append(cluster())
            for i in range(len(self.ground_truth)):
                if self.ground_truth[i] == item:
                    gt_clusters[item].addPage(self.pages[i])
            print str(item) + "\t" + str(len(gt_clusters[item].pages))

        print "number of gt cluster is " + str(len(gt_clusters))
        print "number of cluster 5 is " + str(len(gt_clusters[4].pages))
        gt_clusters[4].find_local_stop_structure(self.df,global_threshold)
Example #26
def mainSetK(peaks, expectedK):
    # means = seed.pickInitMeans(peaks,expectedK)
    #
    # numMeans = expectedK
    # numMatrixSeeds = int(0.75 * numMeans) + 1
    # means = matrixSeed.pickInitSeeds(peaks,numMatrixSeeds)
    # means += seed.pickInitMeans(peaks,numMeans-numMatrixSeeds)
    #
    means = matrixSeed.pickInitSeeds(peaks, expectedK)
    print peaks[0]
    # The extra list at the beginning is for outliers, and is initialized with all peaks
    clusters = [peaks] + clustrifyMeans(means)
    alignmentMatrix = align.generate_align_matrix(peaks, means)
    clusterVariances = [
        0
    ] * 5  #just something so that the first Welch's test doesn't cause termination
    print 'first runthrough of clustering'
    (means, clusters) = cluster.cluster(peaks, means, alignmentMatrix)
    varAlignmentMatrix = align.generate_var_align_matrix(clusters)
    print 'starting welch\'s t-test clustering with centroid means'
    (p_val, clusterVariances) = welchTest(clusters, varAlignmentMatrix,
                                          clusterVariances)
    while p_val < probabilityThreshold:
        means = paring.paredMeans(means, varAlignmentMatrix)
        numNewMeans = guessNewMeansSetK(peaks, means, p_val, expectedK)
        #currently, no correlation between how many means duplicated/dropped in paring
        #and how many and from where they are added in mean picking
        # means += seed.pickMeans(peaks, numNewMeans)
        means += seed.pickNewMeansOutliersToRandom(clusters, numNewMeans)
        # means += seed.pickNewMeans(clusters, numNewMeans, clusterVariances)
        alignmentMatrix = align.generate_align_matrix(peaks, means)
        (means, clusters) = cluster.cluster(peaks, means, alignmentMatrix)
        varAlignmentMatrix = align.generate_var_align_matrix(clusters)
        (p_val, clusterVariances) = welchTest(clusters, varAlignmentMatrix,
                                              clusterVariances)
        print 'finished clustering of subsequent k guess'
    return clusters
Example #27
def get_plagiarism(text, atom_type, features, cluster_method, k):
    '''
    Return a list of tuples of the form [((0, 18), .5), ((20, 45),  .91), ...]
    In each tuple there is a span tuple and a confidence. The span tuple
    corresponds to an atom of the document and the confidence value corresponds
    to how confident we are that that span was plagiarized. 
    '''
    # Create a FeatureExtractor
    feature_extractor = FeatureExtractor(text)
    # get feature vectors
    feature_vecs = feature_extractor.get_feature_vectors(features, atom_type)
    # cluster
    confidences = cluster(cluster_method, k, feature_vecs)
    # Return it
    return zip(tokenize(text, atom_type), confidences) # should feature extractor have a method that returns the spans it used instead?
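
An illustrative way to consume the (span, confidence) pairs the docstring describes; the file name, atom type, feature name, clustering method, and threshold below are placeholders, and FeatureExtractor, tokenize, and cluster come from the original project:

# placeholder inputs; only the return format is taken from the docstring above
text = open("suspicious_document.txt").read()
pairs = get_plagiarism(text, "paragraph", ["average_word_length"], "kmeans", 2)
for (start, end), confidence in pairs:
    if confidence > 0.8:
        print("possible plagiarism at characters %d-%d" % (start, end))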
Example #28
def get_plagiarism(text, atom_type, features, cluster_method, k):
    '''
    Return a list of tuples of the form [((0, 18), .5), ((20, 45),  .91), ...]
    In each tuple there is a span tuple and a confidence. The span tuple
    corresponds to an atom of the document and the confidence value corresponds
    to how confident we are that that span was plagiarized. 
    '''
    # Create a FeatureExtractor
    feature_extractor = FeatureExtractor(text)
    # get feature vectors
    feature_vecs = feature_extractor.get_feature_vectors(features, atom_type)
    # cluster
    confidences = cluster(cluster_method, k, feature_vecs)
    # Return it
    return zip(tokenize(text, atom_type), confidences) # should feature extractor have a method that returns the spans it used instead?
Example #29
def p_a_comp_post(PARequest):  # noqa: E501
    """Request the execution of a placement algorithm.

    Request the execution of a placement algorithm. The caller needs to implement a callback function and supply the relevant URI so that the PA can post there the result of its execution.- # noqa: E501

    :param PARequest: Placement algorithm request information.
    :type PARequest: dict | bytes

    :rtype: PAResponse
    """
    if connexion.request.is_json:
        pa_req = connexion.request.get_json()

    pa_req = cluster(pa_req)

    return best_garrote(pa_req)
Example #30
    def _get_clusters(self, nbins=36):
        clusters = cluster.cluster(self.path, key=self.key, nbins=nbins)

        def _name_cluster(idx):
            if idx < 10:
                return str(idx)
            elif idx < nbins:
                return chr(idx - 10 + 65)
            elif idx == nbins:
                return '|'

        out = {}

        for idx,segs in clusters.items():
            out[_name_cluster(int(idx))] = [self._segments[int(aidx)] for st,dur,aidx in segs]

        return out
Example #31
    def construct_confidence_vectors_dataset(self, reduced_docs, features,
                                             session):
        from cluster import cluster
        conf_dataset = SupervisedDataSet(len(features), 1)

        confidence_vectors = []
        num_trues = 0
        for feature in features:
            vi = 0
            for doc in reduced_docs:
                feature_vectors = doc.get_feature_vectors([feature], session)
                confidences = cluster("outlier",
                                      2,
                                      feature_vectors,
                                      center_at_mean=True,
                                      num_to_ignore=1,
                                      impurity=.2)
                for i, confidence in enumerate(confidences, 0):
                    if len(confidence_vectors) <= vi:
                        confidence_vectors.append([[], 0])
                    if doc.span_is_plagiarized(doc._spans[i]):
                        t = 1
                        num_trues += 1
                    else:
                        t = 0
                    confidence_vectors[vi][0].append(confidence)
                    confidence_vectors[vi][1] = t
                    vi += 1

        num_plagiarised = num_trues / len(features)
        print num_plagiarised

        shuffle(confidence_vectors)
        for vec in confidence_vectors:
            if vec[1] == 0:
                num_plagiarised -= 1
            if not (vec[1] == 0 and num_plagiarised <= 0):
                conf_dataset.addSample(vec[0], vec[1])

        f = open(self.dataset_filepath, 'wb')
        cPickle.dump(conf_dataset, f)
        print 'dumped dataset file'

        return conf_dataset
Example #32
    def train(self, pos_samples):
        def sig_gen_cb(left, right):
            samples = [
                pos_samples[s] for s in left['samples'] + right['samples']
            ]
            new_sig = bayes.Bayes(minlen=self.minlen,
                                  kmin=self.kmin,
                                  kfrac=self.kfrac,
                                  prune=True,
                                  statsfile=self.statsfile,
                                  threshold_style='min',
                                  max_fpos=self.max_fp_count,
                                  training_trace=self.fpos_training_streams)

            new_sig.train(samples)
            #                score = min([new_sig.score(s) for s in samples])
            score = new_sig.threshold
            token_scores = new_sig.token_scores.values()
            if self.max_tokens_in_est:
                token_scores.sort(lambda x, y: cmp(y, x))
                score = sum(token_scores[:self.max_tokens_in_est])
            else:
                score = sum(token_scores)
            return (new_sig, score)

        import cluster
        self.clusters = cluster.cluster(
            sig_gen_cb,
            self.spec_threshold,
            pos_samples,
            max_fp_count=self.max_fp_count,
            fpos_training_streams=self.fpos_training_streams,
            bound_similarity=self.bound_similarity)

        if self.threshold_style != 'min':
            for c in self.clusters:
                if c['sig']:
                    c['sig'].set_threshold()

        sigs = []
        for c in self.clusters:
            if len(c['samples']) >= self.min_cluster_size:
                sigs.append(c['sig'])
        return sigs
Example #33
    def _get_clusters(self, nbins=36):
        clusters = cluster.cluster(self.path, key=self.key, nbins=nbins)

        def _name_cluster(idx):
            if idx < 10:
                return str(idx)
            elif idx < nbins:
                return chr(idx - 10 + 65)
            elif idx == nbins:
                return '|'

        out = {}

        for idx, segs in clusters.items():
            out[_name_cluster(int(idx))] = [
                self._segments[int(aidx)] for st, dur, aidx in segs
            ]

        return out
Example #34
def clustering(fcallstacks_pool, show_plot, total_time, delta, bound):
    import math
    
    ''' 1. Preparing data '''
    data=[]
    for cs in fcallstacks_pool:
        data.append([
            cs.repetitions[cs.rank], 
            cs.instants_distances_mean])

    normdata=normalize_data(data)
    #plot_data(normdata)
    
    ''' 2. Perform clustering '''
    db = DBSCAN(eps=constants._eps, min_samples=constants._min_samples).fit(normdata)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels=db.labels_

    ''' 3. Creating cluster objects '''
    nclusters = len(set(labels)) - (1 if -1 in labels else 0)
    clusters_pool=[]
    for i in range(0, nclusters):
        clusters_pool.append(cluster(i))
                
    assert len(labels) == len(fcallstacks_pool)
    for i in range(0,len(labels)):
        callstack_cluster_id=labels[i]
        fcallstacks_pool[i].cluster_id=callstack_cluster_id
        if fcallstacks_pool[i].cluster_id == -1: # Owned by no cluster
            continue

        clusters_pool[callstack_cluster_id].add_callstack(fcallstacks_pool[i])

    ''' 4. Show plots '''
    #if show_plot:
    show_plot_thread=multiprocessing.Process(
            target=show_clustering,
            args=(data, fcallstacks_pool, labels, core_samples_mask, nclusters, 
                total_time, delta, bound))
    #show_plot_thread.start()

    return clusters_pool, show_plot_thread
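
The clustering routine above normalises its two features before running DBSCAN via a normalize_data helper that is not shown. A minimal sketch, assuming simple per-column min-max scaling (the original project may scale differently):

import numpy as np

def normalize_data(data):
    arr = np.asarray(data, dtype=float)
    lo = arr.min(axis=0)
    span = arr.max(axis=0) - lo
    span[span == 0] = 1.0  # keep constant columns from dividing by zero
    return (arr - lo) / span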
Example #35
def get_plagiarism_passages(text, atom_type, features, cluster_method='none', k=2):
    '''
    Return a list of passages, each of which contains
    a starting/ending index, its text, its atom_type, and a dictionary of
    its features
    '''
    # Extract passage objects (including their feature vectors)
    feature_extractor = FeatureExtractor(text)
    passages = feature_extractor.get_passages(features, atom_type)
    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)
    
    # List of passages with plag. conf. set
    return passages
Example #36
def get_plagiarism_passages(text, atom_type, features, cluster_method='none', k=2):
    '''
    Return a list of passages, each of which contains
    a starting/ending index, its text, its atom_type, and a dictionary of
    its features
    '''
    # Extract passage objects (including their feature vectors)
    feature_extractor = FeatureExtractor(text)
    passages = feature_extractor.get_passages(features, atom_type)
    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)
    
    # List of passages with plag. conf. set
    return passages
Example #37
def main():
    # matplotlib.use('qt5agg')
    import matplotlib.pyplot as plt
    from matplotlib.patches import Rectangle

    init_model()
    MAT_DIR = './mat/test'
    LABEL_DIR = './label/test'
    for dirpath, dirnames, filenames in os.walk(MAT_DIR):
        print(dirpath)
        for filename in filenames:
            if filename == 'full.mat':
                data = sio.loadmat(os.path.join(dirpath, filename))
                img = data['data']
                centers = detection(img)
                img_id = dirpath.split('/')[-1]
                label_file = os.path.join(LABEL_DIR, img_id + '.mat')
                labels = sio.loadmat(label_file)['label']
                distance = (lambda x1, y1, x2, y2: abs(x1 - x2) + abs(y1 - y2))

                centers = cluster(centers)

                TP = 0
                for x, y in labels:
                    for x_, y_ in centers:
                        if distance(x, y, x_, y_) < 36:
                            TP += 1
                            break
                precision = float(TP) / len(centers)
                recall = float(TP) / len(labels)
                f_score = 2 * (precision * recall) / (precision + recall)
                six.print_(precision, recall, f_score)

                f = open(dirpath.split('/')[-1] + '-predict.txt', 'w')
                for x, y in centers:
                    f.write(str(x) + ' ' + str(y) + '\n')
                f.close()
                f = open(dirpath.split('/')[-1] + '-label.txt', 'w')
                for x, y in labels:
                    f.write(str(x) + ' ' + str(y) + '\n')
                f.close()
Example #38
def main():
    # matplotlib.use('qt5agg')
    import matplotlib.pyplot as plt
    from matplotlib.patches import Rectangle

    init_model()
    MAT_DIR = './mat/test'
    LABEL_DIR = './label/test'
    for dirpath, dirnames, filenames in os.walk(MAT_DIR):
        print(dirpath)
        for filename in filenames:
            if filename == 'full.mat':
                data = sio.loadmat(os.path.join(dirpath, filename))
                img = data['data']
                centers = detection(img)
                img_id = dirpath.split('/')[-1]
                label_file = os.path.join(LABEL_DIR, img_id + '.mat')
                labels = sio.loadmat(label_file)['label']
                distance = (lambda x1, y1, x2, y2: abs(x1 - x2) + abs(y1 - y2))

                centers = cluster(centers)

                TP = 0
                for x, y in labels:
                    for x_, y_ in centers:
                        if distance(x, y, x_, y_) < 36:
                            TP += 1
                            break
                precision = float(TP) / len(centers)
                recall = float(TP) / len(labels)
                f_score = 2 * (precision * recall) / (precision + recall)
                six.print_(precision, recall, f_score)

                f = open(dirpath.split('/')[-1] + '-predict.txt', 'w')
                for x, y in centers:
                    f.write(str(x) + ' ' + str(y) + '\n')
                f.close()
                f = open(dirpath.split('/')[-1] + '-label.txt', 'w')
                for x, y in labels:
                    f.write(str(x) + ' ' + str(y) + '\n')
                f.close()
Example #39
    def train(self, pos_samples):
            def sig_gen_cb(left, right):
                samples = [pos_samples[s] for s in left['samples'] + 
                          right['samples']]
                new_sig = bayes.Bayes(minlen=self.minlen, 
                                      kmin=self.kmin, kfrac=self.kfrac, 
                                      prune=True, 
                                      statsfile=self.statsfile,
                                      threshold_style='min',
                                      max_fpos=self.max_fp_count,
                                      training_trace=self.fpos_training_streams)
                                  
                new_sig.train(samples)
#                score = min([new_sig.score(s) for s in samples])
                score = new_sig.threshold
                token_scores = new_sig.token_scores.values()
                if self.max_tokens_in_est:
                    token_scores.sort(lambda x,y: cmp(y,x))
                    score = sum(token_scores[:self.max_tokens_in_est])
                else:
                    score = sum(token_scores)
                return (new_sig, score)

            import cluster
            self.clusters = cluster.cluster(sig_gen_cb, self.spec_threshold, 
                                   pos_samples, 
                                   max_fp_count=self.max_fp_count,
                                   fpos_training_streams=self.fpos_training_streams,
                                   bound_similarity=self.bound_similarity)

            if self.threshold_style != 'min':
                for c in self.clusters:
                    if c['sig']:
                        c['sig'].set_threshold()

            sigs = []
            for c in self.clusters:
                if len(c['samples']) >= self.min_cluster_size:
                    sigs.append(c['sig'])
            return sigs
Example #40
def test(count=1300, k=3, mx=True, fc_done=None, nxt=False):
    global dts

    #dts = [[tdt[0] **2,tdt[1]**2] for tdt in dts]
    #dt.show_pt(dts,c=dt.color(0), s= 1)
    if fc_done is None:
        fc_done = cut_done  # cluster.DefaultDone(count)

    rst = cluster.k_means(dts, k, fc_done, cluster.default_dst, mx)

    avgs = rst[0]
    dtss = rst[1]
    #avgs = rst[2]
    print len(avgs), len(dtss)
    for i in xrange(len(avgs)):
        avg = avgs[i]
        dt.draw([avg], c=dt.color(i), s=30)
        tdts = dtss[i]
        print len(tdts)
        if len(tdts) == 0:
            continue
        dt.draw(tdts, c=dt.color(i), s=1)
    dt.show()
    if not nxt:
        return None
    #return None
    tree = cluster.cluster(dts, k, cluster.min_dsts)
    print "done tree"
    sets = cluster.tree2sets(tree)
    #sets = cluster.cut_tree(tree,k,cluster.min_cost)
    print "done cut"
    print len(sets)
    for i in xrange(len(sets)):
        tdts = sets[i]
        print len(tdts)
        if len(tdts) == 0:
            continue
        dt.draw(tdts, c=dt.color(i), s=1)
    dt.show()
    return sets
Example #41
    def construct_confidence_vectors_dataset(self, reduced_docs, features, session):
        from cluster import cluster
        conf_dataset = SupervisedDataSet(len(features), 1)

        confidence_vectors = []
        num_trues = 0
        for feature in features:
            vi = 0
            for doc in reduced_docs:
                feature_vectors = doc.get_feature_vectors([feature], session)
                confidences = cluster("outlier", 2, feature_vectors, center_at_mean=True, num_to_ignore=1, impurity=.2)
                for i, confidence in enumerate(confidences, 0):
                    if len(confidence_vectors) <= vi:
                        confidence_vectors.append([[], 0])
                    if doc.span_is_plagiarized(doc._spans[i]):
                        t = 1
                        num_trues += 1
                    else:
                        t = 0
                    confidence_vectors[vi][0].append(confidence)
                    confidence_vectors[vi][1] = t
                    vi += 1

        num_plagiarised = num_trues / len(features)
        print num_plagiarised

        shuffle(confidence_vectors)
        for vec in confidence_vectors:
            if vec[1] == 0:
                num_plagiarised -= 1
            if not (vec[1] == 0 and num_plagiarised <= 0):
                conf_dataset.addSample(vec[0], vec[1])

        f = open(self.dataset_filepath, 'wb')
        cPickle.dump(conf_dataset, f)
        print 'dumped dataset file'

        return conf_dataset
Example #42
def MProgram(Engine, Load, Key, Length, Put):
	print u'Basic modules loaded; you searched for:', Key
	iey=Key.decode("GB18030").encode("UTF-8")
	from search import search
	group = search(Engine, iey, Length)
	key = ['' for col in range(len(group[0]))]

	if (Load=='Goose'): from load_Goose import load
	else: from load_BeautifulSoup import load
	ret = load(group[0],4)
	
	from cluster import cluster
	type=cluster(ret[0])
	Key=Key.decode("GB18030")

	if Put=="SQL":
		from sql import input
		input(type,ret[1],group[1], Engine, Load, Key, Length)
		print u'Results have been written to the database'
	elif Put=="Text":
		from output import output
		output(type,ret[1],group[1], Engine, Load, Key, Length)
		print u'Results have been written to results\Index_'+Engine+'_'+Load+'_'+Key+'_'+Length+'.htm'
Example #43
def run(filename, min_size, max_size, dist):
    noun_dict = noun_extractor.get_nouns(filename)
    (synset_list, synset_dict) = cluster.get_synset_list(noun_dict)
    matrix = cluster.gen_sim_matrix(synset_list)
    clustering = cluster.format_clustering(cluster.cluster(matrix),
                                           synset_list)
    clusters = cluster.get_clusters(clustering,
                                    synset_list,
                                    min_size,
                                    max_size,
                                    dist=dist)
    clusters = filter(lambda x: x[0] is not None, clusters)
    cluster_counts = cluster.get_cluster_counts(clusters, synset_dict)
    # sort clusters by noun counts, most frequent first
    sorted_clusters = [
        x[1] for x in sorted(enumerate(clusters),
                             key=lambda x: cluster_counts[x[0]],
                             reverse=True)
    ]
    hypernyms = filter(lambda x: x is not None,
                       map(lambda x: lca(x), sorted_clusters))
    return Classification(noun_dict, synset_list, synset_dict, matrix,
                          clustering, sorted_clusters, hypernyms)
Example #44
def main():
	data_from_pickle = 0
	projection_from_pickle = 1
	clusters_from_pickle = 1

	if data_from_pickle:
		uber_df, street_df, street_graph, node_coord_dict, coord_node_dict, \
			edge_dict, coord_lookup, transition_graph, trans_edge_dict, \
			edge_trans_dict, trans_dict = load_data.from_pickle()
	else:
		uber_df, street_df, street_graph, node_coord_dict, coord_node_dict, \
			edge_dict, coord_lookup, transition_graph, trans_edge_dict, \
			edge_trans_dict, trans_dict = load_data.load_fresh()

	''' apply Kalman filter first pass here, fix large errors '''

	if projection_from_pickle:
		print 'reading in projected uber_df from pickle...'
		uber_df = pickle.load(open('../pickles/uber_df_projected.pkl'))
		print 'read projected uber_df'
	else:
		uber_df = project.project(uber_df, street_graph, transition_graph, 
			node_coord_dict, edge_dict, edge_trans_dict, coord_lookup)
		pickle.dump(uber_df, open('../pickles/uber_df_projected.pkl','wb'))

	''' apply Kalman filter second pass here? fix small errors and re-project onto edges? '''

	if clusters_from_pickle:
		uber_df, centroids = cluster.from_pickle()
	else:
		uber_df, centroids = cluster.cluster(uber_df)

	transition_graph = cluster_graphs.preadjust_transweights(uber_df, edge_dict, transition_graph)
	pickle.dump(transition_graph, open('../pickles/transition_graph_update.pkl','wb'))
	cgraphs = cluster_graphs.make_cluster_graphs(centroids, 
		transition_graph, uber_df, edge_dict)
	pickle.dump(cgraphs, open('../pickles/cgraphs.pkl','wb'))
Example #45
    def filterClusters(self):
        if len(self.pairwiseDict)==0:
            self.pairwise()
        amap = AAmap()

        for i in xrange(0,len(self.atoms)):
            c=cluster(self.pdb, self.top, self.pfam, '', '', self.seqheader, '', '', self.center, self.cutoff, self.scutoff, self.flag, 1.0, self.desc)
            c.addNeighbor(amap, self.atoms[i],i) # put itself in first
            nbnum=0
            for j in xrange(0,len(self.atoms)):
                key= "%d-%d" % (i, j)
                if (self.pairwiseDict[key] <= self.cutoff) and (abs(i-j) >= self.scutoff):
                    c.addNeighbor(amap, self.atoms[j], j)
                    nbnum=nbnum+1
                    c.thetaPhi.append(self.calculateThetaPhi(self.atoms[i], self.atoms[j]))
            if nbnum<self.nbcutoff: 
                continue
                
            c.pdbidx=c.pdbidx.lstrip() # will change meanDist
            c.pdbResSeq=c.pdbResSeq.lstrip()
            meanDist = self.clusterMeanDist(c)
            if meanDist < 5.8:
                print ('%s,%0.2f,%s,%s,%s,%s') % (self.pdb, meanDist, ''.join(sorted(c.str)), ''.join(sorted(c.typeStr)), c.pdbResSeq, self.getSphericalStr(c))
                self.clusters.append(c)
Example #46
from xlwt import *
from re import *
import re
import os
import shutil
import textExtractor
import cluster

# input
# the path where the jobs would lie should be announced
clusterName = 'Tsinghua100'
clusterPath = '/WORK/newGroupAdditivityFrog2/banana/validation_g_M06'
jobsPerSlot = 12

# constants
cluster1 = cluster.cluster(clusterName, clusterPath)
cluster1._g09D01=True 

pattern_logFile = re.compile('^(C[0-9]*H[0-9]*_*[0-9]*_+[r0-9]+_+[CO0-9]+).*\.log$')
pattern_fileConf = re.compile('^(C[0-9]*H[0-9]*_[0-9]*_[0-9]+)_[0-9]+_.*$')
pattern_gjfFile = re.compile('^(C[0-9]*H[0-9]*_*[0-9]*_+[r0-9]+_+[CO0-9]+).*\.gjf$')
pattern_multi = re.compile('^.*Multiplicity = ([0-9]+).*$')
pattern_optimized = re.compile('^.*Optimized Parameters.*$') 
pattern_standard = re.compile('^.*Standard orientation:.*$') 
pattern_input = re.compile('^.*Input orientation:.*$') 
pattern_endline = re.compile('^.*---------------------------------------------------------------------.*$')
# pattern_energy = re.compile('^.*Sum of electronic and zero-point Energies= *(-?[0-9]+\.[0-9]+).*$')
pattern_energy = re.compile('^.*SCF Done:  E\([RU]B3LYP\) = *([\-\.0-9Ee]+) +A\.U\. after.*$')
pattern_end = re.compile('^.*Normal termination of Gaussian 09.*$')

# variables
Example #47
import matplotlib as mpl
import BUM
import neuropower
import cluster
import peakdistribution
import simul_multisubject_fmri_dataset
import model

EXDIR = sys.argv[1]
FIGDIR = sys.argv[2]

exc = 2

maskfile = os.path.join(EXDIR,"Mask.nii")
SPM = nib.load(os.path.join(EXDIR,"Zstat1.nii")).get_data()
peaks = cluster.cluster(SPM,exc)

# compute P-values

pvalues = np.exp(-exc*(np.array(peaks.peak)-exc))
pvalues = [max(10**(-6),t) for t in pvalues]
peaks['pval'] = pvalues

# estimate model

bum = BUM.bumOptim(peaks['pval'].tolist(),starts=10)
modelfit = neuropower.modelfit(peaks.peak,bum['pi1'],exc=exc,starts=10,method="RFT")

# predict power

thresholds = neuropower.threshold(peaks.peak,peaks.pval,FWHM=8,mask=nib.load(maskfile),alpha=0.05,exc=exc)
Example #48
import numm
import random

R = 44100
# PADDING = R / 4                 # frames between segments
# SOURCE = 'snd/Dance_A.wav'
SOURCE = 'snd/Duran_A.wav'
NBINS = 50

cur_cluster = 0
cluster_idx = 0
paused = False
frame_idx = 0

audio = numm.sound2np(SOURCE)
clusters = cluster.cluster(SOURCE, NBINS)

for c in clusters.values():
    random.shuffle(c)

def get_segment(cluster, idx):
    idx = idx % len(clusters[cluster])
    start, duration = clusters[cluster][idx]
    return audio[int(R*start):int(R*(start+duration))]

def audio_out(a):
    global frame_idx, cluster_idx, paused

    if paused:
        paused = False
        return
Example #49
__author__ = 'Nhuy'

import sys
from cluster.cluster import *

cluster(sys.argv[1], sys.argv[2], int(sys.argv[3]))
Example #50
def main(argv):
	###########################################################################
	# figure out the input
	###########################################################################
	try:
		opts, args = getopt.getopt(argv, "hptk:d:s:l:")
	except getopt.GetoptError:
		print "Usage: main.py [-h] [-p] [-t] [-k #clusters] "\
				"[-d downsample rate] [-s save file] [-l load file] directory"
		sys.exit(2)

	# initial parameters for specific methods
	show = False
	trans = False
	k = 4
	down = 1
	save = None
	load = None
	ml = None

	# options
	for opt, arg in opts:
		# help function
		if opt == '-h':
			print "Usage: main.py [-h] [-p] [-t] [-k #clusters] "\
				"[-d downsample rate] [-s save file] [-l load file] directory"
			sys.exit()

		# show pictures of clustering and classification
		elif opt == '-p':
			show = True

		# translate NITFs
		elif opt == '-t':
			trans = True

		# number of clusters
		elif opt == '-k':
			if int(arg) <= 0:
				print "Error: k must be positive"
				sys.exit(2)
			k = int(arg)

		# downsample rate
		elif opt == '-d':
			if int(arg) < 0:
				print "Error: downsample rate cannot be negative"
				sys.exit(2)
			down = int(arg)

		# save file
		elif opt == '-s':
			if arg == '':
				print "Error: save file must have a name"
				sys.exit(2)
			save = arg

		# load file
		elif opt == '-l':
			if arg == '':
				print "Error: load file must have a name"
				sys.exit(2)
			load = arg

		# unhandled option
		else:
			assert False, "Error: unhandled option"

	# image directory
	if len(args) != 1:
		print "Usage: main.py [-h] [-p] [-t] [-k #clusters] "\
				"[-d downsample rate] [-s save file] [-l load file] directory"
		sys.exit(2)
	folder = args[0]

	###########################################################################
	# translate the NITFs to TIFs if needed
	###########################################################################
	if trans:
		# translate
		high = translate(folder)

		# give user time to make first batch of training/testing data
		print "Waiting for user to draw first batch of training/testing data."
		raw_input("Press Enter to continue...")
		
		# cluster the TIFs
		allImages = cluster(folder, high, show=True)

		# classify crop fields
		allImages, results, ml = classify(folder, allImages, high, k=k, \
																down=down)

		# save the file
		if save is not None:
			saveImages(folder, allImages, save)

		# if the classification is satisfactory return results. Otherwise, 
		# unsatisfactory results so create masks.
		answer = raw_input("Is this classification satisfactory? y/n\n")
		while True:
			if answer == 'y':
				return results
			elif answer == 'n':
				createMask(folder, allImages)
				break
			else:
				answer = raw_input("Please input 'y' or 'n' and press Enter\n")

	###########################################################################
	# or if there is a save file ready for use
	###########################################################################
	elif load is not None and save is None:
		# load the image list
		allImages = loadImages(folder, load)

		# print the error rates from before the save
		for img in allImages:
			# dont print anything if no error rates are calculated
			if np.sum(img.error) == 0.0:
				continue

			# print error rates for each cluster
			print "Error rates of "+img.name+": "
			for i in xrange(k):
				print "cluster"+str(i)+": "+str(img.error[i])

			# total error rate for image
			totalError = np.sum(img.error)
			print "Total error rate: " + str(totalError) + '\n'

	###########################################################################
	# or if there is not a save file ready, and/or translation is not required
	###########################################################################
	else:
		# give user time to make first batch of training/testing data
		print "Waiting for user to draw first batch of training/testing data."
		raw_input("Press Enter to continue...")

		# cluster the TIFs
		allImages = cluster(folder, show=show, k=k, down=down)
	
		# classify crop fields
		allImages, results, ml = classify(folder, allImages, k=k, down=down, \
											show=show)

		# save the files
		if save is not None:
			saveImages(folder, allImages, save)

		# if the classification is satisfactory return results. Otherwise, 
		# unsatisfactory results so create masks.
		answer = raw_input("Is this classification satisfactory? y/n\n")
		while True:
			if answer == 'y':
				return results
			elif answer == 'n':
				createMask(folder, allImages)
				break
			else:
				answer = raw_input("Please input 'y' or 'n' and press Enter\n")
	
	###########################################################################
	# iteratively classify new training data based on errors
	###########################################################################
	while True:
		# calculate new number of polygons required for each cluster
		image.calculatePolygons(allImages, k, N=20)

		# wait for the user to create more T and Q files based on calculations
		print "Waiting for user to draw new training/testing data."
		raw_input("Press Enter to continue...")

		# reclassify
		if trans:
			allImages, results, ml = classify(folder, allImages, high, k=k, \
												ml=ml, show=show)
		else:
			allImages, results, ml = classify(folder, allImages, k=k, ml=ml,\
												show=show)

		# if classification is satisfactory, then return results
		answer = raw_input("Is this classification satisfactory? y/n\n")
		while True:
			if answer == 'y':
				return results
			elif answer == 'n':
				break
			else:
				answer = raw_input("Please input 'y' or 'n' and press Enter\n")
Example #51
def main(fileNames, target='bank'):
    maxFeatures = 10
    windowSize = 20

    freqCount = {}
    targetAppearances = []
    wordVectors = {}
    
    ##
    ##  Choose featureset by analyzing data.
    ##  First go through dataset and create a list of
    ##  TargetAppearance objects
    ##
    
    #filePathPrefix = basedir #'.'
    #try:
    #    os.chdir(filePathPrefix)
    #except OSError, e:
    #    print e
    #    
    #fileNames = os.listdir(os.curdir)
    
    ##
    ##  perform the analysis for each datafile in the directory
    ##  during this step we also collect all the TargetAppearance
    ##  objects and contextWindows
    ##
    startTime = time.time()
    for file in fileNames:
        if not file.endswith('.txt'): continue
        print 'opening file:  ', file
        f = open(file, 'r')
        targetAppearances = findAppearances(f, target, windowSize,
                                            targetAppearances)
        f.close()      ##  play nice with others...
    
    stopTime = time.time()
    print ('**Training Phase --> datafile analysis time:  %.3f seconds.' %
           (stopTime - startTime))
    print ('**Training Phase --> found %d instances of target word:  %s' %
           (len(targetAppearances), target))
    ##  once we have analyzed all the data files, we analyze to choose
    ##  the features (# of dimensions) of the word vector space

    freqCount = analyzeLocalDistribution(f, target, windowSize,
                                         targetAppearances)

    ##  freqList is a purely frequency driven selection
    if freqCount:
        featureList = freqBasedLocalSelection(freqCount, maxFeatures)
    else:
        print 'No occurrences of target word:  ', target, '.'

    ##
    ##  word vector creation
    ##

    startTime = time.time()
    for file in fileNames:
        if not file.endswith('.txt'): continue
        tempWordVectors = {}
        f = open(file, 'r')
        print 'wordVector:  opening ', file
        ##  derive the word vectors for that file
        tempWordVectors = createWordVector(f, windowSize, featureList)
        f.close()
        ##  add the resulting vectors to our total
        if tempWordVectors:
            if wordVectors:
                for key,val in tempWordVectors.items():
                    sumVectors(wordVectors[str(key)],
                               tempWordVectors[str(key)])
            else:
                wordVectors = tempWordVectors
  
    stopTime = time.time()
    print ('**Training Phase --> word vector creation time:  %.3f seconds.' %
           (stopTime - startTime))

    ##
    ##  context vector creation
    ##

    for tA in targetAppearances:
        tA.setContextVector(wordVectors)

    ##
    ##  sense cluster creation
    ##
    print '**Training Phase --> sense clustering'
    senses = []
    senses = cluster.cluster(targetAppearances)

    ###################################
    ##
    ##  testing phase
    ##
    ###################################

    tempWordVectors = {}
    testWordVectors = {}
    testAppearances = []
    startTime = time.time()
    for file in fileNames:
        if not file.endswith('.tst'): continue
        print 'opening file:  ', file
        f = open(file, 'r')
        ##  first find the appearances of the target word in the text
        testAppearances = findAppearances(f, target, windowSize,
                                          testAppearances)
        tempWordVectors = createWordVector(f, windowSize, featureList)
        f.close()      ##  play nice with others...
        if tempWordVectors:
            if testWordVectors:
                for key,val in tempWordVectors.items():
                    sumVectors(testWordVectors[str(key)],
                               tempWordVectors[str(key)])
            else:
                testWordVectors = tempWordVectors
    stopTime = time.time()

    print ('**Testing Phase --> file analysis time:  %.3f seconds.' %
           (stopTime - startTime))

    ##
    ##  context vector creation
    ##

    for tA in testAppearances:
        tA.setContextVector(wordVectors)
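
# A minimal, hypothetical invocation of main() above (file names are
# illustrative): training text is read from the *.txt files in the list,
# test text from the *.tst files, and 'bank' is the ambiguous target word.
#
#   main(['corpus_a.txt', 'corpus_b.txt', 'heldout.tst'], target='bank')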
Example #52
0
# assumed imports for this snippet; cluster() is the external 2-means-plus-
# KS-test helper used below (ks, centroids, labels = cluster(r)) and is not
# defined here
from numpy import ones, zeros, where, vstack, sort, delete
from scipy.cluster.vq import kmeans2


def ksplit(data_list,ks_crit):

    # initial 2-means split of the full data set
    centroids,labels_init = kmeans2(data_list,2,minit='points')
    count = 1
    labels = labels_init
    cont = ones(1)
    # table of preset cluster ids per split level; -1 marks unused slots
    llist = zeros((5000,5000))
    llist[:] = -1
    list_list = [2,4,8,16,32,64,128,256,512,1024,2048,2**12]

    l = 0
    for w in range(len(list_list)):
        for k in range(list_list[w]):
            llist[w,k] = l
            l += 1
    j = 0
    p = 2

    # Runs until sum(cont) == 0; while sum(cont) >= 1 there are still
    # clusters left to be split further.

    while sum(cont) >= 1 :
        cont = zeros(2**count)
        m = 0

        for i in llist[j]:
            # entries of -1 mark unused slots in llist, so stop at the first one
            if i == -1:
                break
            # Sets r = positions of ith cluster
            r = data_list[labels_init == i]

            if len(r) == 0:
                continue
            # Feed the positions of the ith cluster to cluster(), which splits
            # them into 2 further clusters and returns a KS-test p-value along
            # with the new centroids and labels.
            ks,centroids_,labels = cluster(r)
            
            if ks > ks_crit:
                # This is the condition for a good cluster
                continue
            elif ks == -1:
                break
            else:
                # Re-labels the bifurcated clusters because of non-gaussianity
                a = where(labels_init == i)
                for w in range(2):
                    b = where(labels == w)
                    labels[b] = p
                    p += 1
                    
                centroids = vstack((centroids,centroids_))
                labels_init[a] = labels
                cont[m] = 1
            m += 1

        count += 1
        j += 1

    # Remove centroid rows whose label no longer appears in labels_init
    # (splitting re-labels points, leaving the parent clusters empty).
    # Work from the highest index down so earlier row indices stay valid.
    no_delete = sort(list(set(labels_init)))
    for k in range(int(max(no_delete)) + 1)[::-1]:
        if k not in no_delete:
            centroids = delete(centroids,k,0)
    
    return centroids,labels_init,ks_crit
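
# A small usage sketch for ksplit() (assumes the external cluster() helper is
# importable; the sample data and the KS threshold below are illustrative):
#
#   import numpy as np
#   pts = np.vstack([np.random.randn(200, 2),
#                    np.random.randn(200, 2) + 5.0])
#   centroids, labels, ks_crit = ksplit(pts, 0.05)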
Example #53
0
#get some help!

if sys.argv[-1] == '--help':
    print("""Usage:\n
\tmakeSnpFile.py inputFile.out --option\n
Options:
--help\t\tget this menu\n--snpdata\tprint out information about all snps
--stats\t\tget stats for each cluster
--dadi\t\tprint a formatted dadi snp file to stdout (use > file.txt to save to a file)""")
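
# Example invocations implied by the usage text above (the input file name is
# only a placeholder):
#
#   python makeSnpFile.py inputFile.out --stats
#   python makeSnpFile.py inputFile.out --dadi > dadi_snps.txt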


#just for fun, a way to print out data for all snps
if sys.argv[-1] == '--snpdata':
    for i in goodData:
        print(cluster(i).snpData)

#and for the 'stats' line
if sys.argv[-1] == '--stats':
    for i in goodData:
        print(cluster(i).stats)

header = "Inpop\tOutpop\tAllele1\t1\t2\t3\tAllele2\t1\t2\t3\tClstr\tPosition"

if sys.argv[-1] == '--dadi':
    print(header)
    for i in goodData:
        currentCluster = cluster(i).outputDadi()
        if currentCluster is not None:
            print(currentCluster)
Example #54
0
    def train(self, pos_samples):
        if self.tokenize_all:
            pos_samples = self._tokenize_samples(pos_samples)

        if self.do_cluster:
            def sig_gen_cb(left, right):
                lsig = left['sig']
                rsig = right['sig']

                # tokenize if possible
                if not lsig and not rsig and self.tokenize_pairs:
                    lsig = pos_samples[left['samples'][0]]
                    rsig = pos_samples[right['samples'][0]]
                    (lsig, rsig) = self._tokenize_samples([lsig, rsig])
                else:
                    if lsig:
                        lsig = lsig.lcs
                    else:
                        lsig = list(pos_samples[left['samples'][0]])
                    if rsig:
                        rsig = rsig.lcs
                    else:
                        rsig = list(pos_samples[right['samples'][0]])

                # find the common subsequence
                lcs = self._find_lcs(lsig, rsig)
                t = self._lcs_to_tuple(lcs)
                sig = TupleSig(lcs, t)
#                print self._lcs_to_regex(sig)

                # calculate a score for the resulting signature
                scores = []
                for token in t:
#                    prob = sigprob.regex_prob(token, 1000, stats=self.statsfile)[-1]
                    prob = sig_gen.est_fpos_rate(token, self.fpos_training_streams)
                    scores.append(- math.log(prob + 1e-300)/math.log(10))

                # using all the token scores overly favors signatures
                # with many tokens. Current fix is to only use most distinctive
                # tokens to calculate the score.
                if self.max_tokens_in_est:
                    scores.sort(reverse=True)   # descending order
                    score = sum(scores[:self.max_tokens_in_est])
                else:
                    score = sum(scores)
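                # e.g. a token whose estimated false-positive rate is 1e-6
                # contributes -log10(1e-6) = 6 to the score, so rarer (more
                # distinctive) tokens contribute more.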
                return (sig, score)

            import cluster
            clusters = cluster.cluster(sig_gen_cb, self.spec_threshold, 
                              pos_samples, max_fp_count = self.max_fp_count,
                              fpos_training_streams=self.fpos_training_streams,
                              min_cluster_size=self.min_cluster_size,
                              bound_similarity=self.bound_similarity)

            # return the tuple signatures for the final clusters
            self.tuple_list = []
            sigs = []
            for c in clusters:
                if len(c['samples']) >= self.min_cluster_size:
                    self.tuple_list.append(c['sig'].tuplesig)
                    sigs.append(c['sig'])
            self.clusters = clusters
            return sigs

        else:
            # Find a subsequence common to all the samples
            self.lcs = pos_samples[0]
            for sample in pos_samples[1:]:
                self.lcs = self._find_lcs(self.lcs, sample)

            # Return the final signature
            regex_string = self._lcs_to_regex(self.lcs)
            if self.use_fixed_gaps:
                return [RegexSig(regex_string)]
            else:
                return [TupleSig(self.lcs, self._lcs_to_tuple(self.lcs))]
Example #55
0
		x = self.Y[:,0]
		y = self.Y[:,1]		
		assert len(self.UP_pages.pages) == x.size
		for i in range(x.size):
			write_file.write(self.filename2Url(self.UP_pages.pages[i].path)+"\t"+ str(group_list[i]) +"\t"+str(x[i])+"\t"+str(y[i])+"\n")
	
	def filename2Url(self,filename):
		return filename.replace("_","/")


if __name__=='__main__':
	#UP_pages = allPages(["../Crawler/toy_data/users/","../Crawler/toy_data/questions/","../Crawler/toy_data/lists/"])
	#UP_pages = allPages(["../Crawler/crawl_data/Users/","../Crawler/crawl_data/Outlinks_U/","../Crawler/crawl_data/Noise/"])
	UP_pages = allPages(["../Crawler/crawl_data/Questions/"])
	v = visualizer(UP_pages)
	user_group = cluster()
	for i in range(len(UP_pages.ground_truth)):
		if UP_pages.ground_truth[i] == 1:
			page = UP_pages.pages[i]
			user_group.addPage(page)
	global_threshold = len(UP_pages.pages) * 0.9
	print len(user_group.pages)
	user_group.find_local_stop_structure(UP_pages.nidf,global_threshold)

	v.show(v.UP_pages.ground_truth,"ground_truth.test")
	
	'''
	UP_pages = allPages(["../Crawler/crawl_data/Questions/"])
	feature_matrix = []
	for page in UP_pages.pages:
		tfidf_vector = []
Example #56
0
def problem3(docs):
    doc_cluster = cluster(docs)

    predictions = doc_cluster.hierarchical()

    return predictions
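
# Note: problem3() builds a cluster object from the documents and returns the
# predictions produced by its hierarchical() method; the cluster class itself
# is not shown in this snippet.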
Example #57
0
# encoding: utf-8
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/lib')

import graphs
import cluster
import hamming

if __name__ == '__main__':
    # both the question number and the input file are required
    if len(sys.argv) < 3:
        print "usage: ./homework2.py [1|2|3] <inputfile> [param]"
        sys.exit(0)
    question = sys.argv[1]
    filename = sys.argv[2]
    param = sys.argv[3] if len(sys.argv) > 3 else None
    if question == '1':
        graph = graphs.parse(filename, undirected=True)
        mindist, clust = cluster.cluster(graph, int(param))
        for key,values in clust.items():
            print "{}:".format(key)
            for v in values:
                print "\t{}".format(v)
        print "Shortest distance: {}".format(mindist)
    if question == '2':
        vertices, radix = hamming.parse(open(filename, 'r'))
        clustering = hamming.cluster(vertices, radix)
        print "Found {} clusters".format(clustering.clusters)
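
# Example command lines implied by the argument handling above (file names and
# the numeric param for question 1 are placeholders):
#
#   ./homework2.py 1 edges.txt 4
#   ./homework2.py 2 bits.txt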
Example #58
0
spm = spm-enn.mu

ps = spm.flatten()
ps = [x for x in ps if x == x]   # keep only non-NaN values (NaN != NaN)


xn = np.arange(-10,10,0.01)
twocol = Paired_12.mpl_colors
plt.figure(figsize=(7,5))
plt.hist(ps,lw=0,facecolor=twocol[0],normed=True,bins=np.arange(-2,10,0.3),label="observed distribution")
plt.xlim([-2,10])
plt.ylim([0,0.5])
plt.plot(xn,stats.norm.pdf(xn),color=twocol[1],lw=3,label="null distribution")
plt.show()

peaks = cluster.cluster(spm)
peaks['pval'] = peakdistribution.peakp(peaks.peak.tolist())
bum = BUM.bumOptim(peaks["pval"].tolist(),starts=10)
modelfit = neuropower.TFpeakfit(peaks['peak'].tolist(),bum['pi1'])
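# At this point (given the neuropower helpers used above): peaks['pval'] holds
# per-peak p-values, bum['pi1'] the estimated proportion of truly active
# peaks, and modelfit['delta'] the fitted effect size of the active component;
# these quantities drive the fitted curves plotted below.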


xn = np.arange(-10,10,0.01)

twocol = Paired_12.mpl_colors
plt.figure(figsize=(7,5))
plt.hist(peaks['peak'].tolist(),lw=0,facecolor=twocol[0],normed=True,bins=np.arange(-2,10,0.3),label="observed distribution")
plt.xlim([-2,10])
plt.ylim([0,0.5])
plt.plot(xn,[(1-bum["pi1"])*peakdistribution.peakdens3D(p,1) for p in xn],color=twocol[3],lw=3,label="null distribution")
plt.plot(xn,[bum["pi1"]*peakdistribution.peakdens3D(p-modelfit['delta'],1) for p in xn],color=twocol[5],lw=3,label="alternative distribution")
plt.plot(xn,neuropower.mixprobdens(modelfit["delta"],bum["pi1"],xn),color=twocol[1],lw=3,label="fitted distribution")
        x2 = randn(members)+i
        y2 = randn(members)

        x = append(x1,x2)
        y = append(y1,y2)

        data_list = []

        for m in range(len(x)):
            data = array([x[m],y[m]])
            data_list.append(data)

        data_list = vstack(data_list)
        ks = kstest(x,'norm')
        ks1 = kstest(y1,'norm')
        ks,ad,cent,labels = cluster(data_list)
        KSstat = append(KSstat,ks)
        ADstat = append(ADstat,ad)
    
    AD_mu = mean(ADstat)
    KS_mu = mean(KSstat)
    
    ADmean = append(ADmean,AD_mu)
    KSmean = append(KSmean,KS_mu)
    
ADmean = ADmean/max(abs(ADmean))
KSmean = KSmean/max(abs(KSmean))

pylab.figure()
pylab.subplot(2,1,1)
pylab.plot(dist,ADmean,label='AD')