Example #1
def update(individual_classifications):
    #start by removing all temp files
    try:
        os.remove("/home/greg/Databases/condor_ibcc.out")
    except OSError:
        pass

    try:
        os.remove("/home/greg/Databases/condor_ibcc.mat")
    except OSError:
        pass

    try:
        os.remove("/home/greg/Databases/condor_ibcc.csv.dat")
    except OSError:
        pass

    with open("/home/greg/Databases/condor_ibcc.csv","a") as f:
        for u, s, b in individual_classifications:
            f.write(str(u)+","+str(s)+","+str(b)+"\n")


    print datetime.datetime.time(datetime.datetime.now())
    ibcc.runIbcc("/home/greg/Databases/condor_ibcc.py")
    print datetime.datetime.time(datetime.datetime.now())
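The try/os.remove/except OSError triplets that recur throughout these examples can be collapsed into one small helper. This is only a sketch; the helper is not part of the original code:

import os

def remove_if_exists(path):
    # delete the file if it is present; silently ignore a missing file
    try:
        os.remove(path)
    except OSError:
        pass

# e.g. remove_if_exists("/home/greg/Databases/condor_ibcc.out")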
Example #2
def __ibcc__2(results_dict,users_per_subject):
    # results_dict must be a dictionary mapping each subject ID to its list of found clusters
    # users_per_subject must be passed in separately, since a user may have viewed a subject without marking anything
    # get a list of all of the users
    assert type(users_per_subject) == dict
    global_users = []
    for u_list in users_per_subject.values():
        for u in u_list:
            if not(u in global_users):
                global_users.append(u)

    things_in_subject = {}
    off_by_one_clusters = []
    things_list = []
    thing_index = 0

    for zooniverse_id in results_dict:
        things_in_subject[zooniverse_id] = []

        centers,clusters,users = results_dict[zooniverse_id]
        pairs = __find_closest__(centers,clusters,users,user_threshold=1,offset=thing_index)
        off_by_one_clusters.extend(list(pairs))

        for users_per_marking in users:
            things_in_subject[zooniverse_id].append(thing_index)

            # find out who saw or did not see this "thing" - out of everyone who viewed this subject
            t = []
            for u in users_per_subject[zooniverse_id]:
                if u in users_per_marking:
                    t.append((global_users.index(u),1))
                else:
                    t.append((global_users.index(u),0))

            things_list.append(t[:])
            thing_index += 1

    # run ibcc without combining any of the clusters
    with open(base_directory+"/Databases/base_ibcc.csv","wb") as f:
        f.write("a,b,c\n")
        for thing_index in range(len(things_list)):
            for user_index, marked in things_list[thing_index]:
                f.write(str(user_index)+","+str(thing_index)+","+str(marked)+"\n")
    __ibcc_init__("base")
    ibcc.runIbcc(base_directory+"/Databases/base_ibcc.py")

    confusions = []
    with open(base_directory+"/Databases/base_ibcc.mat") as f:
        for user_index, l in enumerate(f.readlines()):
            confusions.append([float(x) for x in l[:-1].split(" ")])

    for count,(c1,c2,overlap) in enumerate(off_by_one_clusters):
        print things_list[c1]
        print things_list[c2]

        users = zip(*things_list[c1])[0]
        for u in users:
            print confusions[u][2],confusions[u][3]

        break
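Most of these examples feed ibcc.runIbcc a three-column CSV, usually with the header a,b,c: a gap-free user index, a gap-free object (cluster or subject) index, and an integer score. A stand-alone sketch of that step - the function name and signature are illustrative, not from the repo:

def write_ibcc_input(triples, path):
    # triples: iterable of (user_name, object_id, score) tuples
    user_index = {}
    object_index = {}
    with open(path, "w") as f:
        f.write("a,b,c\n")
        for user, obj, score in triples:
            # assign consecutive indices in first-seen order, with no gaps
            u = user_index.setdefault(user, len(user_index))
            o = object_index.setdefault(obj, len(object_index))
            f.write(str(u) + "," + str(o) + "," + str(score) + "\n")
    return user_index, object_index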
Example #3
    def __ibcc__(self):
        for species in self.speciesList:
            #check to see whether or not this file exists
            if not(os.path.isfile(self.baseDir+"ibcc/"+species+"_ibcc.out"+str(self.cutOff))):
                ibcc.runIbcc(self.baseDir+"ibcc/"+str(species)+str(self.cutOff)+"config.py")



#i = IBCCsetup()
#i.__createConfigs__()
#i.__filterUserClassifications__()
#i.__ibcc__()
Example #4
    def __classify__(self):
        userNames = self.userDict.keys()
        subjectNames = self.subjectDict.keys()

        f = open(self.baseDir+"ibcc/input",'wb')
        for u in self.userDict:
            classifications = self.userDict[u].__getClassifications__()

            for (s,r) in classifications:
                f.write(str(userNames.index(u)) + "," + str(subjectNames.index(s)) + "," + str(r) + "\n")
        f.close()

        #now - write the config file
        f = open(self.baseDir+"ibcc/config.py",'wb')
        f.write("import numpy as np\nscores = np.array([0,1])\n")
        f.write("nScores = len(scores)\n")
        f.write("nClasses = 2\n")
        f.write("inputFile = '"+self.baseDir+"ibcc/input'\n")
        f.write("outputFile =  '"+self.baseDir+"ibcc/output'\n")
        f.write("confMatFile = '"+self.baseDir+"ibcc/confusion'\n")
        #f.write("nu0 = np.array([45.0,55.0])\n")
        f.close()

        ibcc.runIbcc(self.baseDir+"ibcc/config.py")
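The config file that __classify__ writes is itself a short Python module whose path is handed to ibcc.runIbcc. Written out directly it looks roughly like the following; the paths are placeholders and nu0 is left commented out, as above:

import numpy as np

scores = np.array([0, 1])            # possible response values
nScores = len(scores)
nClasses = 2                         # two target classes
inputFile = "/tmp/ibcc/input"        # the user,subject,score CSV written above
outputFile = "/tmp/ibcc/output"      # where the per-subject class probabilities go
confMatFile = "/tmp/ibcc/confusion"  # where the per-user confusion matrices go
# nu0 = np.array([45.0, 55.0])       # optional class prior counts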
Example #5
    pass

try:
    os.remove(data_directory + "/galaxy_zoo_ibcc.mat")
except OSError:
    pass

try:
    os.remove(data_directory + "/galaxy_zoo_ibcc.csv.dat")
except OSError:
    pass

import datetime
print datetime.datetime.time(datetime.datetime.now())
print base_directory + "/Databases/galaxy_zoo_ibcc.py"
ibcc.runIbcc(data_directory + "/galaxy_zoo_ibcc.py")
print datetime.datetime.time(datetime.datetime.now())

#read in gold standard data
pos0 = []
pos1 = []
pos2 = []
with open(data_directory + "/candels_t01_a00_positive.dat", "rb") as f:
    for l in f.readlines():
        pos0.append(l[:-1])
with open(data_directory + "/candels_t01_a01_positive.dat", "rb") as f:
    for l in f.readlines():
        pos1.append(l[:-1])
with open(data_directory + "/candels_t01_a02_positive.dat", "rb") as f:
    for l in f.readlines():
        pos2.append(l[:-1])
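One side note on the reads above: l[:-1] strips the trailing newline but also clips the last character of a final line that happens to lack one. A small, hedged alternative:

def read_ids(path):
    # strip only the newline and skip blank lines
    with open(path) as f:
        return [line.rstrip("\n") for line in f if line.strip()]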
Example #6
    def __signal_ibcc_majority__(self,split_ip_address=True):
        """
        run ibcc to determine which clusters are signal or noise

        use majority voting to determine priors

        :param split_ip_address: applies to user ids which are IP addresses - i.e. people who were not logged in.
        If True, each subject is treated completely separately: if IP address X marked subjects A and B, X is
        counted as two different people for those two classifications. There is no guarantee that the same IP is
        the same person, but treating it as such also throws away a lot of information, so this param allows
        exploring both options and comparing the results.
        :return:
        """
        # todo: implement a middle ground for split_ip_address where we treat the same ip address as the same person
        # todo: as long as the classifications are close enough together time wise

        # get all users who have viewed any of the subjects we are processing - also get the list of those who
        # did so while not logged in
        all_users = list(self.project_api.__all_users__())
        all_ips = list(self.project_api.__all_ips__())

        # global cluster count - across all images/subjects
        cluster_count = -1

        # need to give the ip addresses unique indices, so update ip_offset after every subject
        ip_offset = 0

        # needed for determining priors for IBCC
        real_animals = 0
        fake_animals = 0

        # needed for prior confusion matrix
        true_pos = []
        true_neg = []

        # intermediate holder variable
        # because ibcc needs indices to be nice and ordered with no gaps, we have to make two passes through the data
        to_ibcc = []

        # for each global cluster index, store which image/subject it is from and what its local index is
        # with respect to that subject
        self.global_to_local = []

        # print out the classifications and set up the priors using majority voting
        for zooniverse_id in self.clusterResults:
            if self.clusterResults[zooniverse_id] is None:
                continue

            # get the list of all the users who viewed this subject
            # and the ip addresses of every user who was not logged in while viewing the subjects
            users_per_subject = self.project_api.__users__(zooniverse_id)
            ips_per_subject = self.project_api.__ips__(zooniverse_id)

            # process each cluster (possible animal), one at a time
            # only the names of users who marked this cluster matter - the specific x,y points are irrelevant right now
            for local_index,user_per_cluster in enumerate(self.clusterResults[zooniverse_id][2]):
                # moving on to the next animal so increase counter
                # universal counter over all images
                cluster_count += 1

                # needed for determining priors for IBCC
                pos = 0
                neg = 0

                # note that the cluster with index cluster_count is from subject zooniverse_id
                self.global_to_local.append((zooniverse_id,local_index))

                # for this cluster, go through each user and see if they marked this cluster
                # check whether or not each user marked this cluster
                for user_id in users_per_subject:
                    # if the user was not logged in
                    if user_id in ips_per_subject:
                        # if we are considering the ip addresses of each user (i.e. those that were not logged in)
                        # separately for each image - assign a user index based only on this image
                        # use negative indices to differentiate ip addresses from logged-in users
                        # +1 ensures that we never get 0 - which is "both" positive and negative
                        if split_ip_address:
                            user_index = -(ips_per_subject.index(user_id)+ip_offset+1)
                        else:
                            # we are treating all occurrences of this ip address as being from the same user
                            user_index = -all_ips.index(user_id)-1
                    else:
                        # user was logged in
                        # todo: use bisect to increase speed
                        user_index = all_users.index(user_id)

                    # did the user mark this cluster or not?
                    if user_id in user_per_cluster:
                        to_ibcc.append((user_id,user_index,cluster_count,1))
                        pos += 1
                    else:
                        to_ibcc.append((user_id,user_index,cluster_count,0))
                        neg += 1

                # if a majority of people say that there is an animal - use this for prior values
                if pos > neg:
                    real_animals += 1

                    # for estimating the confusion matrix
                    true_pos.append(pos/float(pos+neg))
                else:
                    fake_animals += 1

                    true_neg.append(neg/float(pos+neg))

            ip_offset += len(ips_per_subject)

        # now run through again - this makes sure that all of the indices are ordered with no gaps,
        # since the user list was created by reading through all the users, even those who haven't annotated
        # any of the specific images we are currently looking at
        ibcc_user_list = []

        # this is also for other functions to be able to interpret the results
        self.ibcc_users = []

        for user,user_index,animal_index,found in to_ibcc:
            # can't use bisect or the indices will be out of order
            if not(user_index in ibcc_user_list):
                ibcc_user_list.append(user_index)
                self.ibcc_users.append(user)

        # write out the input file for IBCC
        with open(self.base_directory+"/Databases/"+self.alg+"_ibcc.csv","wb") as f:
            f.write("a,b,c\n")
            for marking_count,(user,user_index,animal_index,found) in enumerate(to_ibcc):
                # only the first 200 classifications are written out
                if marking_count == 200:
                    break
                i = ibcc_user_list.index(user_index)
                f.write(str(i)+","+str(animal_index)+","+str(found)+"\n")

        # create the prior estimate and the default confusion matrix
        prior = real_animals/float(real_animals + fake_animals)

        t = np.mean(true_pos)
        f = np.mean(true_neg)
        # what the weight should be
        # todo: should this be hard coded or set as a param?
        weight = 10

        # the confusion matrix cannot have any zero values
        confusion = [[max(int(t*weight),1),max(int((1-t)*weight),1)],[max(int((1-f)*weight),1),max(int(f*weight),1)]]

        # create the config file
        with open(self.base_directory+"/Databases/"+self.alg+"_ibcc.py","wb") as f:
            f.write("import numpy as np\n")
            f.write("scores = np.array([0,1])\n")
            f.write("nScores = len(scores)\n")
            f.write("nClasses = 2\n")
            f.write("inputFile = \""+self.base_directory+"/Databases/"+self.alg+"_ibcc.csv\"\n")
            f.write("outputFile = \""+self.base_directory+"/Databases/"+self.alg+"_signal.out\"\n")
            f.write("confMatFile = \""+self.base_directory+"/Databases/"+self.alg+"_ibcc.mat\"\n")
            f.write("nu0 = np.array(["+str(max(int((1-prior)*100),1))+","+str(max(int(prior*100),1))+"])\n")
            f.write("alpha0 = np.array("+str(confusion)+")\n")

        # start by removing all temp files
        try:
            os.remove(self.base_directory+"/Databases/"+self.alg+"_signal.out")
        except OSError:
            pass

        try:
            os.remove(self.base_directory+"/Databases/"+self.alg+"_ibcc.mat")
        except OSError:
            pass

        try:
            os.remove(self.base_directory+"/Databases/"+self.alg+"_ibcc.csv.dat")
        except OSError:
            pass

        # pickle.dump((big_subjectList,big_userList),open(base_directory+"/Databases/tempOut.pickle","wb"))
        ibcc.runIbcc(self.base_directory+"/Databases/"+self.alg+"_ibcc.py")
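The prior construction in __signal_ibcc_majority__ condenses to a small function: the majority votes set the class prior (nu0) and the mean agreement rates give the default confusion matrix (alpha0), with every entry clamped to at least 1 because, as noted above, the confusion matrix cannot contain zeros. A sketch that mirrors the arithmetic above; the function name is mine and it assumes at least one cluster of each kind was seen:

import numpy as np

def majority_priors(cluster_votes, weight=10):
    # cluster_votes: list of (pos, neg) vote counts, one pair per cluster
    real_animals, fake_animals = 0, 0
    true_pos, true_neg = [], []
    for pos, neg in cluster_votes:
        if pos > neg:
            real_animals += 1
            true_pos.append(pos / float(pos + neg))
        else:
            fake_animals += 1
            true_neg.append(neg / float(pos + neg))
    prior = real_animals / float(real_animals + fake_animals)
    t = np.mean(true_pos)   # mean agreement on clusters voted real
    f = np.mean(true_neg)   # mean agreement on clusters voted fake
    # neither prior may contain a zero entry
    nu0 = [max(int((1 - prior) * 100), 1), max(int(prior * 100), 1)]
    alpha0 = [[max(int(t * weight), 1), max(int((1 - t) * weight), 1)],
              [max(int((1 - f) * weight), 1), max(int(f * weight), 1)]]
    return nu0, alpha0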
Example #7
    def __signal_ibcc_gold__(self,global_indices,gold_standard_pts,split_ip_address=True):
        """
        uses gold standard from experts instead of priors based on majority voting
        :param global_indices:
        :param split_ip_address:
        :param gold_standard_pts: the list of global indices for which we are going to provide gold standard data
        using a negative index is a way of giving a false positive
        :return:
        """
        # intermediate holder variable
        # because ibcc needs indices to be nice and ordered with no gaps, we have to make two passes through the data
        to_ibcc = []


        # there will be some redundancy reading in the subject list - so keep track of the current subject_id
        # and only update when necessary
        users_per_subject = None
        ips_per_subject = None
        current_subject = None

        # we may skip over some indices if they correspond to gold standard points which no one marked and
        # are not being used as provided gold standard data
        actually_used_clusters = []

        for global_cluster_index,(subject_id,local_index) in enumerate(global_indices):
            # only update when necessary - when we have moved on to a new subject
            if subject_id != current_subject:
                # get the list of all the users who viewed this subject
                # and the ip addresses of every user who was not logged in while viewing the subjects
                users_per_subject = self.project_api.__users__(subject_id)
                # ips_per_subject = self.project_api.__ips__(subject_id)

                current_subject = subject_id

            if local_index is None:
                # in this case, we know that no user marked this animal
                # this is either provided gold standard data, or a test - in which case we should ignore
                # this data
                if not(global_cluster_index in gold_standard_pts):
                    continue
                else:
                    user_per_cluster = []
            else:
                user_per_cluster = self.clusterResults[subject_id][2][local_index]

            actually_used_clusters.append(global_cluster_index)

            for user_id in list(users_per_subject):
                # check to see if this user was logged in - if not, the user_id should be an ip address
                # if not logged in, we just need to decide whether to append the subject_id to the user_id,
                # which treats the same ip address on different subjects as completely different
                # users
                try:
                    socket.inet_aton(user_id)
                    if split_ip_address:
                        user_id += subject_id
                except (socket.error,UnicodeEncodeError) as e:
                    # logged in user, nothing to do
                    pass
                if user_id in user_per_cluster:
                    to_ibcc.append((user_id,global_cluster_index,1))
                else:
                    to_ibcc.append((user_id,global_cluster_index,0))

        # gives each user an index with no gaps in the list
        user_indices = []

        # write out the input file for IBCC
        with open(self.base_directory+"/Databases/"+self.alg+"_ibcc.csv","wb") as f:
            f.write("a,b,c\n")
            for user,cluster_index,found in to_ibcc:
                if not(user in user_indices):
                    user_indices.append(user)

                i = user_indices.index(user)
                j = actually_used_clusters.index(cluster_index)
                f.write(str(i)+","+str(j)+","+str(found)+"\n")

        #return user_indices,actually_used_clusters

        # create the config file
        with open(self.base_directory+"/Databases/"+self.alg+"_ibcc.py","wb") as f:
            f.write("import numpy as np\n")
            f.write("scores = np.array([0,1])\n")
            f.write("nScores = len(scores)\n")
            f.write("nClasses = 2\n")
            f.write("inputFile = \""+self.base_directory+"/Databases/"+self.alg+"_ibcc.csv\"\n")
            f.write("outputFile = \""+self.base_directory+"/Databases/"+self.alg+"_signal.out\"\n")
            f.write("confMatFile = \""+self.base_directory+"/Databases/"+self.alg+"_ibcc.mat\"\n")
            f.write("goldFile= \""+self.base_directory+"/Databases/"+self.alg+"_gold.csv\"\n")

        # start by removing all temp files
        try:
            os.remove(self.base_directory+"/Databases/"+self.alg+"_signal.out")
        except OSError:
            pass

        try:
            os.remove(self.base_directory+"/Databases/"+self.alg+"_ibcc.mat")
        except OSError:
            pass

        try:
            os.remove(self.base_directory+"/Databases/"+self.alg+"_ibcc.csv.dat")
        except OSError:
            pass

        # pickle.dump((big_subjectList,big_userList),open(base_directory+"/Databases/tempOut.pickle","wb"))
        ibcc.runIbcc(self.base_directory+"/Databases/"+self.alg+"_ibcc.py")

        return self.base_directory+"/Databases/"+self.alg+"_signal.out", user_indices,actually_used_clusters
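Both __signal_ibcc_*__ methods rebuild gap-free indices with list.index(), which is linear per lookup. A dict gives the same first-seen ordering in constant time; a sketch (names are mine):

def gapless_index(mapping, key):
    # assign the next consecutive index the first time key is seen
    if key not in mapping:
        mapping[key] = len(mapping)
    return mapping[key]

# usage mirroring the loop above, where to_ibcc holds (user, cluster_index, found):
# user_map, cluster_map = {}, {}
# for user, cluster_index, found in to_ibcc:
#     i = gapless_index(user_map, user)
#     j = gapless_index(cluster_map, cluster_index)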
Example #8
    os.remove(base_directory+"/Databases/condor_ibcc.out")
except OSError:
    pass

try:
    os.remove(base_directory+"/Databases/condor_ibcc.mat")
except OSError:
    pass

try:
    os.remove(base_directory+"/Databases/condor_ibcc.csv.dat")
except OSError:
    pass

#pickle.dump((big_subjectList,big_userList),open(base_directory+"/Databases/tempOut.pickle","wb"))
ibcc.runIbcc(base_directory+"/Databases/condor_ibcc.py")
ibcc_v = []
with open(base_directory+"/Databases/condor_ibcc.out","rb") as f:
    ibcc_results = csv.reader(f, delimiter=' ')

    for row in ibcc_results:
        ibcc_v.append(float(row[2]))

with open(base_directory+"/Databases/condor_ibcc.mat","rb") as f:
    ibcc_results = csv.reader(f, delimiter=' ')

    for row in ibcc_results:
        ibcc_v.append(float(row[2]))

for ii,zooniverse_id in enumerate(results_dict):
    print zooniverse_id
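For reference, the condor_ibcc.out file read above is parsed as space-separated rows: an object index followed by one probability per class, with row[2] being the positive class in these two-class runs. A stand-alone sketch of that read (the function name is mine):

import csv

def read_ibcc_output(path):
    probabilities = {}
    with open(path) as f:
        for row in csv.reader(f, delimiter=" "):
            if len(row) < 3:
                continue
            # column 0: object index; column 2: probability of the positive class
            probabilities[int(float(row[0]))] = float(row[2])
    return probabilities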
Example #9
    pass

try:
    os.remove(data_directory+"/galaxy_zoo_ibcc.mat")
except OSError:
    pass

try:
    os.remove(data_directory+"/galaxy_zoo_ibcc.csv.dat")
except OSError:
    pass

import datetime
print datetime.datetime.time(datetime.datetime.now())
print base_directory+"/Databases/galaxy_zoo_ibcc.py"
ibcc.runIbcc(data_directory+"/galaxy_zoo_ibcc.py")
print datetime.datetime.time(datetime.datetime.now())

#read in gold standard data
pos0 = []
pos1 = []
pos2 = []
with open(data_directory+"/candels_t01_a00_positive.dat","rb") as f:
    for l in f.readlines():
        pos0.append(l[:-1])
with open(data_directory+"/candels_t01_a01_positive.dat","rb") as f:
    for l in f.readlines():
        pos1.append(l[:-1])
with open(data_directory+"/candels_t01_a02_positive.dat","rb") as f:
    for l in f.readlines():
        pos2.append(l[:-1])
Example #10
            f.write(str(user_index)+","+str(subject_index)+","+str(ann)+"\n")

print "number of users " + str(len(user_ids))
print "number of gold labels " + str(len(list(gold_set)))
with open(baseDir+"Databases/supernova_ibcc.py",'wb') as f:
    f.write("import numpy as np\nscores = np.array([0,1])\n")
    f.write("nScores = len(scores)\n")
    f.write("nClasses =2\n")
    f.write("inputFile = '"+baseDir+"Databases/supernova_ibcc.csv'\n")
    f.write("outputFile =  '"+baseDir+"Databases/supernova_ibcc.out'\n")
    f.write("confMatFile = '"+baseDir+"Databases/supernova_ibcc.mat'\n")
    f.write("goldFile = '"+baseDir+"Databases/supernova_ibcc_gold.csv'\n")

os.remove(baseDir+"Databases/supernova_ibcc.csv.dat")

ibcc.runIbcc(baseDir+"Databases/supernova_ibcc.py")
print "done IBCC"
x_values = []
y_values = []
with open(baseDir+"Databases/supernova_ibcc.mat","rb") as f:
    reader = csv.reader(f,delimiter=" ")
    for user_index,r in enumerate(reader):
        count = classification_counts[user_index]

        if min(count) < 5:
            continue

        x = float(r[0])
        y = float(r[-1])

        x_values.append(x)
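The .mat confusion file is read the same way in several of these examples: one space-separated row of floats per user, indexed by enumeration order. A sketch of just that parse; which column maps to which confusion-matrix entry depends on the IBCC configuration, so no meaning is attached to the positions here:

def read_confusion_rows(path):
    # one row of floats per user, in file order
    rows = []
    with open(path) as f:
        for line in f:
            if line.strip():
                rows.append([float(x) for x in line.split()])
    return rows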
Example #11
try:
    os.remove(base_directory+"/Databases/penguins_ibcc.out")
except OSError:
    pass

try:
    os.remove(base_directory+"/Databases/penguins_ibcc.mat")
except OSError:
    pass

try:
    os.remove(base_directory+"/Databases/penguins_ibcc.in.dat")
except OSError:
    pass

ibcc.runIbcc(base_directory+"/Databases/penguins_ibcc_config.py")

print "done that"

total = 0
true_positives = []
false_positives = []
with open(base_directory+"/Databases/penguins_ibcc.out",'rb') as f:
    for l in f.readlines():
        penguin_index, neg_prob,pos_prob = l.split(" ")

        penguin = penguins[max_users][image_index][1][int(float(penguin_index))][0]

        #is this penguin "real", i.e. is it in the gold standard?
        if cluster_compare(gold_standard,[penguin,]) == []:
            #yes - penguin is real
Example #12
    f.write("import numpy as np\n")
    f.write("scores = np.array([0,1,2])\n")
    f.write("nScores = len(scores)\n")
    f.write("nClasses = 3\n")
    f.write("inputFile = \"/home/greg/Databases/galaxy_zoo_ibcc.csv\"\n")
    f.write("outputFile = \"/home/greg/Databases/galaxy_zoo_ibcc.out\"\n")
    f.write("confMatFile = \"/home/greg/Databases/galaxy_zoo_ibcc.mat\"\n")
    f.write("nu0 = np.array([40,40,10])\n")
    f.write("alpha0 = np.array([[5, 2, 2], [2, 5, 2], [3, 3, 3]])\n")

try:
    os.remove("/home/greg/Databases/galaxy_zoo_ibcc.out")
except OSError:
    pass

try:
    os.remove("/home/greg/Databases/galaxy_zoo_ibcc.mat")
except OSError:
    pass

try:
    os.remove("/home/greg/Databases/galaxy_zoo_ibcc.csv.dat")
except OSError:
    pass

import datetime
print datetime.datetime.time(datetime.datetime.now())
ibcc.runIbcc("/home/greg/Databases/galaxy_zoo_ibcc.py")
print datetime.datetime.time(datetime.datetime.now())

Example #13
def __ibcc__(results_dict,users_per_subject):
    # create a global index of all users
    global_users = []
    for u_list in users_per_subject.values():
        for u in u_list:
            if not(u in global_users):
                global_users.append(u)

    things_in_subject = {}
    things_list = []

    thing_index = 0
    off_by_one_clusters = []

    for zooniverse_id in results_dict:
        things_in_subject[zooniverse_id] = []

        centers,clusters,users = results_dict[zooniverse_id]
        pairs = __find_closest__(centers,clusters,users,user_threshold=1,offset=thing_index)
        off_by_one_clusters.extend(list(pairs))

        for users_per_marking in users:
            things_in_subject[zooniverse_id].append(thing_index)

            # find out who saw or did not see this "thing" - out of everyone who viewed this subject
            t = []
            for u in users_per_subject[zooniverse_id]:
                if u in users_per_marking:
                    t.append((global_users.index(u),1))
                else:
                    t.append((global_users.index(u),0))

            things_list.append(t[:])
            thing_index += 1

    # run ibcc without combining any of the clusters
    with open(base_directory+"/Databases/base_ibcc.csv","wb") as f:
        f.write("a,b,c\n")
        for thing_index in range(len(things_list)):
            for user_index, marked in things_list[thing_index]:
                f.write(str(user_index)+","+str(thing_index)+","+str(marked)+"\n")
    __ibcc_init__("base")
    ibcc.runIbcc(base_directory+"/Databases/base_ibcc.py")

    # now try merging each possible pair and running ibcc on the resulting setup
    # yes, this is going to be tedious and time-consuming - hope for a better implementation later on
    for count,(c1,c2,overlap) in enumerate(off_by_one_clusters):
        # most of the time, thing_index and thing_prime_index will be the same
        # but they can differ by one to account for the fact that we are skipping over c2
        thing_prime_index = 0

        print (c1,c2)

        with open(base_directory+"/Databases/merged_ibcc.csv","wb") as f:
            f.write("a,b,c\n")
            for thing_index in range(len(things_list)):
                #print (thing_index,thing_prime_index)
                if thing_index == c2:
                    # we are skipping this one
                    #print "skip"
                    pass
                else:
                    if thing_index == c1:
                        # merge
                        assert thing_index == thing_prime_index
                        assert len(list(overlap)) <= 1
                        #print zip(*things_list[c1])[0]
                        #print zip(*things_list[c1])[1]
                        #print zip(*things_list[c2])[1]
                        for (user_index1,marked1),(user_index2,marked2) in zip(things_list[c1],things_list[c2]):
                            assert user_index1 == user_index2
                            f.write(str(user_index1)+","+str(thing_prime_index)+","+str(marked1 or marked2)+"\n")
                    else:
                        # proceed as normal
                        # continue
                        for user_index, marked in things_list[thing_index]:
                            f.write(str(user_index)+","+str(thing_prime_index)+","+str(marked)+"\n")

                    thing_prime_index += 1

            #print thing_index
            #print thing_prime_index
            #assert thing_prime_index == (thing_index-1)

        __ibcc_init__("merged")
        ibcc.runIbcc(base_directory+"/Databases/merged_ibcc.py")
        p1 = load_ibcc_probabilities("base",c1)
        p2 = load_ibcc_probabilities("base",c2)
        p3 = load_ibcc_probabilities("merged",c1)
        print (p1,p2,p3)
        if p3 < (max(p1,p2)-0.01):
            break
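The merge step in the loop above reduces to an element-wise OR of the two clusters' marking lists, which works because both lists were built from the same ordered set of users for that subject. A sketch (the name is mine):

def merge_markings(markings_a, markings_b):
    # each argument is a list of (user_index, 0_or_1) over the same users, in the same order
    merged = []
    for (u1, m1), (u2, m2) in zip(markings_a, markings_b):
        assert u1 == u2
        merged.append((u1, m1 or m2))
    return merged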
Example #14
    def __runIBCC__(self):
        collection = self.db['merged_classifications'+str(self.cutoff)]

        self.user_list = []
        self.subject_list = []

        shutil.rmtree(self.baseDir+"ibcc")
        os.makedirs(self.baseDir+"ibcc")

        counter = -1

        for speciesGroup in self.species_groups:
            self.user_list = []
            self.subject_list = []

            count = []
            required_l = list(powerset(speciesGroup))
            prohibited_l = [[s for s in speciesGroup if not(s in r)] for r in required_l]

            counter += 1

            self.__createConfigFile(counter,len(required_l))
            ibcc_input_file = open(self.baseDir+"ibcc/"+str(counter)+".in","wb")


            for document in collection.find():
                user_name = document["user_name"]
                subject_zooniverse_id = document["subject_zooniverse_id"]
                user_species_list = document["species_list"]

                if not(subject_zooniverse_id in self.nonempty_list):
                    continue

                #IBCC requires an int ID for both user and subject - so convert
                if user_name in self.user_list:
                    userID = self.user_list.index(user_name)
                    count[userID] += 1
                else:
                    self.user_list.append(user_name)
                    userID = len(self.user_list)-1
                    count.append(1)

                if subject_zooniverse_id in self.subject_list:
                    subjectID = self.subject_list.index(subject_zooniverse_id)
                else:
                    self.subject_list.append(subject_zooniverse_id)
                    subjectID = len(self.subject_list)-1

                #which class does this classification count as?
                meet_required = [sorted(list(set(user_species_list).intersection(r))) == sorted(list(r)) for r in required_l]
                meet_prohibited = [tuple(set(user_species_list).intersection(p)) == () for p in prohibited_l]
                meet_overall = [r and p for (r, p) in zip(meet_required, meet_prohibited)]
                assert(sum([1. for o in meet_overall if o]) == 1)

                class_id = meet_overall.index(True)
                print(str(userID) + "," + str(subjectID) + "," + str(class_id), file=ibcc_input_file)

            ibcc_input_file.close()

            #now run IBCC
            ibcc.runIbcc(self.baseDir+"ibcc/"+str(counter)+"config.py")
            print(count)
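__runIBCC__ relies on a powerset() helper that is not shown in this snippet. A standard itertools-based version, together with the class-id selection that the required/prohibited lists perform above, might look like this (a sketch of the same logic, not the repo's code):

from itertools import chain, combinations

def powerset(iterable):
    s = list(iterable)
    return [list(c) for c in chain.from_iterable(combinations(s, n) for n in range(len(s) + 1))]

def class_id(species_group, user_species_list):
    # the class is the unique subset of species_group that the user reported:
    # every species in the subset is present and every other group member is absent
    seen = set(user_species_list).intersection(species_group)
    for idx, required in enumerate(powerset(species_group)):
        if seen == set(required):
            return idx
    raise ValueError("no matching subset - should be impossible")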
Example #15
    classifications.append((user_index, subject_index, blank))

print "====----"
print errorCount

try:
    os.remove("/home/greg/Databases/condor_ibcc.out")
except OSError:
    pass

try:
    os.remove("/home/greg/Databases/condor_ibcc.mat")
except OSError:
    pass

try:
    os.remove("/home/greg/Databases/condor_ibcc.csv.dat")
except OSError:
    pass

with open("/home/greg/Databases/condor_ibcc.csv", "wb") as f:
    f.write("a,b,c\n")

    for u, s, b in classifications:
        f.write(str(u) + "," + str(s) + "," + str(b) + "\n")

print datetime.datetime.time(datetime.datetime.now())
ibcc.runIbcc("/home/greg/Databases/condor_ibcc.py")
print datetime.datetime.time(datetime.datetime.now())

pickle.dump(subjects, open("/home/greg/Databases/condor_ibcc.pickle", "wb"))
Example #16
        else:
            userIndex = users.index(userName)

        if not(photoName in photos):
            photos.append(photoName)
            photoIndex = len(photos)- 1
        else:
            photoIndex = photos.index(photoName)

        if i in classification:
            print(str(userIndex)+","+str(photoIndex)+",1", file=f)
        else:
            print(str(userIndex)+","+str(photoIndex)+",0", file=f)

    f.close()
    ibcc.runIbcc(baseDir+"ibcc/"+str(i)+"config.py")

    #merge the results into the existing ones
    #assume all photos are now in the list - should be
    reader = csv.reader(open(baseDir+"ibcc/"+str(i)+".out","rU"), delimiter=" ")
    for photoIndex, neg, pos in reader:
        photoIndex = int(float(photoIndex))
        pos = float(pos)

        if len(ibccClassifications) < (photoIndex+1):
            ibccClassifications.append([])

        if pos > 0.5:
            #print(photoIndex,len(ibccClassifications))
            ibccClassifications[photoIndex].append(s)
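The block above turns the per-species IBCC output into photo-level labels: each row carries a photo index and the negative/positive probabilities, and the photo is credited with species s when the positive probability exceeds 0.5. A stand-alone sketch (names and the explicit threshold parameter are mine):

import csv

def photos_with_species(out_path, species, photo_species, threshold=0.5):
    # photo_species: list of species lists, indexed by photo index; grown as needed
    with open(out_path) as f:
        for row in csv.reader(f, delimiter=" "):
            if len(row) < 3:
                continue
            photo_index = int(float(row[0]))
            while len(photo_species) <= photo_index:
                photo_species.append([])
            if float(row[2]) > threshold:
                photo_species[photo_index].append(species)
    return photo_species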
Example #17
try:
    os.remove(base_directory + "/Databases/penguins_ibcc.out")
except OSError:
    pass

try:
    os.remove(base_directory + "/Databases/penguins_ibcc.mat")
except OSError:
    pass

try:
    os.remove(base_directory + "/Databases/penguins_ibcc.in.dat")
except OSError:
    pass

ibcc.runIbcc(base_directory + "/Databases/penguins_ibcc_config.py")

print "done that"

total = 0
true_positives = []
false_positives = []
with open(base_directory + "/Databases/penguins_ibcc.out", 'rb') as f:
    for l in f.readlines():
        penguin_index, neg_prob, pos_prob = l.split(" ")

        penguin = penguins[max_users][image_index][1][int(
            float(penguin_index))][0]

        #is this penguin "real", i.e. is it in the gold standard?
        if cluster_compare(gold_standard, [penguin]) == []:
Example #18
        else:
            userIndex = users.index(userName)

        if not (photoName in photos):
            photos.append(photoName)
            photoIndex = len(photos) - 1
        else:
            photoIndex = photos.index(photoName)

        if i in classification:
            print(str(userIndex) + "," + str(photoIndex) + ",1", file=f)
        else:
            print(str(userIndex) + "," + str(photoIndex) + ",0", file=f)

    f.close()
    ibcc.runIbcc(baseDir + "ibcc/" + str(i) + "config.py")

    #merge the results into the existing ones
    #assume all photos are now in the list - should be
    reader = csv.reader(open(baseDir + "ibcc/" + str(i) + ".out", "rU"),
                        delimiter=" ")
    for photoIndex, neg, pos in reader:
        photoIndex = int(float(photoIndex))
        pos = float(pos)

        if len(ibccClassifications) < (photoIndex + 1):
            ibccClassifications.append([])

        if pos > 0.5:
            #print(photoIndex,len(ibccClassifications))
            ibccClassifications[photoIndex].append(s)
Example #19
with open("/home/greg/Databases/galaxy_zoo_ibcc.py", "wb") as f:
    f.write("import numpy as np\n")
    f.write("scores = np.array([0,1,2])\n")
    f.write("nScores = len(scores)\n")
    f.write("nClasses = 3\n")
    f.write("inputFile = \"/home/greg/Databases/galaxy_zoo_ibcc.csv\"\n")
    f.write("outputFile = \"/home/greg/Databases/galaxy_zoo_ibcc.out\"\n")
    f.write("confMatFile = \"/home/greg/Databases/galaxy_zoo_ibcc.mat\"\n")
    f.write("nu0 = np.array([40,40,10])\n")
    f.write("alpha0 = np.array([[5, 2, 2], [2, 5, 2], [3, 3, 3]])\n")

try:
    os.remove("/home/greg/Databases/galaxy_zoo_ibcc.out")
except OSError:
    pass

try:
    os.remove("/home/greg/Databases/galaxy_zoo_ibcc.mat")
except OSError:
    pass

try:
    os.remove("/home/greg/Databases/galaxy_zoo_ibcc.csv.dat")
except OSError:
    pass

import datetime
print datetime.datetime.time(datetime.datetime.now())
ibcc.runIbcc("/home/greg/Databases/galaxy_zoo_ibcc.py")
print datetime.datetime.time(datetime.datetime.now())
Example #20
    os.remove(base_directory + "/Databases/condor_ibcc.out")
except OSError:
    pass

try:
    os.remove(base_directory + "/Databases/condor_ibcc.mat")
except OSError:
    pass

try:
    os.remove(base_directory + "/Databases/condor_ibcc.csv.dat")
except OSError:
    pass

# pickle.dump((big_subjectList,big_userList),open(base_directory+"/Databases/tempOut.pickle","wb"))
ibcc.runIbcc(base_directory + "/Databases/condor_ibcc.py")

# now analyze the data
# assume for starters that each image does not have a condor
X = []
Y = []
X_2 = []
Y_2 = []
contains_condor = {zooniverse_id: False for zooniverse_id in zooniverse_list}
condor_probabilities = {zooniverse_id: [] for zooniverse_id in zooniverse_list}
with open(base_directory + "/Databases/condor_ibcc.out", "rb") as f:
    ibcc_results = csv.reader(f, delimiter=" ")

    for row in ibcc_results:
        animal_index = int(float(row[0]))
        condor_p = float(row[2])
Example #21
print "====----"
print errorCount

try:
    os.remove("/home/greg/Databases/condor_ibcc.out")
except OSError:
    pass

try:
    os.remove("/home/greg/Databases/condor_ibcc.mat")
except OSError:
    pass

try:
    os.remove("/home/greg/Databases/condor_ibcc.csv.dat")
except OSError:
    pass

with open("/home/greg/Databases/condor_ibcc.csv", "wb") as f:
    f.write("a,b,c\n")

    for u, s, b in classifications:
        f.write(str(u) + "," + str(s) + "," + str(b) + "\n")


print datetime.datetime.time(datetime.datetime.now())
ibcc.runIbcc("/home/greg/Databases/condor_ibcc.py")
print datetime.datetime.time(datetime.datetime.now())

pickle.dump(subjects, open("/home/greg/Databases/condor_ibcc.pickle", "wb"))