def init_ground(self):
    """
    Retrain on the current training data and score the test set.

    The train_* and pol_* sets are merged with h.compose, None entries are
    stripped, and a model is fitted with svm.train. The labels and values
    returned by svm.predict are stored on the instance, and the first
    element of the returned accuracy becomes self.current_detection_rate.
    """
    merged_y, merged_x = h.compose(self.train_y, self.train_x,
                                   self.pol_y, self.pol_x)
    # h.strip drops the None placeholders before training
    model = svm.train(h.strip(merged_y), h.strip(merged_x), self.params)

    labels, accuracy, values = svm.predict(self.test_y, self.test_x, model)
    self.p_label = labels
    self.p_val = h.delist(values)
    self.current_detection_rate = accuracy[0]
def __init__(self, s, sett): self.sett = sett self.s = s self.words = h.strip(s).split() self.score = sett.calcScore(s) self.count = len(sett.canRegen) self.children = [None] * len(sett.canRegen) print "--Created node [", s, "]", self.score
def __init__(self, s, sett):
    """
    Create an unscored node for the story string `s`.

    If tokenising the story fails for any reason, the node is flagged with
    isbad = True and `words` is left unset. The score starts as None;
    sett.calcScore is not run here.
    """
    self.s, self.sett = s, sett
    try:
        # fixes unicode characters trying to sneak through; see
        # https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
        tokens = h.strip(s).split()
    except Exception:
        self.isbad = True
    else:
        self.isbad = False
        self.words = tokens
    self.score = None
def oldeval(s): global temp if temp: return None s = h.strip(s) words = s.split() i = random.choice([0,1,2,5]) #TODO add 4 ret = words print "EVAL REPLACE:",i,ret[i] ret[i] = None temp = True return ret
def parseValueFormat(self, value):
    """
    Parse "<number> <metric prefix>" text into a scaled float.

    The numeric part (regex group 1) is multiplied by the factor that
    METRIC_PREFIX_VALUES holds for the optional single-letter prefix
    (regex group 2).

    Returns the scaled float. Returns None after reporting through
    helpers.eclbPrint when the text does not start with a number or the
    prefix is not a key of METRIC_PREFIX_VALUES.
    """
    value = helpers.strip(value)
    parsed = match(r'([0-9]+\.?[0-9]*)\s?([a-zA-Z]?)', value)
    try:
        # parsed is None when the pattern did not match -> AttributeError
        magnitude = float(parsed.group(1))
        multiplier = METRIC_PREFIX_VALUES[parsed.group(2)]
    except AttributeError:
        message = "Invalid value for the input '{0}'. Input should be of the form <number> <metric prefix> where the number is a float or integer. Quitting now."
        helpers.eclbPrint(message.format(value))
        return None
    except KeyError:
        message = "Invalid metric prefix for the input '{0}'. Input should be of the form <number> <metric prefix> where the prefix is in the list {1}. Quitting now."
        known_prefixes = list(METRIC_PREFIX_VALUES)
        helpers.eclbPrint(message.format(value, known_prefixes))
        return None
    return float(magnitude * multiplier)
def parseValueFormat(self, value):
    """
    Turn text of the form "<number> <metric prefix>" into a float.

    Group 1 of the pattern is the integer/floating portion; group 2 is an
    optional one-letter metric prefix looked up in METRIC_PREFIX_VALUES.
    The result is number * prefix factor.

    On a non-matching input or an unknown prefix, a message is sent to
    helpers.eclbPrint and None is returned.
    """
    value = helpers.strip(value)
    hit = match(r'([0-9]+\.?[0-9]*)\s?([a-zA-Z]?)', value)
    try:
        # a failed match leaves hit as None, surfacing as AttributeError below
        number = float(hit.group(1))
        factor = METRIC_PREFIX_VALUES[hit.group(2)]
    except AttributeError:
        template = "Invalid value for the input '{0}'. Input should be of the form <number> <metric prefix> where the number is a float or integer. Quitting now."
        helpers.eclbPrint(template.format(value))
        return None
    except KeyError:
        template = "Invalid metric prefix for the input '{0}'. Input should be of the form <number> <metric prefix> where the prefix is in the list {1}. Quitting now."
        helpers.eclbPrint(template.format(value, list(METRIC_PREFIX_VALUES)))
        return None
    else:
        return float(number * factor)
def cluster_remaining(au, working_set): """ This function is called if weighted_initial returns NO_CENTROIDS, meaning there are no more misabeled emails to use as centers. The remaining emails in the working set are then returned as one cluster. """ print "No more cluster centroids, grouping all remaining emails into one cluster" first_state_rate = au.current_detection_rate size = len(h.strip(working_set[0] + working_set[2])) # get number of remaining emails init_email = None init_pos = None label = None data_y, data_x = h.compose_set(working_set) for i,l in enumerate(data_y): # loop to find first email that is not none if l is not None: label = l init_pos = i init_email = data_x[i] center = (init_email, init_pos) cluster = Cluster(center, size, au, label, au.distance_opt, working_set=working_set) au.unlearn(cluster) au.init_ground() new_detection_rate = au.current_detection_rate au.learn(cluster) # relearn cluster in real training space so deltas of future cluster are not influenced second_state_rate = au.current_detection_rate net_rate_change = second_state_rate - first_state_rate au.current_detection_rate = first_state_rate assert(au.current_detection_rate == first_state_rate), str(au.current_detection_rate) + " " + str(first_state_rate) print "clustered remaining with a net rate change of ", second_state_rate, " - ", first_state_rate, " = ", net_rate_change return net_rate_change, cluster
def doit(topic,noun,w2v,pens,retries=0): #if not stanford.check(): # print "START THE SERVER" # raw_input('Press Enter...') f = random.choice(formats) form = f[0] axis = f[1] canRegen = f[2] s = form(topic,noun,w2v) regenf = lambda lock: form(topic,noun,w2v,lock) scoref = lambda x: h.getSkipScores(axis[0],axis[1][0],axis[1][1],x,pens) if s is None or isBad(h.getV(s)): if retries > 20: return None print "RETRYING" return doit(topic,noun,w2v,pens,retries+1) else: #instead of just randomly genning one story, randomly gen one for each verb (species) to get started? best = priority.best(s,regenf,canRegen,scoref)[0] raw = h.strip(best).split()[:3] notraw = best.split() best = ". ".join([h.firstCharUp(h.makePlural(r)) for r in raw])+". "+" ".join(notraw[3:]) print best,"\n" return best
def getIndex(story, i):
    """Return the i-th single-space-delimited piece of `story`, passed through h.strip."""
    pieces = story.split(' ')
    token = pieces[i]
    return h.strip(token)
def __init__(self, s, sett):
    """
    Create an unscored node for the story string `s`.

    Stores the story and its settings object, tokenises the h.strip'ed
    story into `words`, and leaves `score` as None (sett.calcScore is not
    run at construction).
    """
    self.s, self.sett = s, sett
    tokens = h.strip(s).split()
    self.words = tokens
    self.score = None
if c == p and i != j: interpos[i].append(j) genss = [] finalgarbs = [] scoress = [] badis = [] import matplotlib.pyplot as plt for i, g in enumerate(garbs): if not g: genss.append(None) continue grb = g[:] f = formats[i] axes = f[1] gs = [h.strip(' '.join(grb))] ss = h.strip(random.choice(newmicro.doit(formats, w2v, pens, f, False))[0]) genss.append(ss) for j, w in enumerate( ss.split(' ') ): #(f['words']): #use a generated story instead of f['words'] to avoid bias toward the axis story itself grb[j] = w gs.append(h.strip(' '.join(grb))) scs = h.getSkipScores(axes[0], axes[1], axes[2], gs, pens) finalgarbs.append(gs) scoress.append(scs) prev = -10000 for s in scs: if s < prev: print i, f[3]['raw'], axes, "\n", [(gs[i], scs[i]) for i in range(len(gs))], "\n"
def makeFormats(w2v, pens, bestaxes=True, w2vmax=30, w2vmin=10, backoff=False, verbgen=False):
    """
    Build the list of story formats together with their scoring axes.

    Each returned entry is a list
        [genf, axes, regen, fraw]
    where genf(lock) calls gen() for that raw format, axes starts out as
    [badstory, goodstory, goodstory, True], regen is the list of indices 0-5
    minus the root's own index, and fraw is the raw format dict produced by
    formatssw.makeAllRawForms().

    w2v, pens: passed through to processPOS / gen / testaxes.
    bestaxes:  when False, the third axis of every format is replaced by a
               random same-POS format's story (or a duplicate of the second
               axis when there is none). When True, axes are chosen per group
               of same-POS formats using pair scores cached in the file
               'axesscores'.
    w2vmax, w2vmin, verbgen: forwarded unchanged to gen().
    backoff:   when True, prefer returning only formats whose own story is not
               among their axes, then those whose story is not the third axis,
               and only then the full list.

    NOTE(review): relies on Python 2 semantics (print statement, range()
    returning a list that supports del).
    """
    ret = []
    ex = 0  # number of raw formats rejected below
    seen = set()  # raw strings already handled, to drop duplicates
    for fraw in formatssw.makeAllRawForms():
        if fraw['raw'] in seen:
            continue
        seen.add(fraw['raw'])
        s = allIndices(fraw['root'])
        # keep only formats that cover exactly the six indices and pass checkChars
        if s != set([0, 1, 2, 3, 4, 5]) or not checkChars(fraw['plug']):
            #print "SKIP:", fraw['raw'], s
            ex += 1
            continue
        processPOS(fraw['root'], w2v)  #Preprocess each node by checking whether word_pos is in w2v and massage them if possible
        # default arguments freeze the current loop values inside the lambda
        genf = lambda lock, fraw=fraw, w2v=w2v, w2vmax=w2vmax, w2vmin=w2vmin, verbgen=verbgen: gen(fraw, w2v, lock, w2vmax, w2vmin, verbgen)
        regen = range(6)
        del regen[fraw['root']['index']]  # every index except the root's own
        goodstory = h.strip(" ".join(fraw['words']))
        ret.append([genf, [badstory, goodstory, goodstory, True], regen, fraw])
    if ex:
        # NOTE(review): divides by len(ret); raises ZeroDivisionError if every
        # format was excluded.
        print "Number of excluded (bad) formats:", ex, "(%d total, %f%%)" % (len(ret), (float(ex) / len(ret) * 100))
    # one POS signature string per kept format
    poss = []
    for tup in ret:
        f = tup[3]
        poss.append(''.join(posListRec(f['root'], [None, None, None, None, None, None])))
    interpos = defaultdict(list)  #dictionary of format index to list of other indices that have same POS
    for i, c in enumerate(poss):
        for j, p in enumerate(poss):
            if c == p and i != j:
                interpos[i].append(j)
    if not bestaxes:
        for i, tup in enumerate(ret):
            sames = interpos[i]
            otheraxis = None
            axes = tup[1]
            if len(sames) < 1:
                otheraxis = axes[1]  #duplicate single good axis
            else:
                otheraxis = h.strip(ret[random.choice(sames)][3]['raw'])
            axes[2] = otheraxis  # mutates the axes list stored in ret in place
    else:
        #==========
        # calculated best axes for each cluster of 3+ stories (or read from file if stored there)
        # all 1- or 2-cluster formats will get 1 or 2 different axes, respectively, and be flagged (axes[3] == True) that they need the "10-20% cutoff" instead
        # group the indices in interpos into disjoint same-POS sets
        possets = []
        for k in interpos:
            found = False
            for s in possets:
                if k in s:
                    found = True
                    break
            if found:
                continue
            possets.append(set(interpos[k] + [k]))
        scoresfn = 'axesscores'
        axscores = {}
        # cache file: one "<story1>; <story2>\t<score>" entry per line
        # NOTE(review): opened for reading unconditionally, so the file must
        # already exist.
        with open(scoresfn, 'r') as f:
            for line in f:
                line = line.strip()
                parts = line.split('\t')
                axscores[parts[0]] = float(parts[1])
        for interis in possets:
            if len(interis) == 2:
                # a pair: both members get both stories as axes, flag kept True
                newaxes = [getstory(j, ret) for j in interis]
                for i in interis:
                    ret[i][1] = ret[i][1][:1] + newaxes + [True]  #note: difference between l[:1] and 1[0] is that the former returns a list!
                continue
            #else: use non-exemplar best axes
            candidates = {}
            for ai1, ai2 in combinations(interis, 2):
                k = getstory(ai1, ret) + "; " + getstory(ai2, ret)
                v = 0
                if k in axscores:
                    v = axscores[k]
                else:
                    v = testaxes(ai1, ai2, interis, pens, ret)
                    axscores[k] = v  #for posterity
                candidates[k] = v
            # pair keys ordered from highest to lowest score
            best = sorted(candidates.keys(), key=lambda k: candidates[k], reverse=True)
            for i in interis:
                exemplar = getstory(i, ret)
                besti = 0
                # NOTE(review): substring test on the "a; b" key; raises
                # IndexError if every candidate key contains the exemplar.
                while exemplar in best[besti]:  #pick the best axes that don't include the format's exemplar (avoid plagiarism)
                    besti += 1
                newaxes = best[besti].split('; ')
                ret[i][1] = ret[i][1][:1] + newaxes  # no trailing flag for 3+ groups
        # write the (possibly extended) score cache back to disk
        with open(scoresfn, 'w') as fout:
            for k in axscores:
                fout.write(k + "\t" + str(axscores[k]) + "\n")
        #==========
    if backoff:
        bests = []    # formats whose own story is not among their axes
        partial = []  # formats whose own story is an axis, but not the third one
        for f in ret:
            s = h.strip(f[3]['raw'])
            if s not in f[1]:
                bests.append(f)
            elif s != f[1][2]:
                partial.append(f)
        if bests:
            return bests
        if partial:
            return partial
    return ret
def getstory(i, fmts):
    """Return the h.strip'ed 'raw' story of the i-th format in `fmts` (item 3 of each entry holds the raw format dict)."""
    raw_format = fmts[i][3]
    return h.strip(raw_format['raw'])
def getstory(i):
    """Return the h.strip'ed 'raw' story of the i-th entry of the module-level `formats`."""
    raw_format = formats[i][3]
    return h.strip(raw_format['raw'])
def cluster_au(au, gold=True): """Clusters the training space of an ActiveUnlearner and returns the list of clusters.""" print "\n----------------------Beginning the Clustering Process-----------------------\n" cluster_list = [] # list of tuples (net_rate_change, cluster) train_y = copy.deepcopy(au.train_y) train_x = copy.deepcopy(au.train_x) pol_y = copy.deepcopy(au.pol_y) pol_x = copy.deepcopy(au.pol_x) training = [train_y, train_x, pol_y, pol_x] # create the working set original_training_size = len(h.strip(pol_y)) + len(h.strip(train_y)) print "\nResetting mislabeled...\n" mislabeled = au.get_mislabeled(update=True) # gets an array of all false positives, false negatives au.mislabeled_chosen = [] # reset set of clustered mislabeled emails in this instance of au print "\n Clustering...\n" pre_cluster_rate = au.current_detection_rate training_size = len(h.strip(pol_y)) + len(h.strip(train_y)) while training_size > 0: # loop until all emails in phantom training space have been assigned print "\n-----------------------------------------------------\n" print "\n" + str(training_size) + " emails out of " + str(original_training_size) + \ " still unclustered.\n" # Choose an arbitrary email from the mislabeled emails and returns the training email closest to it. # Final call and source of current_seed is mislabeled_initial() function # current_seed = cluster_methods(au, "mislabeled", training, mislabeled) current_seed = None label = None while current_seed is None: label, init_pos, current_seed = au.select_initial(mislabeled, "weighted", training) if str(current_seed) == 'NO_CENTROIDS': cluster_result = cluster_remaining(au, training) else: cluster_result = determine_cluster(current_seed, au, label, init_pos, working_set=training, gold=gold) # if true, relearn clusters after returning them if cluster_result is None: print "!!!How did this happen?????" 
sys.exit(cluster_result) net_rate_change, cluster = cluster_result # After getting the cluster and net_rate_change, you relearn the cluster in original dataset if impact=True post_cluster_rate = au.current_detection_rate # make sure the cluster was properly relearned # assert(post_cluster_rate == pre_cluster_rate), str(pre_cluster_rate) + " " + str(post_cluster_rate) # print "cluster relearned successfully: au detection rate back to ", post_cluster_rate cluster_list.append([net_rate_change, cluster]) print "\nRemoving cluster from shuffled training set...\n" h.unlearn(training, cluster.cluster_set) training_size = len(h.strip(pol_y)) + len(h.strip(train_y)) cluster_list.sort() # sorts by net_rate_change print "\nClustering process done and sorted.\n" return cluster_list
def getindex(s):
    """Return the index of the first entry of the module-level `formats` whose h.strip'ed 'raw' story equals `s`, or None when there is no such entry."""
    hits = (pos for pos, fmt in enumerate(formats) if h.strip(fmt[3]['raw']) == s)
    return next(hits, None)