def get_master_impostors(id, nknown, problems, avg_len, opts=default_opts,
                         sw=None, mode="test"):
    """Draw random impostor documents for one problem.

    Candidate problems are those whose id differs from `id` and whose
    position is below ``len(problems) - nknown``.  The candidates are
    shuffled and the first ``opts.nimpostors * opts.ndocs`` of them each
    contribute their first known document.

    Args:
        id: Identifier of the problem being processed.  When `mode` starts
            with ``"test"`` the suffix ``"___"`` is appended before it is
            compared against the ids in `problems`.
        nknown: Number of trailing entries of `problems` that are never
            eligible as candidates.
        problems: Sequence of ``(id, (ks, uks))`` entries, where ``ks`` is a
            sequence of ``(filename, doc)`` pairs.
        avg_len: Pair of limits; ``avg_len[0]`` truncates the second value
            returned by ``docread.tag`` and ``avg_len[1]`` truncates the
            first one.
        opts: Options object providing ``nimpostors``, ``ndocs`` and
            ``language``.
        sw: Unused by this function; kept for interface compatibility.
        mode: Run mode, see `id`.

    Returns:
        A list of ``(doc, text)`` tuples, both truncated as described above.

    Raises:
        IndexError: If there are fewer candidates than
            ``opts.nimpostors * opts.ndocs``.
    """
    if mode.startswith("test"):
        id = id + "___"

    last_allowed = len(problems) - nknown
    ids_candidates = [
        i for i, (id_, (_ks, _uks)) in enumerate(problems)
        if id != id_ and i < last_allowed
    ]
    # Shuffle a real list: shuffling a bare range() only works on Python 2.
    random.shuffle(ids_candidates)

    char_limit, word_limit = avg_len[0], avg_len[1]
    master_impostors = []
    # One flat walk over the shuffled candidates.  This is the
    # (impostor, document) grid laid out with stride `ndocs`, so no candidate
    # is drawn twice.  The previous stride was `nimpostors`, which repeated
    # candidates whenever ndocs > nimpostors.
    for slot in range(opts.nimpostors * opts.ndocs):
        # Go through ids_candidates so the id/tail filter above is honoured;
        # indexing `problems` with the shuffled position bypassed it.
        candidate = ids_candidates[slot]
        filename, doc = problems[candidate][1][0][0]
        doc, text = docread.tag(filename, doc, opts.language)
        master_impostors.append((doc[:word_limit], text[:char_limit]))
    return master_impostors
def get_master_impostors(id, nimpostors, ndocs, nknown, problems, sw=None,
                         mode="test", cutoff=0, lang="en"):
    """Build feature Counters for randomly drawn impostor documents.

    Candidate problems are those whose id differs from `id` and whose
    position is below ``len(problems) - nknown``.  After shuffling, the first
    ``nimpostors * ndocs`` candidates are used.  For each of them, the
    candidate and the ``nknown - 1`` problems that follow it in `problems`
    each contribute their first known document.

    NOTE: the representation names come from ``opts.reps``.  ``opts`` is not
    a parameter of this function, so it is resolved as a global at call time.

    Args:
        id: Identifier of the problem being processed.  When `mode` starts
            with ``"test"`` the suffix ``"___"`` is appended before it is
            compared against the ids in `problems`.
        nimpostors: Number of impostors to draw.
        ndocs: Number of candidate problems drawn per impostor.
        nknown: Number of consecutive problems read per drawn candidate; the
            same number of trailing problems is excluded from the candidates
            so that ``candidate + k`` stays in range.
        problems: Sequence of ``(id, (ks, uks))`` entries, where ``ks`` is a
            sequence of ``(filename, doc)`` pairs.
        sw: Value forwarded as ``sw=`` to every representation function;
            defaults to a fresh empty list.
        mode: Run mode, see `id`.
        cutoff: Value forwarded as ``cutoff=`` to every representation
            function.
        lang: Language passed to ``docread.tag``.

    Returns:
        A list with one dict per processed document, mapping each
        representation name to a ``Counter``.

    Raises:
        IndexError: If there are fewer candidates than ``nimpostors * ndocs``.
    """
    if sw is None:
        sw = []
    if mode.startswith("test"):
        id = id + "___"

    last_allowed = len(problems) - nknown
    ids_candidates = [
        i for i, (id_, (_ks, _uks)) in enumerate(problems)
        if id != id_ and i < last_allowed
    ]
    # Shuffle a real list: shuffling a bare range() only works on Python 2.
    random.shuffle(ids_candidates)

    master_impostors = []
    # Flat walk over the (impostor, document) grid with stride `ndocs`.  The
    # previous stride was `nimpostors`, which repeated candidates whenever
    # ndocs > nimpostors.
    for slot in range(nimpostors * ndocs):
        first = ids_candidates[slot]
        for k in range(nknown):
            first_known = problems[first + k][1][0][0]
            doc, text = docread.tag(first_known[0], first_known[1], lang)

            master_candidate = {}
            for repname in opts.reps:
                # Best effort, as before: a representation that is missing or
                # fails contributes an empty Counter instead of aborting.
                try:
                    extract = getattr(docread, repname)
                    rep = extract(doc, text, cutoff=cutoff, sw=sw)
                except Exception:
                    rep = Counter()
                if repname in master_candidate:
                    master_candidate[repname].update(rep)
                else:
                    master_candidate[repname] = Counter(rep)
            master_impostors.append(master_candidate)
    return master_impostors
def _snap_to_undecided(prob):
    """Return 0.5 when `prob` lies strictly between 0.45 and 0.55, else `prob`."""
    if 0.45 < prob < 0.55:
        return 0.5
    return prob


def process_corpus(problems, impostor_problems, opts=default_opts,
                   mode="test", sw=None, verbose=lambda *a: None):
    """Score every problem and print one ``<id> <probability>`` line each.

    For each problem the unknown documents are tagged first, then the known
    ones.  On every iteration a fresh set of impostors is appended, each
    representation in ``opts.reps`` is computed over all documents, the
    resulting matrices are stacked column-wise and handed to ``try_hbc``.
    The printed probability is the mean of the per-iteration results, snapped
    to 0.5 when it falls strictly between 0.45 and 0.55.

    Args:
        problems: Sequence of ``(id, (ks, uks))`` entries; ``ks`` and ``uks``
            are sequences of ``(filename, doc)`` pairs.
        impostor_problems: Problems handed to ``get_master_impostors``.
        opts: Options object providing ``nmax``, ``dump``, ``iters``,
            ``language``, ``reps`` and ``cutoff`` (and whatever
            ``get_master_impostors`` / ``try_hbc`` read from it).
        mode: Run mode forwarded to ``get_master_impostors``.
        sw: Value forwarded as ``sw=`` to the representation functions and to
            ``get_master_impostors``; defaults to a fresh empty list.
        verbose: Callable used for progress messages.

    Side effects:
        When ``opts.dump`` is true, writes the running probability after
        iteration *i* to ``answers_<i>.dump``.  All dump files are closed
        even if processing raises.
    """
    if sw is None:
        sw = []
    if opts.nmax > 0:
        problems = problems[:opts.nmax]

    dumpfiles = []
    try:
        if opts.dump:
            for it in range(opts.iters):
                dumpfiles.append(open('answers_{0}.dump'.format(it), 'w'))

        for problem_id, (ks, uks) in problems:
            verbose("Problem", problem_id)

            # Row 0 of the stacked matrix is later treated as the unknown, so
            # the unknown documents must come first.
            # NOTE(review): the "- 1" below assumes exactly one unknown
            # document per problem -- confirm against the corpus reader.
            masters = [docread.tag(filename, doc, opts.language)
                       for filename, doc in uks]
            masters.extend(docread.tag(filename, doc, opts.language)
                           for filename, doc in ks)
            nknown = len(masters) - 1

            # Both averages are taken over the known documents.  The word
            # count used to be taken over masters[:1] (the unknown) while
            # still being divided by the number of known documents.
            # Floor division keeps the values usable as slice bounds on
            # Python 3 as well.
            known = masters[1:]
            avg_chars = sum(len(m[1]) for m in known) // nknown
            avg_words = sum(len(m[1].split()) for m in known) // nknown
            avg_len = avg_chars, avg_words

            results = []
            for it in range(opts.iters):
                masters_ = list(masters)
                masters_.extend(get_master_impostors(
                    problem_id, nknown, impostor_problems, avg_len,
                    opts=opts, mode=mode, sw=sw))

                reps = []
                for repname in opts.reps:
                    extract = getattr(docread, repname)
                    rep = extract(masters_, cutoff=opts.cutoff, sw=sw)
                    # np.hstack needs dense blocks.
                    if scipy.sparse.issparse(rep):
                        rep = rep.toarray()
                    reps.append(rep)

                data = np.hstack(reps)
                unknown = data[0, :]
                data = data[1:, :]
                results.append(
                    try_hbc(data, unknown, len(masters_) - 1, nknown, opts))

                if opts.dump:
                    running = _snap_to_undecided(sum(results) / (it + 1))
                    dumpfiles[it].write(
                        "{0} {1}\n".format(problem_id, running))

            prob = _snap_to_undecided(sum(results) / opts.iters)
            # Single-argument print gives the same "<id> <prob>" line on
            # Python 2 and 3; print(id, prob) printed a tuple on Python 2.
            print("{0} {1}".format(problem_id, prob))
    finally:
        for handle in dumpfiles:
            handle.close()
def _merge_counts(store, repname, rep):
    """Add the counts in `rep` to ``store[repname]``, creating it on first use."""
    if repname in store:
        store[repname].update(rep)
    else:
        store[repname] = Counter(rep)


def _sparse_vote(example_vectors, unknown, nclasses, class_size):
    """Return 1.0 if the unknown is attributed to the last class, else 0.0.

    The examples are the columns of ``A`` and the unknown is ``y``;
    ``octave.SolveHomotopy`` yields a coefficient vector ``x_0``.  The
    coefficients are split into `nclasses` consecutive groups of `class_size`
    and each group's reconstruction residual ``||y - A d_i||_2`` is computed.
    The vote is 1.0 only when the concentration index is at least 0.1 and the
    smallest residual belongs to the last group.

    May raise ``Oct2PyError`` or ``TypeError``; the caller retries on both.
    """
    A_ = np.matrix(example_vectors).T
    y_ = np.matrix(unknown).T
    nu = 0.0000001
    tol = 0.0000001
    stop_crit = 3
    x_0, _n_iter = octave.SolveHomotopy(
        A_, y_, 'lambda', nu, 'tolerance', tol,
        'stoppingcriterion', stop_crit)

    residuals = []
    d_is = []
    for i in range(nclasses):
        lo = i * class_size
        hi = lo + class_size
        # Keep only this group's coefficients and zero out all the others.
        d_i = np.matrix([[float(x) if lo <= pos < hi else 0.0
                          for pos, x in enumerate(x_0)]]).T
        d_is.append(np.linalg.norm(d_i, ord=1))
        residuals.append(np.linalg.norm(y_ - A_ * d_i, ord=2))

    # Share of the coefficient mass held by the strongest group, rescaled by
    # the number of groups.
    sci = ((nclasses * np.max(d_is) / np.linalg.norm(x_0, ord=1) - 1)
           / (nclasses - 1))
    scith = 0.1
    if sci < scith:
        return 0.0
    # The author's examples are appended after the impostors, so the author
    # is the last group.
    if np.argmin(residuals) == nclasses - 1:
        return 1.0
    return 0.0


def process_corpus(problems, impostor_problems, opts, mode, sw):
    """Score every problem and print one ``<id> <score>`` line each.

    For each problem, every representation in ``opts.reps`` is computed for
    the known and unknown documents.  On each iteration impostor examples and
    author examples are sampled with ``muestreo``, projected with
    ``project_into_vectors`` and classified by `_sparse_vote`.  The printed
    score is ``sum(results) / opts.iters``.

    Args:
        problems: Sequence of ``(id, (ks, uks))`` entries; ``ks`` and ``uks``
            are sequences of ``(filename, doc)`` pairs.
        impostor_problems: Problems handed to ``get_master_impostors``.
        opts: Options object providing ``nmax``, ``dump``, ``iters``,
            ``language``, ``reps``, ``cutoff``, ``nimpostors``, ``documents``
            and ``percentage``.
        mode: Run mode forwarded to ``get_master_impostors``.
        sw: Value forwarded as ``sw=`` to the representation functions and to
            ``get_master_impostors``.

    Side effects:
        Progress lines go to stderr.  When ``opts.dump`` is true, the running
        score after iteration *i* is written to ``answers_<i>.dump``.  All
        dump files are closed even if processing raises.
    """
    if opts.nmax > 0:
        problems = problems[:opts.nmax]

    max_attempts = 5
    dumpfiles = []
    try:
        if opts.dump:
            for it in range(opts.iters):
                dumpfiles.append(open('answers_{0}.dump'.format(it), 'w'))

        for problem_id, (ks, uks) in problems:
            sys.stderr.write("Problem {0}\n".format(problem_id))
            docs_author = []
            master_unknown = {}
            full_voca = {}

            # Known documents: failures here propagate (no fallback).
            for filename, doc in ks:
                doc, text = docread.tag(filename, doc, opts.language)
                doc_author = {}
                for repname in opts.reps:
                    extract = getattr(docread, repname)
                    rep = extract(doc, text, cutoff=opts.cutoff, sw=sw)
                    doc_author[repname] = rep
                    _merge_counts(full_voca, repname, rep)
                docs_author.append(doc_author)

            # Unknown documents: best effort, a failing representation
            # contributes an empty Counter.
            for filename, doc in uks:
                doc, text = docread.tag(filename, doc, opts.language)
                for repname in opts.reps:
                    try:
                        extract = getattr(docread, repname)
                        rep = extract(doc, text, sw=sw, cutoff=opts.cutoff)
                    except Exception:
                        rep = Counter()
                    _merge_counts(master_unknown, repname, rep)
                    _merge_counts(full_voca, repname, rep)

            results = []
            iters = opts.iters
            class_size = opts.documents * len(ks)
            for it in range(iters):
                sys.stderr.write("Iter {0}\n".format(it))

                # Impostor examples first, author examples last.
                master_impostors = get_master_impostors(
                    problem_id, opts.nimpostors, opts.documents, len(ks),
                    impostor_problems, mode=mode, sw=sw,
                    cutoff=opts.cutoff, lang=opts.language)
                examples = [
                    muestreo(master_impostor, opts.reps,
                             percentage=opts.percentage)
                    for master_impostor in master_impostors
                ]
                for _ in range(opts.documents):
                    for doc_author in docs_author[:len(ks)]:
                        examples.append(
                            muestreo(doc_author, opts.reps,
                                     percentage=opts.percentage))

                sample_unknown = muestreo(master_unknown, opts.reps,
                                          percentage=1.0)
                example_vectors, unknown = project_into_vectors(
                    examples, full_voca, sample_unknown, opts.reps)

                # Number of groups = examples / group size.  The previous
                # expression, len(examples) / len(ks) * opts.documents,
                # multiplied by `documents` instead of dividing, which is
                # only right when opts.documents == 1.
                nclasses = len(examples) // class_size

                for _attempt in range(max_attempts):
                    try:
                        vote = _sparse_vote(example_vectors, unknown,
                                            nclasses, class_size)
                    except (Oct2PyError, TypeError):
                        continue
                    results.append(vote)
                    break
                else:
                    # NOTE(review): kept as in the original -- after five
                    # failed attempts ALL results gathered so far for this
                    # problem are replaced by zeros, and later iterations
                    # still append to that list.  Confirm this is intended.
                    results = [0.0 for _ in range(iters)]

                if opts.dump:
                    dumpfiles[it].write("{0} {1}\n".format(
                        problem_id, sum(results) / (it + 1)))

            print("{0} {1}".format(problem_id, sum(results) / iters))
    finally:
        for handle in dumpfiles:
            handle.close()