Esempio n. 1
0
def get_master_impostors(id,nknown,problems,avg_len,opts=default_opts,sw=[],mode="test"):
    """Pick random impostor documents from problems other than ``id``.

    Parameters:
        id: identifier of the problem being processed. When ``mode`` starts
            with ``"test"`` the suffix ``"___"`` is appended before it is
            compared against the identifiers stored in ``problems``.
        nknown: the last ``nknown`` entries of ``problems`` are never used
            as candidates.
        problems: sequence of ``(problem_id, (ks, uks))`` entries. Only the
            first element of ``ks`` is read, as a ``(filename, doc)`` pair.
        avg_len: two-item sequence of slice limits. ``avg_len[1]`` truncates
            the first value returned by ``docread.tag`` and ``avg_len[0]``
            truncates the second one.
        opts: options object; ``nimpostors``, ``ndocs`` and ``language`` are
            read from it.
        sw: accepted for interface compatibility; not used by this function.
        mode: run mode, see ``id``.

    Returns:
        A list of ``opts.nimpostors * opts.ndocs`` tuples
        ``(doc[:avg_len[1]], text[:avg_len[0]])``.

    Raises:
        IndexError: if there are fewer candidate problems than impostor
            documents requested.
    """
    if mode.startswith("test"):
        id = id + "___"

    # Indexes (into `problems`) that may serve as impostors.
    candidates = [
        index
        for index, (problem_id, _) in enumerate(problems)
        if problem_id != id and index < len(problems) - nknown
    ]
    # Shuffle the candidate indexes themselves: the shuffled value is then a
    # valid index into `problems` that has passed the filter above (indexing
    # `problems` with a shuffled *position* would bypass the filter).
    random.shuffle(candidates)

    needed = opts.nimpostors * opts.ndocs
    if len(candidates) < needed:
        raise IndexError(
            "need {0} impostor candidates, only {1} available".format(
                needed, len(candidates)))

    master_impostors = []
    # Taking the first `needed` shuffled candidates gives every
    # (impostor, document) slot a distinct problem.
    for problem_index in candidates[:needed]:
        filename, doc = problems[problem_index][1][0][0]
        doc, text = docread.tag(filename, doc, opts.language)
        master_impostors.append((doc[:avg_len[1]], text[:avg_len[0]]))
    return master_impostors
Esempio n. 2
0
def get_master_impostors(id,
                         nimpostors,
                         ndocs,
                         nknown,
                         problems,
                         sw=[],
                         mode="test",
                         cutoff=0,
                         lang="en"):
    """Build representation dictionaries for randomly chosen impostors.

    For each of the ``nimpostors * ndocs`` slots a random candidate problem
    is chosen, and ``nknown`` consecutive problems starting at that one are
    each turned into a ``{repname: Counter}`` dictionary.

    Parameters:
        id: identifier of the problem being processed. When ``mode`` starts
            with ``"test"`` the suffix ``"___"`` is appended before it is
            compared against the identifiers stored in ``problems``.
        nimpostors: number of impostors to draw.
        ndocs: number of slots per impostor.
        nknown: number of consecutive problems read per slot; the last
            ``nknown`` problems are excluded from the candidates so that the
            look-ahead stays in range.
        problems: sequence of ``(problem_id, (ks, uks))`` entries. Only the
            first element of ``ks`` is read, as ``(filename, doc)``.
        sw: passed through to the representation functions as ``sw``.
        mode: run mode, see ``id``.
        cutoff: passed through to the representation functions as ``cutoff``.
        lang: language argument for ``docread.tag``.

    Returns:
        A list with ``nimpostors * ndocs * nknown`` dictionaries.

    Raises:
        IndexError: if there are fewer candidate problems than slots.

    Note:
        The representation names come from the module-level ``opts.reps``;
        ``opts`` is not a parameter of this function.
    """
    if mode.startswith("test"):
        id = id + "___"

    ids_candidates = [
        index
        for index, (id_, (ks, uks)) in enumerate(problems)
        if id != id_ and index < len(problems) - nknown
    ]
    random.shuffle(ids_candidates)

    slots = nimpostors * ndocs
    if len(ids_candidates) < slots:
        raise IndexError(
            "need {0} impostor candidates, only {1} available".format(
                slots, len(ids_candidates)))

    master_impostors = []
    for i in range(nimpostors):
        for j in range(ndocs):
            # Stride by ndocs so every (i, j) slot gets its own candidate.
            start = ids_candidates[i * ndocs + j]
            for k in range(nknown):
                first_known = problems[start + k][1][0][0]
                doc, text = docread.tag(first_known[0], first_known[1], lang)

                master_candidate = {}
                for repname in opts.reps:
                    try:
                        # getattr instead of exec: exec cannot create a
                        # function local on Python 3.
                        extract = getattr(docread, repname)
                        rep = extract(doc, text, cutoff=cutoff, sw=sw)
                    except Exception:
                        # Best effort: a failing representation contributes
                        # an empty count.
                        rep = Counter()
                    if repname in master_candidate:
                        master_candidate[repname].update(rep)
                    else:
                        master_candidate[repname] = Counter(rep)

                master_impostors.append(master_candidate)
    return master_impostors
Esempio n. 3
0
def process_corpus(problems,impostor_problems,opts=default_opts,mode="test",sw=[],verbose=lambda *a: None ):
    """Score every problem and print one ``id prob`` line per problem.

    Parameters:
        problems: sequence of ``(id, (ks, uks))`` entries, where ``ks`` and
            ``uks`` are iterables of ``(filename, doc)`` pairs. Truncated to
            the first ``opts.nmax`` entries when ``opts.nmax > 0``.
        impostor_problems: problem collection handed to
            ``get_master_impostors``.
        opts: options object; ``nmax``, ``dump``, ``iters``, ``language``,
            ``reps`` and ``cutoff`` are read here.
        mode: forwarded to ``get_master_impostors``.
        sw: forwarded to ``get_master_impostors`` and to the representation
            functions.
        verbose: callable used for progress messages.

    Side effects:
        Prints ``id prob`` to standard output for every problem. When
        ``opts.dump`` is true, also writes the running average after each
        iteration to ``answers_<iteration>.dump``.
    """
    if opts.nmax > 0:
        problems = problems[:opts.nmax]

    dumpfiles = []
    try:
        if opts.dump:
            for it in range(opts.iters):
                dumpfiles.append(open('answers_{0}.dump'.format(it), 'w'))

        for id, (ks, uks) in problems:
            verbose("Problem", id)

            # Unknown documents first, so row 0 of the stacked data below is
            # the unknown sample.
            masters = []
            for filename, doc in uks:
                masters.append(docread.tag(filename, doc, opts.language))
            for filename, doc in ks:
                masters.append(docread.tag(filename, doc, opts.language))

            # Average character and word counts over masters[1:]; integer
            # division because the values are used as slice limits.
            nrest = len(masters) - 1
            total_chars = sum(len(x[1]) for x in masters[1:])
            total_words = sum(len(x[1].split()) for x in masters[1:])
            avg_len = total_chars // nrest, total_words // nrest

            results = []
            for it in range(opts.iters):
                masters_ = list(masters)

                # Getting impostors
                master_impostors = get_master_impostors(
                    id, nrest, impostor_problems, avg_len,
                    opts=opts, mode=mode, sw=sw)
                masters_.extend(master_impostors)

                reps = []
                for repname in opts.reps:
                    # getattr instead of exec: exec cannot create a function
                    # local on Python 3.
                    rep = getattr(docread, repname)(
                        masters_, cutoff=opts.cutoff, sw=sw)
                    if isinstance(rep, scipy.sparse.csr_matrix):
                        rep = rep.toarray()
                    reps.append(rep)
                data = np.hstack(reps)
                unknown = data[0, :]
                data = data[1:, :]

                result = try_hbc(data, unknown, len(masters_) - 1, nrest, opts)
                results.append(result)

                if opts.dump:
                    prob = sum(results) / (it + 1)
                    # Scores close to 0.5 are reported as exactly 0.5.
                    if 0.45 < prob < 0.55:
                        prob = 0.5
                    dumpfiles[it].write("{0} {1}\n".format(id, prob))

            prob = sum(results) / opts.iters
            if 0.45 < prob < 0.55:
                prob = 0.5
            # Same "id prob" layout as the dump files.
            print("{0} {1}".format(id, prob))
    finally:
        # Close the dump files even when a problem raises.
        for f in dumpfiles:
            f.close()
Esempio n. 4
0
def _sparse_decision(example_vectors, unknown, n_groups, group_size):
    """Solve the sparse system once and return 1.0 or 0.0.

    ``example_vectors`` is treated as ``n_groups`` consecutive groups of
    ``group_size`` rows. The unknown vector is expressed as a combination of
    the examples with ``octave.SolveHomotopy``; then, per group, the
    coefficient mass and the reconstruction residual are computed.

    Returns 1.0 only when the concentration score ``sci`` is at least 0.1
    and the group with the smallest residual is the last one; 0.0 otherwise.
    (``sci`` is presumably the sparsity concentration index of
    sparse-representation classification - confirm against the reference
    used by the project.)

    Exceptions raised by the solver are not handled here.
    """
    A_ = np.matrix(example_vectors).T
    y_ = np.matrix(unknown).T
    nu = 0.0000001
    tol = 0.0000001
    stopCrit = 3
    x_0, nIter = octave.SolveHomotopy(A_, y_, 'lambda', nu,
                                      'tolerance', tol,
                                      'stoppingcriterion', stopCrit)

    residuals = []
    d_is = []
    for i in range(n_groups):
        lo = i * group_size
        hi = lo + group_size
        # Keep only the coefficients of group i, zero everywhere else.
        d_i = np.matrix([[0.0 for x in x_0[:lo]] +
                         [float(x) for x in x_0[lo:hi]] +
                         [0.0 for x in x_0[hi:]]]).T
        d_is.append(np.linalg.norm(d_i, ord=1))
        residuals.append(np.linalg.norm(y_ - A_ * d_i, ord=2))

    sci = (n_groups * np.max(d_is) / np.linalg.norm(x_0, ord=1) -
           1) / (n_groups - 1)
    scith = 0.1
    if sci < scith:
        return 0.0
    if np.argmin(residuals) == (n_groups - 1):
        return 1.0
    return 0.0


def process_corpus(problems, impostor_problems, opts, mode, sw):
    """Score every problem with the sparse-representation test.

    Parameters:
        problems: sequence of ``(id, (ks, uks))`` entries, where ``ks`` and
            ``uks`` are sequences of ``(filename, doc)`` pairs. Truncated to
            the first ``opts.nmax`` entries when ``opts.nmax > 0``.
        impostor_problems: problem collection handed to
            ``get_master_impostors``.
        opts: options object; ``nmax``, ``dump``, ``iters``, ``language``,
            ``reps``, ``cutoff``, ``nimpostors``, ``documents`` and
            ``percentage`` are read here.
        mode: forwarded to ``get_master_impostors``.
        sw: forwarded to ``get_master_impostors`` and to the representation
            functions.

    Side effects:
        Writes progress lines to standard error and one ``id score`` line
        per problem to standard output. When ``opts.dump`` is true, also
        writes the running score after each iteration to
        ``answers_<iteration>.dump``.
    """
    #Iterating over problems
    if opts.nmax > 0:
        problems = problems[:opts.nmax]

    dumpfiles = []
    try:
        if opts.dump:
            for it in range(opts.iters):
                dumpfiles.append(open('answers_{0}.dump'.format(it), 'w'))

        for id, (ks, uks) in problems:
            sys.stderr.write("Problem {0}\n".format(id))
            docs_author = []
            master_unknown = {}
            full_voca = {}

            # Known documents: keep one representation dict per document and
            # add every count to the shared vocabulary.
            for filename, raw in ks:
                doc, text = docread.tag(filename, raw, opts.language)
                doc_author = {}
                for repname in opts.reps:
                    # getattr instead of exec: exec cannot create a function
                    # local on Python 3.
                    rep = getattr(docread, repname)(
                        doc, text, cutoff=opts.cutoff, sw=sw)
                    doc_author[repname] = rep
                    full_voca.setdefault(repname, Counter()).update(rep)
                docs_author.append(doc_author)

            # Unknown documents: merged into a single representation dict.
            for filename, raw in uks:
                doc, text = docread.tag(filename, raw, opts.language)
                for repname in opts.reps:
                    try:
                        rep = getattr(docread, repname)(
                            doc, text, sw=sw, cutoff=opts.cutoff)
                    except Exception:
                        # Best effort: a failing representation contributes
                        # an empty count.
                        rep = Counter()
                    master_unknown.setdefault(repname, Counter()).update(rep)
                    full_voca.setdefault(repname, Counter()).update(rep)

            results = []
            iters = opts.iters
            # Rows per group in the example matrix.
            group_size = opts.documents * len(ks)

            for it in range(iters):
                sys.stderr.write("Iter {0}\n".format(it))

                #Extracting Examples
                # Impostor samples go first ...
                master_impostors = get_master_impostors(id,
                                                        opts.nimpostors,
                                                        opts.documents,
                                                        len(ks),
                                                        impostor_problems,
                                                        mode=mode,
                                                        sw=sw,
                                                        cutoff=opts.cutoff,
                                                        lang=opts.language)
                examples = []
                for master_impostor in master_impostors:
                    examples.append(
                        muestreo(master_impostor,
                                 opts.reps,
                                 percentage=opts.percentage))

                # ... and the author's samples form the last group.
                for j in range(opts.documents):
                    for doc_author in docs_author:
                        examples.append(
                            muestreo(doc_author,
                                     opts.reps,
                                     percentage=opts.percentage))

                sample_unknown = muestreo(master_unknown,
                                          opts.reps,
                                          percentage=1.0)

                # Sparse algorithm
                # Projecting examples into a vector
                example_vectors, unknown = project_into_vectors(
                    examples, full_voca, sample_unknown, opts.reps)

                # Number of groups = examples per group_size. The division
                # must be by the whole product documents * len(ks).
                n_groups = len(examples) // group_size

                # Up to five solver attempts per iteration.
                for attempt in range(5):
                    try:
                        decision = _sparse_decision(example_vectors, unknown,
                                                    n_groups, group_size)
                    except (Oct2PyError, TypeError):
                        continue
                    results.append(decision)
                    break
                else:
                    # All attempts failed.
                    # NOTE(review): this discards the answers collected so
                    # far for the problem, while later iterations keep
                    # appending - confirm this is intended.
                    results = [0.0 for _ in range(iters)]

                if opts.dump:
                    dumpfiles[it].write("{0} {1}\n".format(
                        id, sum(results) / (it + 1)))
            print("{0} {1}".format(id, sum(results) / iters))
    finally:
        # Close the dump files even when a problem raises.
        for f in dumpfiles:
            f.close()