Example #1
        ("nist", "f", "scores.NIST"),
        ("bleu", "f", "scores.BLEU"),        
        ("exp_name", "S128"),        
    ] 
    result_store = ResultsStore(descriptor, 
                                fname_prefix = "_" + name)
    
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            classifier=None,
            data=data,
            _lang=lang_pairs or config["eval"][data].keys(),
            _score_attr=("freq_score", "dup_score", "mup_score"),
            build=ex.SKIP,
            compute_classifier_score=ex.SKIP,
            write_text=ex.SKIP,
            write_diff=ex.SKIP,
            draw_graphs=ex.SKIP)
        
        for ns in exps: 
            result_store.append(ns)
                
  
if __name__ == "__main__":
    set_default_log(log_fname="_bounds.log")
    bounds()
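
The descriptor above appears to pair a field name with a numpy dtype code and, optionally, a dotted attribute path into the experiment namespace ("scores.NIST"). ResultsStore's real implementation is not shown here; a minimal sketch under that assumption:

import numpy as np

class MiniResultsStore(object):
    """Hypothetical stand-in for ResultsStore: one record per
    experiment namespace, kept in a numpy structured array."""

    def __init__(self, descriptor, fname_prefix=""):
        self.dtype = np.dtype([(d[0], d[1]) for d in descriptor])
        # a third tuple element, if present, is the attribute path;
        # otherwise the field name itself is used
        self.paths = [d[2] if len(d) > 2 else d[0] for d in descriptor]
        self.fname = fname_prefix + "results.npy"
        self.records = []

    def append(self, ns):
        rec = []
        for path in self.paths:
            obj = ns
            for attr in path.split("."):   # follow e.g. "scores.NIST"
                obj = getattr(obj, attr)
            rec.append(obj)
        self.records.append(tuple(rec))
        # persist after every experiment so partial runs are kept
        np.save(self.fname, np.array(self.records, dtype=self.dtype))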

Example #2
            #    continue
            
            classifier = MultinomialNB()
            models_fname = join(exp_dir, "nb_models.hdf5")
            
            builder = NBModelBuilder(tab_fname, samp_fname, models_fname,
                                     classifier, graphs_pkl_fname=graphs_pkl_fname,
                                     counts_pkl_fname=counts_pkl_fname,
                                     feat_selector=SelectKBest(chi2, k))
            builder.run()
            nist, bleu = score_model(lang_pair, exp_dir, draw=False)
            results[exp_count]["NIST"] = nist
            results[exp_count]["BLEU"] = bleu
            
            exp_count += 1
            
    results = results[:exp_count]
    print(results)
    results.dump("nb_feat_select_bst_results_2.pkl")
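
NBModelBuilder is project-specific, but the feature-selection step it is handed is plain scikit-learn: chi2 scores each feature against the class labels and SelectKBest keeps the k highest-scoring features. Roughly the same combination as a standard Pipeline (a sketch, not NBModelBuilder's actual internals):

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def make_nb_pipeline(k):
    # feature selection first, then the Naive Bayes model,
    # so the classifier only ever sees the k selected features
    return Pipeline([
        ("select", SelectKBest(chi2, k=k)),
        ("nb", MultinomialNB()),
    ])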
    
# for logging to stderr in UTF-8 use:
import logging

set_default_log(level=logging.INFO)
logging.getLogger("model").setLevel(logging.DEBUG)

run_all()
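
set_default_log is a project helper; with the standard library alone, roughly equivalent stderr logging could be set up as below (a sketch, not the helper's actual code; under Python 2 the stream can additionally be wrapped for UTF-8 output as the comment above suggests):

import logging
import sys

def setup_stderr_logging(level=logging.INFO):
    # under Python 2: stream = codecs.getwriter("utf-8")(sys.stderr)
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(
        logging.Formatter("%(name)s : %(levelname)s : %(message)s"))
    logging.root.addHandler(handler)
    logging.root.setLevel(level)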
Example #3
    default=True,
    action="store_false",
    help="count single word entries (default is True)")

parser.add_argument(
    "--with-multi-word",
    default=False,
    action="store_true",
    help="cont multi word entries (default is False)")

parser.add_argument(
    "-v", "--verbose",
    action="store_true")

args = parser.parse_args()

if args.verbose:
    set_default_log()

ambig_dist_report(lang_pairs=args.lang_pairs,
                  entry=args.entry,
                  with_single_word=args.with_single_word,
                  with_multi_word=args.with_multi_word)
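
The two flags above are the standard argparse idiom for on/off switches: a True default paired with action="store_false" (the flag turns the feature off) and a False default paired with action="store_true" (the flag turns it on). A self-contained sketch with hypothetical flag names:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--without-single-word", dest="with_single_word",
                    default=True, action="store_false",
                    help="skip single word entries (counted by default)")
parser.add_argument("--with-multi-word",
                    default=False, action="store_true",
                    help="count multi word entries (default is False)")

args = parser.parse_args(["--with-multi-word"])
assert args.with_single_word is True
assert args.with_multi_word is True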

Example #4
                                fname_prefix = "_" + name)
    # tricky: 'classifiers' cannot be an iterator
    # because it is called many times during grid_search
    classifiers = list(lr_classifier())
    
    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            _classifier=classifiers,
            data=data,
            _lang=config["eval"][data].keys(),
            #_lang=("de-en",),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            #build_models=lr_build_models,
            n_graphs=n_graphs,
        )
        
        for ns in exps: 
            result_store.append(ns)
                
  
if __name__ == "__main__":
    set_default_log(log_fname="_lr-1.log")
    lr_1(
        n_graphs=1
    )
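
The "cannot be an iterator" comment is worth spelling out: a generator is exhausted after one pass, so the second and later passes grid search makes over it would silently see no classifiers at all. Wrapping it in list() makes it safe to iterate repeatedly:

def variants():
    for alpha in (0.1, 1.0, 10.0):
        yield alpha   # stand-in for classifier objects

gen = variants()
print(len(list(gen)))  # 3
print(len(list(gen)))  # 0 -- the generator is used up

classifiers = list(variants())
print(len(list(classifiers)), len(list(classifiers)))  # 3 3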
Example #5
        samples, targets = shuffle(data.samples, data.targets)
        
        for classifier in classifiers:
            scorer = Scorer()
            cross_val_score(classifier, 
                            samples, 
                            targets,
                            scoring=scorer)  
            params = (lemma,
                      pos,  
                      n_cand,
                      classifier.alpha,
                      classifier.loss,
                      classifier.n_iter,
                      classifier.penalty)
            results[i] = params + tuple(scorer.mean_scores())
            i += 1
            np.save(results_fname, results[:i])
            text_table(results[:i], 
                       results_fname.replace(".npy", ".txt"))
            


if __name__ == "__main__":
    lang_pair = "de-en"
    set_default_log(log_fname="_sgd-cv-1_results_{}.log".format(lang_pair))
    results_fname = "_sgd-cv-1_results_{}.npy".format(lang_pair) 
    run_cv1(lang_pair, results_fname,
            #subset = {"anmelden/v*.full", "Magazin/n"}
            )
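
Scorer is a project class, but the mechanism it relies on is standard: scikit-learn accepts any callable(estimator, X, y) as the scoring argument, so a stateful object can record each fold's score as a side effect and report aggregates afterwards. A minimal sketch under that assumption:

import numpy as np
from sklearn.metrics import accuracy_score

class RecordingScorer(object):
    """Scoring callable that remembers every fold's score."""

    def __init__(self):
        self.scores = []

    def __call__(self, estimator, X, y):
        score = accuracy_score(y, estimator.predict(X))
        self.scores.append(score)
        return score

    def mean_scores(self):
        return (np.mean(self.scores),)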
Example #6
                               n_jobs=n_jobs)
    
    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            classifier=classifier,
            data=data,
            _lang=lang or config["eval"][data].keys(),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            n_graphs=n_graphs,
            # *** input to SGDClassifier must be shuffled! ***
            shuffle=True,
            _class_weighting=(True, False),
        )
        
        for ns in exps: 
            result_store.append(ns)
                
  
if __name__ == "__main__":
    set_default_log(log_fname="_sgd-1.log")
    sgd_1(
        name = "sgd-1",
        #n_graphs=2,
        #lang=("de-en",),
        n_jobs=10
    )
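
The shuffle warning reflects how stochastic gradient descent works: updates follow the sample order, so data grouped by class or by lemma biases the walk through parameter space. Two standard remedies in scikit-learn (make_classification only provides toy data here):

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle

X, y = make_classification(n_samples=200, random_state=0)

# remedy 1: shuffle the data once, reproducibly
X, y = shuffle(X, y, random_state=0)

# remedy 2: let the classifier reshuffle before each epoch
clf = SGDClassifier(shuffle=True, random_state=0)
clf.fit(X, y)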
Example #7
    ] 
    result_store = ResultsStore(descriptor, 
                                fname_prefix = "_" + name)
    classifiers = list(nc_classifier(
        # Contrary to docs, l1 distance (manhattan) does NOT support sparse input
        _metric=("cosine", "euclidean")))
    
    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            _classifier=classifiers,
            data=data,
            _lang=lang_pairs or config["eval"][data].keys(),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            n_graphs=n_graphs,
        )
        
        for ns in exps: 
            result_store.append(ns)
                
  
if __name__ == "__main__":
    set_default_log(log_fname="_nc_1.log")
    nc_1(
        data_sets = ("metis","presemt-dev"),
        #n_graphs=2,
    )
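
The metric comment in nc_1 is the kind of claim best verified empirically, since sparse support can differ from what the documentation says. A quick probe, assuming the classifier behind nc_classifier is sklearn's NearestCentroid:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestCentroid

X = csr_matrix(np.random.RandomState(0).rand(20, 5))
y = np.arange(20) % 2

for metric in ("euclidean", "manhattan"):
    try:
        NearestCentroid(metric=metric).fit(X, y)
        print(metric, "ok on sparse input")
    except (TypeError, ValueError) as e:
        print(metric, "failed:", e)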
Example #8
    
    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            _classifier=classifiers,
            data=data,
            _lang=config["eval"][data].keys(),
            #_lang=("de-en",),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            build_models=nb_build_model,
            vectorizer=vectorizer,
            thrash_models=ex.thrash_models,
            n_graphs=n_graphs,
        )
        
        for ns in exps: 
            result_store.append(ns)
                
  
if __name__ == "__main__":
    set_default_log(log_fname="_fs-2.log")
    fs_2(data_sets=(
        "metis", 
        "presemt-dev"
        ),
         #n_graphs=2
         )
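
A pattern running through all of these scripts: keyword arguments with a leading underscore (_classifier, _lang, _metric, _class_weighting) are the ones the experiment framework sweeps over, while plain arguments stay fixed. ex.single_exp's implementation is not shown; a minimal sketch of that convention:

from itertools import product

def expand_grid(**kwargs):
    """Yield one settings dict per combination of the _-prefixed
    kwargs; plain kwargs are copied into every combination."""
    fixed = dict((k, v) for k, v in kwargs.items() if not k.startswith("_"))
    grid = dict((k[1:], v) for k, v in kwargs.items() if k.startswith("_"))
    keys = sorted(grid)
    for values in product(*(grid[k] for k in keys)):
        settings = dict(fixed)
        settings.update(zip(keys, values))
        yield settings

# 2 languages x 2 weightings -> 4 experiment settings
for s in expand_grid(data="metis", _lang=("de-en", "en-de"),
                     _class_weighting=(True, False)):
    print(s)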
Example #9
                    # so assume boolean
                    em[i,j] = 1
                    
        if log.isEnabledFor(logging.DEBUG):
            log.debug(u"{0} ==> {1}".format(
                target_lemma,
                ", ".join([str((reverse_vocab[j], count)) for j, count in zip(em.rows[i], em.data[i])])))
                   
    log.info("converting to csr_matrix") 
    return em.tocsr()



if __name__ == "__main__":
    from tg.utils import set_default_log
    set_default_log(level=logging.DEBUG)
    
    from tg.config import config
    
    extend_samples(#samp_hdf_fname = "en_samples_filtered.hdf5", 
                   samp_hdf_fname = "en_samples_subset_filtered.hdf5", 
                   tdict_pkl_fname = config["dict"]["de-en"]["pkl_fname"],
                   reverse_tdict_pkl_fname = config["dict"]["en-de"]["pkl_fname"],
                   #ext_hdf_fname = "en_samples_filtered_extended.hdf5",
                   ##ext_hdf_fname = "en_samples_subset_filtered_extended.hdf5",
                   ext_hdf_fname = "ff.hdf5",
                   max_samp = 1,
                   )
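
The conversion at the end of the matrix-building function above follows the usual scipy pattern: lil_matrix is cheap to fill incrementally (its per-row .rows and .data lists are exactly what the debug message iterates over), while csr_matrix is the efficient format for later arithmetic. A tiny illustration:

import numpy as np
from scipy.sparse import lil_matrix

em = lil_matrix((3, 4), dtype=np.float32)
em[0, 1] = 1   # incremental writes are what lil is good at
em[2, 3] = 2
print(em.rows[0], em.data[0])  # [1] [1.0] -- row indices and values
em = em.tocsr()                # convert once, compute on csr
print(em.sum())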
    
    #extend_samples(#samp_hdf_fname = "de_samples_filtered.hdf5", 
                   #samp_hdf_fname = "de_samples_subset_filtered.hdf5",