def test_stemmed_all_bigrams(corpus_train_data, corpus_test_data, vocabulary_src,
                             with_stopwords_removal, use_chi_features, use_raw_tokens):
    from stemming_tokenizer import StemmingTokenizer
    from stemming_tokenizer import RawStemmingTokenizer
    from reuters_globals import *
    from reuters_vocabulary_loader import load_common_vocabulary

    max_ngram_size = 2

    # suffixes that encode the configuration in the vocabulary table names
    stopwords_pattern = '_stopwords' if with_stopwords_removal else ''
    chi_features_pattern = '_chi' if use_chi_features else ''
    if use_raw_tokens:
        raw_tokens_pattern = '_raw'
        tokenizer = RawStemmingTokenizer()
    else:
        raw_tokens_pattern = ''
        tokenizer = StemmingTokenizer()

    # load the vocabulary shared by the unigram and bigram tables
    vocabulary_tbl_name1 = 'reuters21578_{0}{1}_stems{2}_unigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df, min_tf)
    # note: the bigram table name now includes {2} (chi_features_pattern) to
    # mirror the unigram name; the argument was previously passed but unused
    vocabulary_tbl_name2 = 'reuters21578_{0}{1}_stems{2}_bigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df, min_tf)
    vocabulary_tbl_intersect = 'wiki_wiktionary_google_bigrams_vw'
    vocabulary = load_common_vocabulary(vocabulary_tbl_name1, vocabulary_tbl_name2,
                                        vocabulary_tbl_intersect, 'stem')

    # generate tf-idf vectors for the train and test corpora
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'],
                                                  tokenizer, vocabulary, max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                 tokenizer, vocabulary, max_ngram_size)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       test_set_size, max_labels)
    print vocabulary_tbl_name1, '^', vocabulary_tbl_name2, '^', vocabulary_tbl_intersect, \
        ' --> ', 'precision ', results['precision'], 'recall ', results['recall'], 'f1 ', results['f1']
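
# ---------------------------------------------------------------------------
# NOTE: vectorize_corpus() and classify() are assumed to be defined elsewhere
# in this module. As a rough illustration only, the sketch below shows how a
# vectorize_corpus() could be built on scikit-learn's TfidfVectorizer; the
# tokenizer interface (a .tokenize() method) and the parameter choices are
# assumptions, not the original implementation.
# ---------------------------------------------------------------------------
def _vectorize_corpus_sketch(corpus, tokenizer, vocabulary, max_ngram_size):
    from sklearn.feature_extraction.text import TfidfVectorizer
    # restrict features to the preloaded vocabulary and cap the n-gram size
    vectorizer = TfidfVectorizer(tokenizer=tokenizer.tokenize,
                                 vocabulary=vocabulary,
                                 ngram_range=(1, max_ngram_size))
    return vectorizer.fit_transform(corpus)
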
def build_stemmed_unigrams_stopwords_vocabulary(corpus, labels, stop_words, vocabulary_src):
    from clef_globals import *
    from clef_vocabulary_loader import load_vocabulary
    from stemming_tokenizer import StemmingTokenizer

    tokenizer = StemmingTokenizer()
    max_ngram_size = 1

    # load the initial (pre-chi-square) vocabulary
    initial_vocabulary_tbl_name = 'clef_2010_{0}_stems_unigrams_stopwords_df{1}_tf{2}'.format(
        vocabulary_src, min_df, min_tf)
    initial_vocabulary = load_vocabulary(initial_vocabulary_tbl_name)

    # filter the initial vocabulary with chi-square feature selection
    vocabulary = build_chi_vocabulary(corpus, labels, initial_vocabulary, tokenizer,
                                      stop_words, max_ngram_size, min_df, min_tf)

    # save to DB
    tbl_name = 'clef_2010_{0}_stems_chi_unigrams_stopwords_df{1}_tf{2}'.format(
        vocabulary_src, min_df, min_tf)
    save_vocabulary(vocabulary, tbl_name)
    print 'done ' + tbl_name
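
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): the in-memory
# corpus, label lists, stopword list, and the 'example_src' source name below
# are invented for illustration; the real pipeline reads these from the CLEF
# 2010 tables. build_chi_vocabulary() and save_vocabulary() are assumed to be
# defined elsewhere in this module.
# ---------------------------------------------------------------------------
def _build_vocabulary_example():
    corpus = ['first example document for the sketch',
              'second example document for the sketch']
    labels = [['label_a'], ['label_b']]
    stop_words = ['the', 'for']
    build_stemmed_unigrams_stopwords_vocabulary(corpus, labels, stop_words, 'example_src')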