def test_lemmatized_bigrams_with_LSA(corpus_train_data, corpus_test_data, vocabulary_src,
                                     with_stopwords_removal, use_chi_features,
                                     use_raw_tokens, num_components):
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from clef_globals import min_df, min_tf, test_set_size, max_labels
    from clef_vocabulary_loader import load_vocabulary
    from sklearn.decomposition import TruncatedSVD

    max_ngram_size = 2

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'
    if use_raw_tokens == False:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'clef_2010_{0}{1}_lemmas{2}_bigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern,
        stopwords_pattern, min_df, min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'],
                                                  tokenizer, vocabulary, max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                 tokenizer, vocabulary, max_ngram_size)

    # apply LSA: fit on the training vectors only, then project train and test
    lsa = TruncatedSVD(n_components=num_components)
    lsa.fit(corpus_train_tfidf_vectors)
    corpus_train_tfidf_vectors = lsa.transform(corpus_train_tfidf_vectors)
    corpus_test_tfidf_vectors = lsa.transform(corpus_test_tfidf_vectors)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       test_set_size, max_labels)
    print 'LSA ^', vocabulary_tbl_name, ' --> ', 'precision ', results['precision'], 'recall ', results['recall'], 'f1 ', results['f1']
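# NOTE (illustrative sketch): the CLEF tests in this module call a vectorize_corpus() helper
# that is defined elsewhere and returns a single tf-idf matrix. The hypothetical function
# below is only a minimal sketch of such a helper, assuming it wraps sklearn's
# TfidfVectorizer with the precomputed vocabulary, the chosen tokenizer, and n-grams up to
# max_ngram_size; it is not the project's actual implementation.
def vectorize_corpus_sketch(corpus, tokenizer, vocabulary, max_ngram_size):
    from sklearn.feature_extraction.text import TfidfVectorizer

    # a fixed vocabulary restricts the features to the terms loaded from the DB table
    vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                                 vocabulary=vocabulary,
                                 ngram_range=(1, max_ngram_size))
    # returns a scipy sparse matrix of shape (n_documents, len(vocabulary))
    return vectorizer.fit_transform(corpus)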
def test_lemmatized_wiktionary_google_bigrams(corpus_train_data, corpus_test_data, vocabulary_src,
                                              with_stopwords_removal, use_chi_features,
                                              use_raw_tokens):
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from clef_globals import min_df, min_tf, test_set_size, max_labels
    from clef_vocabulary_loader import load_common_vocabulary

    max_ngram_size = 2

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'
    if use_raw_tokens == False:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name1 = 'clef_2010_{0}{1}_lemmas{2}_unigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern,
        stopwords_pattern, min_df, min_tf)
    vocabulary_tbl_name2 = 'clef_2010_{0}{1}_lemmas_bigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern,
        stopwords_pattern, min_df, min_tf)
    vocabulary_tbl_intersect = 'wiktionary_google_bigrams_vw'
    vocabulary = load_common_vocabulary(vocabulary_tbl_name1, vocabulary_tbl_name2,
                                        vocabulary_tbl_intersect, 'lemma')

    # generate tfidf vectors
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'],
                                                  tokenizer, vocabulary, max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                 tokenizer, vocabulary, max_ngram_size)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       test_set_size, max_labels)
    print vocabulary_tbl_name1, '^', vocabulary_tbl_name2, '^', vocabulary_tbl_intersect, ' --> ', 'precision ', results['precision'], 'recall ', results['recall'], 'f1 ', results['f1']
def build_raw_lemmatized_bigrams_stopwords_vocabulary(corpus, stop_words):
    from ng20_globals import min_tf, min_df
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer

    tokenizer = RawLemmaTokenizer()
    max_ngram_size = 2

    vocabulary = build_vocabulary(corpus, tokenizer, stop_words, max_ngram_size, min_df, min_tf)

    # save to DB
    tbl_name = 'ng20_raw_lemmas_bigrams_stopwords_df{0}_tf{1}'.format(min_df, min_tf)
    save_vocabulary(vocabulary, tbl_name)

    print 'done ' + tbl_name
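# Hedged usage sketch for the builder above: it assumes sklearn's fetch_20newsgroups can
# stand in for the project's own ng20 loader and that NLTK's English stopword list is an
# acceptable stop_words argument; the project may load both differently.
def example_build_bigrams_vocabulary():
    from nltk.corpus import stopwords
    from sklearn.datasets import fetch_20newsgroups

    stop_words = stopwords.words('english')           # list of English stopwords
    corpus = fetch_20newsgroups(subset='train').data  # list of raw document strings
    build_raw_lemmatized_bigrams_stopwords_vocabulary(corpus, stop_words)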
def test_lemmatized_bigrams_unigrams(bigrams_src, corpus_train_data, corpus_test_data, label_names,
                                     with_stopwords_removal, use_chi_features, use_raw_tokens):
    import numpy as np
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from ng20_globals import max_labels, min_df, min_tf
    from ng20_vocabulary_loader import load_common_vocabulary_extend_unigrams

    max_ngram_size = 2

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'
    if use_raw_tokens == False:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'ng20{0}_lemmas{1}_bigrams{2}_df{3}_tf{4}'.format(
        raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df, min_tf)
    # build the name of the knowledge-base bigrams table/view from the requested sources
    # (see the compact equivalent sketched after this function)
    if len(bigrams_src) == 1:
        vocabulary_tbl_intersect = '{0}_bigrams'.format(bigrams_src[0])
    else:
        vocabulary_tbl_intersect = '{0}_'.format(bigrams_src[0])
        for i in range(len(bigrams_src) - 1):
            vocabulary_tbl_intersect = '{0}{1}_'.format(vocabulary_tbl_intersect, bigrams_src[i + 1])
        vocabulary_tbl_intersect = '{0}bigrams_vw'.format(vocabulary_tbl_intersect)
    vocabulary = load_common_vocabulary_extend_unigrams(vocabulary_tbl_name,
                                                        vocabulary_tbl_intersect, 'lemma')
    print 'done loading vocabulary'

    # generate tfidf vectors
    vectorizer, corpus_train_tfidf_vectors = vectorize_corpus(
        corpus_train_data['corpus'], tokenizer, vocabulary, max_ngram_size)
    _, corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                    tokenizer, vocabulary, max_ngram_size)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'], max_labels)
    print_top_feature_names(results['features_weights'],
                            np.asarray(vectorizer.get_feature_names()),
                            vocabulary_tbl_name + '_' + vocabulary_tbl_intersect, label_names)
    print vocabulary_tbl_name, '^', vocabulary_tbl_intersect, '(extended unigrams) --> ', 'accuracy ', results['accuracy']
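# The naming loop above joins the bigram sources with underscores: ['wiktionary', 'google']
# yields 'wiktionary_google_bigrams_vw', while a single source such as ['wiktionary'] yields
# 'wiktionary_bigrams'. The function below is a compact equivalent, given here only as an
# illustration; it is not called by the code above.
def intersect_view_name(bigrams_src):
    if len(bigrams_src) == 1:
        return '{0}_bigrams'.format(bigrams_src[0])
    return '{0}_bigrams_vw'.format('_'.join(bigrams_src))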
def test_lemmatized_bigrams_with_LSA(corpus_train_data, corpus_test_data, label_names,
                                     with_stopwords_removal, use_chi_features,
                                     use_raw_tokens, num_components):
    import numpy as np
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from ng20_globals import max_labels, min_df, min_tf
    from ng20_vocabulary_loader import load_vocabulary
    from sklearn.decomposition import TruncatedSVD

    max_ngram_size = 2

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'
    if use_raw_tokens == False:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'ng20{0}_lemmas{1}_bigrams{2}_df{3}_tf{4}'.format(
        raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df, min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    vectorizer, corpus_train_tfidf_vectors = vectorize_corpus(
        corpus_train_data['corpus'], tokenizer, vocabulary, max_ngram_size)
    _, corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                    tokenizer, vocabulary, max_ngram_size)

    # apply LSA: fit on the training vectors only, then project train and test
    lsa = TruncatedSVD(n_components=num_components)
    lsa.fit(corpus_train_tfidf_vectors)
    corpus_train_tfidf_vectors = lsa.transform(corpus_train_tfidf_vectors)
    corpus_test_tfidf_vectors = lsa.transform(corpus_test_tfidf_vectors)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'], max_labels)
    print_top_feature_names(results['features_weights'],
                            np.asarray(vectorizer.get_feature_names()),
                            vocabulary_tbl_name, 'lsa_' + label_names)
    print 'LSA ^', vocabulary_tbl_name, ' --> ', 'precision ', results['precision'], 'recall ', results['recall'], 'f1 ', results['f1']
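# Both LSA tests follow the same pattern: fit TruncatedSVD on the training tf-idf matrix only,
# then project train and test into the same num_components-dimensional space. The standalone
# sketch below demonstrates that pattern on random sparse data; the shapes, density, and
# component count are illustrative only.
def example_lsa_projection():
    from scipy import sparse
    from sklearn.decomposition import TruncatedSVD

    train = sparse.random(100, 500, density=0.05, format='csr')  # stand-in for train tf-idf
    test = sparse.random(20, 500, density=0.05, format='csr')    # stand-in for test tf-idf

    lsa = TruncatedSVD(n_components=50)
    lsa.fit(train)                     # learn the components on training data only
    train_lsa = lsa.transform(train)   # dense (100, 50) array
    test_lsa = lsa.transform(test)     # dense (20, 50) array, same component space
    return train_lsa, test_lsa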
def run_experiment(corpus_train_data, corpus_test_data, label_names,
                   with_stopwords_removal=False, use_chi_features=False,
                   preprocess_tokens='raw', ngram_size=1, knowledge_set=None):
    import numpy as np
    from commons.lemmatizing_tokenizer import RawTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from commons.stemming_tokenizer import RawStemmingTokenizer
    from ng20_globals import max_labels, min_df, min_tf
    from ng20_vocabulary_loader import load_vocabulary
    from ng20_vocabulary_loader import load_common_vocabulary
    from commons.globals import knowledge_dic

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'

    if preprocess_tokens == 'lemmatize':
        raw_tokens_pattern = '_raw_lemmas'
        tokenizer = RawLemmaTokenizer()
    elif preprocess_tokens == 'stem':
        raw_tokens_pattern = '_raw_stems'
        tokenizer = RawStemmingTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawTokenizer()

    if ngram_size == 1:
        ngram_pattern = '_unigrams'
    elif ngram_size == 2:
        ngram_pattern = '_bigrams'

    # load vocabulary
    if knowledge_set is not None:
        # intersect the corpus bigrams with the knowledge-base view and extend with unigrams
        vocabulary_tbl_name1 = 'ng20{0}{1}_unigrams{2}_df{3}_tf{4}'.format(
            raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df, min_tf)
        vocabulary_tbl_name2 = 'ng20{0}{1}_bigrams{2}_df{3}_tf{4}'.format(
            raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df, min_tf)
        vocabulary_tbl_intersect = knowledge_dic[knowledge_set.lower()]
        vocabulary = load_common_vocabulary(vocabulary_tbl_name1, vocabulary_tbl_name2,
                                            vocabulary_tbl_intersect, 'stem')
        vocabulary_tbl_name = '{0}^{1}^{2}'.format(vocabulary_tbl_name1, vocabulary_tbl_name2,
                                                   vocabulary_tbl_intersect)
    else:
        vocabulary_tbl_name = 'ng20{0}{1}{2}{3}_df{4}_tf{5}'.format(
            raw_tokens_pattern, chi_features_pattern, ngram_pattern,
            stopwords_pattern, min_df, min_tf)
        vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    vectorizer, corpus_train_tfidf_vectors = vectorize_corpus(
        corpus_train_data['corpus'], tokenizer, vocabulary, ngram_size)
    _, corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                    tokenizer, vocabulary, ngram_size)

    # classify & evaluate
    results = classify_cv(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                          corpus_test_tfidf_vectors, corpus_test_data['labels'], max_labels)
    print_top_feature_names(results['features_weights'],
                            np.asarray(vectorizer.get_feature_names()),
                            vocabulary_tbl_name, label_names)
    print vocabulary_tbl_name, ' --> ', 'accuracy ', results['accuracy']
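# Hedged usage sketch for run_experiment: the corpus dictionaries ({'corpus': ..., 'labels': ...})
# and label_names are assumed to come from the project's own ng20 loading code, approximated here
# with sklearn's fetch_20newsgroups; the 'wiktionary' knowledge_set key is only an example of
# what knowledge_dic might contain, not a confirmed entry.
def example_run_experiments():
    from sklearn.datasets import fetch_20newsgroups

    train = fetch_20newsgroups(subset='train')
    test = fetch_20newsgroups(subset='test')
    corpus_train_data = {'corpus': train.data, 'labels': train.target}
    corpus_test_data = {'corpus': test.data, 'labels': test.target}
    label_names = train.target_names

    # plain stemmed unigrams
    run_experiment(corpus_train_data, corpus_test_data, label_names,
                   preprocess_tokens='stem', ngram_size=1)
    # stemmed bigrams intersected with a knowledge-base view (key is illustrative)
    run_experiment(corpus_train_data, corpus_test_data, label_names,
                   preprocess_tokens='stem', ngram_size=2, knowledge_set='wiktionary')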