def build_raw_lemmatized_chi_bigrams_vocabulary(corpus, labels, vocabulary_src):
    from clef_globals import min_df, min_tf
    from clef_vocabulary_loader import load_vocabulary
    from lemmatizing_tokenizer import RawLemmaTokenizer

    tokenizer = RawLemmaTokenizer()
    stop_words = {}
    max_ngram_size = 2

    # load initial vocabulary
    initial_vocabulary_tbl_name = 'clef_2010_{0}_raw_lemmas_bigrams_df{1}_tf{2}'.format(
        vocabulary_src, min_df, min_tf)
    initial_vocabulary = load_vocabulary(initial_vocabulary_tbl_name)

    # reduce the initial bigram vocabulary by chi-squared score against the labels
    vocabulary = build_chi_vocabulary(corpus, labels, initial_vocabulary, tokenizer,
                                      stop_words, max_ngram_size, min_df, min_tf)

    # save to DB
    tbl_name = 'clef_2010_{0}_raw_lemmas_chi_bigrams_df{1}_tf{2}'.format(
        vocabulary_src, min_df, min_tf)
    save_vocabulary(vocabulary, tbl_name)
    print 'done ' + tbl_name
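# The builder above delegates the actual feature selection to build_chi_vocabulary
# (defined elsewhere in this project). As a self-contained sketch of the same idea,
# not this project's implementation, scikit-learn's chi2 scorer can rank unigrams and
# bigrams against the labels directly; the toy documents, labels, and the helper name
# below are illustrative assumptions only.
def _chi_bigram_selection_sketch():
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_selection import SelectKBest, chi2

    docs = ['renal failure observed', 'acute renal failure', 'patient recovered fully']
    labels = [1, 1, 0]

    vectorizer = CountVectorizer(ngram_range=(1, 2))
    counts = vectorizer.fit_transform(docs)

    # keep only the 5 highest-scoring unigrams/bigrams as the reduced vocabulary
    selector = SelectKBest(chi2, k=5).fit(counts, labels)
    selected = [term for term, keep in
                zip(vectorizer.get_feature_names(), selector.get_support()) if keep]
    print selected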
def test_lemmatized_unigrams(corpus, labels, vocabulary_src, with_stopwords_removal, use_chi_features):
    import thread
    from lemmatizing_tokenizer import LemmaTokenizer
    from clef_globals import min_df, min_tf, test_set_size, max_labels
    from clef_vocabulary_loader import load_vocabulary

    max_ngram_size = 1
    tokenizer = LemmaTokenizer()

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'

    # load vocabulary
    vocabulary_tbl_name = 'clef_2010_{0}_lemmas{1}_unigrams{2}_df{3}_tf{4}'.format(
        vocabulary_src, chi_features_pattern, stopwords_pattern, min_df, min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    corpus_tfidf_vectors = vectorize_corpus(corpus, tokenizer, vocabulary, max_ngram_size)

    # classify & evaluate once at a fixed 0.5 threshold,
    # then sweep the decision threshold from 0.00 to 1.00
    classify(labels, corpus_tfidf_vectors, test_set_size, max_labels, 0.5)
    for i in range(0, 101):
        #thread.start_new_thread(classify, (labels, corpus_tfidf_vectors, test_set_size, max_labels, i / 100.0))
        classify(labels, corpus_tfidf_vectors, test_set_size, max_labels, i / 100.0)
def test_lemmatized_bigrams_with_LSA(corpus_train_data, corpus_test_data, vocabulary_src,
                                     with_stopwords_removal, use_chi_features, use_raw_tokens,
                                     num_components):
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from clef_globals import min_df, min_tf, test_set_size, max_labels
    from clef_vocabulary_loader import load_vocabulary
    from sklearn.decomposition import TruncatedSVD

    max_ngram_size = 2

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'
    if use_raw_tokens == False:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'clef_2010_{0}{1}_lemmas{2}_bigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df, min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'], tokenizer, vocabulary, max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'], tokenizer, vocabulary, max_ngram_size)

    # apply LSA: fit the truncated SVD on the training vectors only,
    # then project both training and test vectors into the latent space
    lsa = TruncatedSVD(n_components=num_components)
    lsa.fit(corpus_train_tfidf_vectors)
    corpus_train_tfidf_vectors = lsa.transform(corpus_train_tfidf_vectors)
    corpus_test_tfidf_vectors = lsa.transform(corpus_test_tfidf_vectors)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       test_set_size, max_labels)
    print 'LSA ^', vocabulary_tbl_name, ' --> ', 'precision ', results['precision'], 'recall ', results['recall'], 'f1 ', results['f1']
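# A generic, self-contained LSA sketch with scikit-learn, shown for reference only.
# It is independent of this project's vectorize_corpus/classify helpers, and the
# post-SVD normalization step is a common addition, not something the function above does.
def _lsa_pipeline_sketch(train_docs, test_docs, num_components):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.preprocessing import Normalizer

    tfidf = TfidfVectorizer()
    train_vectors = tfidf.fit_transform(train_docs)   # fit the vocabulary on training data only
    test_vectors = tfidf.transform(test_docs)

    lsa = TruncatedSVD(n_components=num_components)
    train_lsa = lsa.fit_transform(train_vectors)      # learn the latent space from training vectors
    test_lsa = lsa.transform(test_vectors)            # project test vectors into the same space

    normalizer = Normalizer(copy=False)
    return normalizer.fit_transform(train_lsa), normalizer.transform(test_lsa)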
def test_lemmatized_bigrams(corpus_train_data, corpus_test_data, vocabulary_src,
                            with_stopwords_removal, use_chi_features, use_raw_tokens):
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from clef_globals import min_df, min_tf, test_set_size, max_labels
    from clef_vocabulary_loader import load_vocabulary

    max_ngram_size = 2

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'
    if use_raw_tokens == False:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'clef_2010_{0}{1}_lemmas{2}_bigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df, min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'], tokenizer, vocabulary, max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'], tokenizer, vocabulary, max_ngram_size)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       test_set_size, max_labels)
    print vocabulary_tbl_name, ' --> ', 'precision ', results['precision'], 'recall ', results['recall'], 'f1 ', results['f1']
def build_stemmed_unigrams_stopwords_vocabulary(corpus, labels, stop_words, vocabulary_src):
    from clef_globals import min_df, min_tf
    from clef_vocabulary_loader import load_vocabulary
    from stemming_tokenizer import StemmingTokenizer

    tokenizer = StemmingTokenizer()
    max_ngram_size = 1

    # load initial vocabulary
    initial_vocabulary_tbl_name = 'clef_2010_{0}_stems_unigrams_stopwords_df{1}_tf{2}'.format(
        vocabulary_src, min_df, min_tf)
    initial_vocabulary = load_vocabulary(initial_vocabulary_tbl_name)

    # reduce the initial unigram vocabulary by chi-squared score against the labels
    vocabulary = build_chi_vocabulary(corpus, labels, initial_vocabulary, tokenizer,
                                      stop_words, max_ngram_size, min_df, min_tf)

    # save to DB
    tbl_name = 'clef_2010_{0}_stems_chi_unigrams_stopwords_df{1}_tf{2}'.format(
        vocabulary_src, min_df, min_tf)
    save_vocabulary(vocabulary, tbl_name)
    print 'done ' + tbl_name
def test_lemmatized_unigrams(corpus_train_data, corpus_test_data, vocabulary_src,
                             with_stopwords_removal, use_chi_features, use_raw_tokens):
    from lemmatizing_tokenizer import LemmaTokenizer
    from lemmatizing_tokenizer import RawLemmaTokenizer
    from clef_globals import min_df, min_tf, test_set_size, max_labels
    from clef_vocabulary_loader import load_vocabulary

    max_ngram_size = 1

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'
    if use_raw_tokens == False:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'clef_2010_{0}{1}_lemmas{2}_unigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df, min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'], tokenizer, vocabulary, max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'], tokenizer, vocabulary, max_ngram_size)

    # classify & evaluate, sweeping the decision threshold from 0.40 to 0.55
    for i in range(40, 56):
        classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                 corpus_test_tfidf_vectors, corpus_test_data['labels'],
                 test_set_size, max_labels, i / 100.0)
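# A hedged driver sketch for the test functions above. load_clef_training_data and
# load_clef_test_data are hypothetical loaders (not part of this module), and 'desc'
# as a vocabulary_src value is an assumption; the only detail grounded in the code
# above is that each data argument is a dict with 'corpus' and 'labels' keys.
def _run_unigram_experiments_sketch():
    corpus_train_data = load_clef_training_data()   # assumed: {'corpus': [...], 'labels': [...]}
    corpus_test_data = load_clef_test_data()        # assumed: same structure

    # plain lemmas: no stopword removal, no chi-squared selection, non-raw tokens
    test_lemmatized_unigrams(corpus_train_data, corpus_test_data, 'desc', False, False, False)
    # chi-squared-selected vocabulary with stopword removal, raw lemmas
    test_lemmatized_unigrams(corpus_train_data, corpus_test_data, 'desc', True, True, True)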