Example #1
0
def build_raw_lemmatized_chi_bigrams_vocabulary(corpus, labels,
                                                vocabulary_src):
    from clef_globals import *
    from clef_vocabulary_loader import load_vocabulary
    from lemmatizing_tokenizer import RawLemmaTokenizer
    from sklearn.feature_extraction.text import CountVectorizer

    tokenizer = RawLemmaTokenizer()
    stop_words = {}
    max_ngram_size = 2

    # load initial vocabulary
    initial_vocabulary_tbl_name = 'clef_2010_{0}_raw_lemmas_bigrams_df{1}_tf{2}'.format(
        vocabulary_src, min_df, min_tf)
    initial_vocabulary = load_vocabulary(initial_vocabulary_tbl_name)

    vocabulary = build_chi_vocabulary(corpus, labels, initial_vocabulary,
                                      tokenizer, stop_words, max_ngram_size,
                                      min_df, min_tf)

    # save to DB
    tbl_name = 'clef_2010_{0}_raw_lemmas_chi_bigrams_df{1}_tf{2}'.format(
        vocabulary_src, min_df, min_tf)
    save_vocabulary(vocabulary, tbl_name)
    print 'done ' + tbl_name
def test_lemmatized_unigrams(corpus, labels, vocabulary_src,
                             with_stopwords_removal, use_chi_features):
    import thread
    from lemmatizing_tokenizer import LemmaTokenizer
    from clef_globals import *
    from clef_vocabulary_loader import load_vocabulary

    max_ngram_size = 1
    tokenizer = LemmaTokenizer()

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'

    # load vocabulary
    vocabulary_tbl_name = 'clef_2010_{0}_lemmas{1}_unigrams{2}_df{3}_tf{4}'.format(
        vocabulary_src, chi_features_pattern, stopwords_pattern, min_df,
        min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    corpus_tfidf_vectors = vectorize_corpus(corpus, tokenizer, vocabulary,
                                            max_ngram_size)

    # classify & evaluate
    classify(labels, corpus_tfidf_vectors, test_set_size, max_labels, 0.5)
    for i in range(0, 101):
        #thread.start_new_thread(classify, (labels,corpus_tfidf_vectors,test_set_size,max_labels,i/100.0))
        classify(labels, corpus_tfidf_vectors, test_set_size, max_labels,
                 i / 100.0)
def test_lemmatized_bigrams_with_LSA(corpus_train_data, corpus_test_data,
                                     vocabulary_src, with_stopwords_removal,
                                     use_chi_features, use_raw_tokens,
                                     num_components):
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from clef_globals import min_df, min_tf, test_set_size, max_labels
    from clef_vocabulary_loader import load_vocabulary
    from sklearn.decomposition import TruncatedSVD
    from scipy import sparse
    import numpy

    max_ngram_size = 2

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'
    if use_raw_tokens == False:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'clef_2010_{0}{1}_lemmas{2}_bigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern,
        stopwords_pattern, min_df, min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'],
                                                  tokenizer, vocabulary,
                                                  max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                 tokenizer, vocabulary,
                                                 max_ngram_size)

    # apply LSA
    #print numpy.max(corpus_train_tfidf_vectors)
    #print numpy.min(corpus_train_tfidf_vectors)
    lsa = TruncatedSVD(n_components=num_components)
    lsa.fit(corpus_train_tfidf_vectors)
    #corpus_train_tfidf_vectors = numpy.dot(corpus_train_tfidf_vectors,pca.components_.transpose())
    corpus_train_tfidf_vectors = lsa.transform(corpus_train_tfidf_vectors)
    corpus_test_tfidf_vectors = lsa.transform(corpus_test_tfidf_vectors)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       test_set_size, max_labels)

    print 'LSA ^', vocabulary_tbl_name, ' --> ', 'precision ', results[
        'precision'], 'recall ', results['recall'], 'f1 ', results['f1']
def test_lemmatized_bigrams(corpus_train_data, corpus_test_data,
                            vocabulary_src, with_stopwords_removal,
                            use_chi_features, use_raw_tokens):
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from clef_globals import min_df, min_tf, test_set_size, max_labels
    from clef_vocabulary_loader import load_vocabulary

    max_ngram_size = 2

    if with_stopwords_removal == False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features == False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'
    if use_raw_tokens == False:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'clef_2010_{0}{1}_lemmas{2}_bigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern,
        stopwords_pattern, min_df, min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'],
                                                  tokenizer, vocabulary,
                                                  max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                 tokenizer, vocabulary,
                                                 max_ngram_size)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       test_set_size, max_labels)

    print vocabulary_tbl_name, ' --> ', 'precision ', results[
        'precision'], 'recall ', results['recall'], 'f1 ', results['f1']
Example #5
0
def build_stemmed_unigrams_stopwords_vocabulary(corpus, labels, stop_words,
                                                vocabulary_src):
    from clef_globals import *
    from clef_vocabulary_loader import load_vocabulary
    from stemming_tokenizer import StemmingTokenizer

    tokenizer = StemmingTokenizer()
    max_ngram_size = 1
    # load initial vocabulary
    initial_vocabulary_tbl_name = 'clef_2010_{0}_stems_unigrams_stopwords_df{1}_tf{2}'.format(
        vocabulary_src, min_df, min_tf)
    initial_vocabulary = load_vocabulary(initial_vocabulary_tbl_name)

    vocabulary = build_chi_vocabulary(corpus, labels, initial_vocabulary,
                                      tokenizer, stop_words, max_ngram_size,
                                      min_df, min_tf)

    # save to DB
    tbl_name = 'clef_2010_{0}_stems_chi_unigrams_stopwords_df{1}_tf{2}'.format(
        vocabulary_src, min_df, min_tf)
    save_vocabulary(vocabulary, tbl_name)
    print 'done ' + tbl_name
Example #6
0
def test_lemmatized_unigrams(corpus_train_data,corpus_test_data,vocabulary_src,with_stopwords_removal,use_chi_features,use_raw_tokens):
    from lemmatizing_tokenizer import LemmaTokenizer
    from lemmatizing_tokenizer import RawLemmaTokenizer
    from clef_globals import *
    from clef_vocabulary_loader import load_vocabulary
    
    max_ngram_size = 1
    
    if with_stopwords_removal==False:
        stopwords_pattern = ''
    else:
        stopwords_pattern = '_stopwords'
    if use_chi_features==False:
        chi_features_pattern = ''
    else:
        chi_features_pattern = '_chi'
    if use_raw_tokens==False:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()
    
    # load vocabulary
    vocabulary_tbl_name = 'clef_2010_{0}{1}_lemmas{2}_unigrams{3}_df{4}_tf{5}'.format(vocabulary_src,raw_tokens_pattern,chi_features_pattern,stopwords_pattern,min_df,min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'],tokenizer,vocabulary,max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],tokenizer,vocabulary,max_ngram_size)
    
    # classify & evaluate    
    # classify & evaluate
    for i in range(40,56):
        classify(corpus_train_tfidf_vectors,corpus_train_data['labels'],
                       corpus_test_tfidf_vectors,corpus_test_data['labels'],
                       test_set_size,max_labels,
                       i/100.0)