Example #1
    def Common_Vectorizer_usage():
        from sklearn.feature_extraction.text import CountVectorizer
        vectorizer = CountVectorizer(min_df=1)
        corpus = [
            'This is the first document.',
            'This is the second second document.',
            'And the third one.',
            'Is this the first document?',
        ]

        analyze = vectorizer.build_analyzer()
        print analyze("This is a text document to analyze.")
        print analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze']
        
        X=vectorizer.fit_transform(corpus)
        print vectorizer.get_feature_names()
        print vectorizer.vocabulary_    #.get('document')
        print vectorizer.transform(['Something completely new.']).toarray()
        print list(X) 
        
        #bigram========================================================
        bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1)
        analyze = bigram_vectorizer.build_analyzer()
        print analyze('Bi-grams are cool!')
        X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
        print X_2

        feature_index = bigram_vectorizer.vocabulary_.get('is this')
        print X_2[:, feature_index] 
        
        #marui test
        print '\n\nmarui test====================='
        def t_preprocessor(s):
            return ','.join([x.lower() for x in s.split(' ')])

        stop_words1=['is','a','this']           # OK: becomes frozenset(['a', 'this', 'is'])
        stop_words2={'is':0,'a':1,'this':2}     # OK: the dict keys are converted to frozenset(['a', 'this', 'is'])
            
        cv = CountVectorizer(preprocessor=t_preprocessor,stop_words=stop_words2)
        params=cv.get_params()
        print 'get_params()',type(params),'---------------'
        for k in params:
            print k,'\t',params[k]
        print 'get_params end--------------'
        print '\nget_stop_words=',cv.get_stop_words()
        
        cv.fit(corpus)
        print cv.get_feature_names()
        print cv.transform(corpus).toarray()
        print '\npreprocessor test, result:\t',cv.build_preprocessor()('this is a document')
        print '\ntokenizer test, result',cv.build_tokenizer()('this is a document')
        print '\ntokenizer test 2, result',cv.build_tokenizer()('th-is is a document')
        print '\ntokenizer test 2, result',cv.build_tokenizer()('th_is is a document')
        print '\ntokenizer test 2, result',cv.build_tokenizer()('th&is is a document')

        """
Example #2
def generate_min_sample_size_report(data_dir):
    sample_sizes = [100, 200, 500, 1000, 2000, 3000]

    print('Reading data...')
    inputs = dict()
    for min_sample in sample_sizes:
        inputs[min_sample] = _read_data(data_dir,
                                        {'min_sample_size': min_sample})

    print('Analyzing vocabulary...')
    vocab_size = dict()
    for min_sample in sample_sizes:
        print('Getting vocabulary size for sample size {}'.format(min_sample))
        data = inputs[min_sample]
        cv = CountVectorizer(
            max_df=.95,
            min_df=.0001,
            token_pattern=r"(?u)\b[A-ZÅÄÖa-zåäö][A-ZÅÄÖa-zåäö]+\b")
        cv.fit(data.train.inputs, data.train.targets)
        non_empty_train = 0
        for sample in data.train.inputs:
            if any(token in cv.vocabulary_
                   for token in cv.build_tokenizer()(sample)):
                non_empty_train += 1
        non_empty_test = 0
        for sample in data.test.inputs:
            if any(token in cv.vocabulary_
                   for token in cv.build_tokenizer()(sample)):
                non_empty_test += 1
        vocab_size[min_sample] = (len(cv.vocabulary_), len(data.train.targets),
                                  len(data.test.targets),
                                  non_empty_train / len(data.train.targets),
                                  non_empty_test / len(data.test.targets))
    for k, v in vocab_size.items():
        print(k, v)

    print('Testing accuracy...')
    accuracy_results = pd.DataFrame(
        columns=['Model', 'Min Sample', 'Accuracy'])
    for min_sample in sample_sizes:
        data = inputs[min_sample]
        for model_type in [
                Model.SGD_CLASSIFER, Model.NAIVE_BAYES, Model.MLP_CLASSIFER
        ]:
            print('Running model {} for sample size {}'.format(
                model_type, min_sample))
            model = Model(model_type=model_type, verbose=False)
            model.train(data.train)
            accuracy = model.test(data.test)
            accuracy_results.loc[len(accuracy_results)] = [
                model_type, min_sample, accuracy
            ]
            print('Got accuracy {}'.format(accuracy))
    print(accuracy_results.sort_values(['Model', 'Min Sample']))
Example #3
def dump_sentences():
    corpus = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    docs = corpus.data
    labels = corpus.target
    label_names = corpus.target_names
    vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    preprocess = vectorizer.build_preprocessor()
    tokenize = vectorizer.build_tokenizer()
    
    def words(doc):
        p = preprocess(doc)
        return ' '.join(t.encode('ascii', 'replace') for t in tokenize(p))
    
    doccount = 0
    vocab = set()
    with open('20news.txt', 'w') as f:
        for doc, lbl in zip(docs, labels):
            w = words(doc)
            print >> f, label_names[lbl]
            print >> f, w
            doccount += 1
            vocab.update(w.split(' '))
    
    print 'Number of documents:', doccount
    print 'Number of unique words:', len(vocab)
Example #4
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)  
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
    
    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary = True)   
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]
    
    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec= vectorizer.transform(doc_str_list_test)
    
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # invoke the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    
    acc = np.mean(doc_test_predicted == doc_class_list_test)  
    print 'Accuracy: ', acc
    
    return acc
Example #5
def define_vocabulary_inner(doc,
                            stopwords=None,
                            token_pattern=r"(?u)[A-Za-z\?\!\-\.']+",
                            ngram_range=(1, 2),
                            min_dfreq=1e-5,
                            max_dfreq=0.9):
    """
    Tokenize the document and attach POS tag.
    Also return the noun phrases from the document.

    Parameters
    ----------
    doc : list[str]
    stemmer_type : str

    Returns
    -------
    list[list[tup[str]]], set, defaultdict

    """
    # This countvectorizer is used only for creating vocabulary
    cntvec = CountVectorizer(ngram_range=ngram_range,
                             stop_words=stopwords,
                             min_df=min_dfreq,
                             max_df=max_dfreq,
                             token_pattern=token_pattern)
    cntvec.fit(doc)
    vocabulary = cntvec.vocabulary_
    stopwords_all = set(stopwords or ()).union(cntvec.stop_words_)  # tolerate stopwords=None
    tokenizer = cntvec.build_tokenizer()

    return vocabulary, stopwords_all, tokenizer
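A possible usage sketch for the function above (the toy corpus, stopword set and frequency thresholds are illustrative, not from the original project):

docs = ["the cat sat on the mat", "the dog sat on the log"]
vocab, stop_all, tokenize = define_vocabulary_inner(
    docs, stopwords={"the", "on"}, min_dfreq=1, max_dfreq=1.0)
print(sorted(vocab))             # surviving unigrams and bigrams
print(tokenize("the cat sat"))   # ['the', 'cat', 'sat'] - the tokenizer alone does no filtering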
Example #6
def vectorize_by_term(queries, num_features, vocab=None):
    """
    Converts each query to a matrix where row i is a one hot representation
    of the (i+1)th term in the query.

    :param queries: list of queries
    :param num_features: size of feature vector
    :return: Row-wise data matrix. #rows = #queries and #cols = num_features
    """
    vectorizer = CountVectorizer(min_df=0,
                                 tokenizer=TOKENIZER,
                                 vocabulary=vocab)
    vectorizer.fit(queries)
    tokenize = vectorizer.build_tokenizer()
    X = []
    lengths = []
    for query in queries:
        vector = []
        for token in tokenize(query):
            vector.append(np.argmax(vectorizer.transform([token]).todense()))
        if len(vector) <= num_features:
            vector += [0] * (num_features - len(vector))
        else:
            vector = vector[:num_features]
        one_hot_mat = np.zeros((len(vector), len(vectorizer.vocabulary_)))
        one_hot_mat[np.arange(len(vector)), np.array(vector)] = 1
        X.append(one_hot_mat)
    return np.array(X), vectorizer.vocabulary_
Example #7
def test_classfier(test, vocabulary, classifiers):
    vectorizer = CountVectorizer()
    correct = 0
    count = [0, 0, 0, 0, 0]
    for phrase in test:
        f = []
        words = phrase[0]
        vector = [0] * (len(classifiers[0]) - 1)
        tokens = vectorizer.build_tokenizer()(words)
        for token in tokens:
            if token in vocabulary:
                vector[vocabulary[token]] += 1
        x = np.array([1] + vector)
        for classifier in classifiers:
            f.append(x.dot(classifier))

        estimate_phrase_class = 0
        for i in range(len(f)):
            if f[i] > 0:
                estimate_phrase_class = i + 1

        count[estimate_phrase_class] += 1

        true_phrase_class = int(phrase[1])

        if (estimate_phrase_class == true_phrase_class):
            correct += 1

    print("Correct: " + str(correct) + "/" + str(len(test)))
    print(correct / len(test))
    print(count)
Example #8
def build_text_processor(
    tokenize=True,
    lowercase=True,
    strip_accents='unicode',
    **kwargs,
):
    """ Generates a text preprocessor from sklearn CountVectorizer tools

    It is based on sklearn CountVectorizer functionalities.
    tokenize means that the input string will be tokenized as words before
    being glued back with single spaces. Its purpose is to handle
    whitespaces (newlines, tabs, multiple spaces, ...) and punctuation.
    kwargs are directly passed to CountVectorizer constructor, and will
    serve to process the texts. Most useful args are 'strip_accent' and
    'lowercase'.
    """
    preprocessor_countvect = CountVectorizer(
        lowercase=lowercase,
        strip_accents=strip_accents,
        **kwargs,
    )
    preprocessor = preprocessor_countvect.build_preprocessor()
    tokenizer = preprocessor_countvect.build_tokenizer()
    if tokenize:

        def transformer(x):
            return (' '.join(tokenizer(preprocessor(x))))
    else:
        transformer = preprocessor
    return (transformer)
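A quick usage sketch of the returned callable; the expected output assumes scikit-learn's default token pattern, which drops punctuation and single-character tokens:

clean = build_text_processor(tokenize=True, lowercase=True, strip_accents='unicode')
print(clean("Héllo,\n\tWorld!!  It's   2024."))
# 'hello world it 2024'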
Example #9
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data,
        movie_reviews.target,
        test_size=0.2,
        random_state=0)

    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [
        word_tokenizer(doc_str) for doc_str in doc_str_list_train
    ]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train,
                                                      doc_class_list_train,
                                                      fs_method)[:fs_num]

    print('Building VSM model...')
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec,
                              doc_class_list_train)  # invoke the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy: ', acc)

    return acc
Example #10
def get_inference_vectorizer(article_ids=None, sections_of_interest=None, vocabulary_file=None):

    # if article_ids is None, will use all articles
    # in the CSV passed to the read_in_articles method.
    articles = read_in_articles(article_ids=article_ids)
    raw_texts = [extract_raw_text(article, sections_of_interest) for article in articles]

    # we also use the prompts text to construct our vectorizer
    prompts = read_prompts()
    raw_prompt_text = " ".join(extract_text_from_prompts(prompts))

    raw_texts.append(raw_prompt_text)

    # there is at least one prompt with tokens short enough that CountVectorizer's default destroys it, so we allow any single character through.
    if vocabulary_file is not None:
        with open(vocabulary_file, 'r') as vf:
            vocab = [line.strip() for line in vf]
        vectorizer = CountVectorizer(vocabulary=vocab, token_pattern=r"\b\w+\b")
        print("Loaded {} words from vocab file {}".format(len(vocab), vocabulary_file))
    else:
        vectorizer = CountVectorizer(max_features=20000, token_pattern=r"\b\w+\b")
    vectorizer.fit(raw_texts)
    tokenizer = vectorizer.build_tokenizer() 

    str_to_idx = vectorizer.vocabulary_
    str_to_idx[SimpleInferenceVectorizer.PAD] = max(vectorizer.vocabulary_.values())
    str_to_idx[SimpleInferenceVectorizer.UNK] = str_to_idx[SimpleInferenceVectorizer.PAD]+1
    
    # note that for now the vectorizer is fit using only the
    # article texts (i.e., the vocab is based on words in full-texts,
    # not in prompts necessarily).
    return SimpleInferenceVectorizer(str_to_idx, tokenizer)
Example #11
def build_avg_fasttext_from_df(save_dir: str, df_path: str,
                               stemming_map_path: str, text_column: str,
                               label_column: str) -> None:
    check_df_and_stemming_paths(df_path, stemming_map_path)
    stemming_map = read_json_as_dict(stemming_map_path)
    document_list, labels = load_text_and_labels(df_path, text_column,
                                                 label_column)
    save_categorical_labels(save_dir, labels, as_numpy=True)

    # Tokenize
    cv = CountVectorizer(tokenizer=lambda text: tokenize_prune_stem(
        text, stemming_map=stemming_map))
    cv_tokenizer = cv.build_tokenizer()
    document_list = [cv_tokenizer(document) for document in document_list]
    # Load FastText and generate average embeddings
    ft_model = _load_pretrained_swahili_fasttext(RES_DIR)
    avg_ft_document_embeddings = _generate_avg_ft_document_embedding(
        ft_model, document_list)
    np.save(os.path.join(save_dir, 'ft-embeddings.npy'),
            avg_ft_document_embeddings)

    # Printouts
    num_docs = avg_ft_document_embeddings.shape[0]
    dims = avg_ft_document_embeddings.shape[1]
    print(f'{num_docs} documents have been embedded into {dims} dims')
    # Save meta-data to disk
    write_to_meta(
        data_meta_path=os.path.join(save_dir, 'meta.json'),
        key_val={
            'embedding_dims': dims,
            'num_docs': len(document_list),
        },
    )
Example #12
def get_doc2vec_embeddngs(
    save_dir: str,
    document_list: List[str],
    stemming_map: Dict[str, str],
    num_epochs: int,
    vector_size: int,
    training_regime: int,
) -> np.ndarray:
    # Tokenize
    cv = CountVectorizer(tokenizer=lambda text: tokenize_prune_stem(
        text, stemming_map=stemming_map))
    cv_tokenizer = cv.build_tokenizer()
    document_list = [cv_tokenizer(document) for document in document_list]

    # Convert to TaggedDocument and train
    print('Training Doc2Vec...')
    tagged_document_list = [
        TaggedDocument(doc, [i]) for i, doc in enumerate(document_list)
    ]
    doc2vec_model = _train_doc2vec(
        docs=tagged_document_list,
        vector_size=vector_size,
        num_epochs=num_epochs,
        training_regime=training_regime,
    )
    _save_for_inference(doc2vec_model, os.path.join(save_dir, 'doc2vec.model'))

    return _infer_document_embeddings(doc2vec_model, document_list)
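tokenize_prune_stem is not shown in these examples; a hedged sketch of what such a helper might look like (the name, signature and behaviour are assumed, not taken from the original project):

import re
from typing import Dict, List

TOKEN_RE = re.compile(r"(?u)\b\w\w+\b")  # same default pattern CountVectorizer uses

def tokenize_prune_stem(text: str, stemming_map: Dict[str, str]) -> List[str]:
    """Lowercase, tokenize, drop words missing from the stemming map, and stem the rest."""
    tokens = TOKEN_RE.findall(text.lower())
    return [stemming_map[tok] for tok in tokens if tok in stemming_map]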
Example #13
def tokenize_data(text_data):
    nltk.download('stopwords')
    vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
    tokenizer = vectorizer.build_tokenizer()

    tokens_list = tokenizer(text_data)

    return tokens_list
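Note that stop_words only takes effect inside build_analyzer(); the callable returned by build_tokenizer() splits on the token pattern and does no stop-word filtering. A small sketch of the difference (assumes the NLTK stopwords corpus is downloaded):

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words=stopwords.words('english'))
print(cv.build_tokenizer()("this is a test"))   # ['this', 'is', 'test'] - stop words kept
print(cv.build_analyzer()("this is a test"))    # ['test'] - stop words removed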
Example #14
 def nlp(self, model):
     if model == "default":
         cv = CountVectorizer(lowercase=self.lower_case)
         sk_word_tokenize = cv.build_tokenizer()
         sk_preprocesser = cv.build_preprocessor()
         self._nlp = lambda doc: sk_word_tokenize(sk_preprocesser(doc))
     else:
         self._nlp = model
Example #15
def remove_stop_word_tokenizer(s):
    """
    Custom tokenizer
    """
    count_vect = CountVectorizer()
    default_tokenizer_function = count_vect.build_tokenizer()
    words = default_tokenizer_function(s)
    words = list(w for w in words if w.lower() not in stopwordslist)
    return words
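This custom tokenizer is presumably meant to be plugged back into a vectorizer; a hedged usage sketch (stopwordslist here is a stand-in for whatever module-level stop-word collection the original code defines):

from sklearn.feature_extraction.text import CountVectorizer

stopwordslist = {'the', 'a', 'is'}   # assumed stand-in

cv = CountVectorizer(tokenizer=remove_stop_word_tokenizer)
cv.fit(["The cat is on a mat", "A dog is in the house"])
print(sorted(cv.vocabulary_))   # stop words never reach the vocabulary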
Example #16
def word_loss_stat(CountVectorizer, tweets, y, tweets_, y_):
    CountVectorizer.fit(tweets)
    tokenizer = CountVectorizer.build_tokenizer()
    loss = []
    for i in range(len(tweets_)):
        total = len(tokenizer(tweets_[i]))
        residue = CountVectorizer.transform([tweets_[i]]).sum()  # count of tokens kept by the fitted vocabulary
        loss.append((total, residue, 1-(residue*1.0)/total))
    return loss
Example #17
class Recommender:
    def __init__(self, followed_tweets, users_tweets):
        self.followed_tweets = followed_tweets
        self.users_tweets = users_tweets
        self.vectorizer = CountVectorizer()
        self.counts = []

    def generate(self, number_of_recommendations, followed_accounts,
                 how_many_days_ago):

        #generate a count list
        counts = [self.count_bag(tweet) for tweet in self.followed_tweets]

        #return sorted home timeline and sorted list of count
        return {
            "recommended_tweets":
            sorted(self.followed_tweets, key=self.count_bag, reverse=True),
            "counts":
            sorted(counts, reverse=True)
        }

    def count_bag(self, item):

        # generate a list of stop words
        stop_words = [
            word
            for word in CountVectorizer(stop_words='english').get_stop_words()
        ]
        stop_words.append('rt')  #retweets
        stop_words.append('https')  #urls

        # fetch the text of user's own timeline
        users_tweets_text = [tweet['text'] for tweet in self.users_tweets]

        # for user's own timeline, tokenize words, convert to lowercase, filter out stop words
        #generate a list of terms of the user's own timeline
        tokenize = self.vectorizer.build_tokenizer()
        terms = [
            word.lower() for sentence in users_tweets_text
            for word in tokenize(sentence)
            if len(word.lower()) > 1 and word.lower() not in stop_words
        ]

        count = 0
        #for each tweet in home timeline, tokenize words, convert to lowercase, filter out stop words
        words = [
            word.lower() for word in tokenize(item['text'])
            if len(word.lower()) > 1 and word.lower() not in stop_words
        ]

        # finally, for each word in words, if it exists in the terms of the
        # user's own timeline, increase count by one
        for word in words:
            if word in terms:
                count += 1

        return count
Example #18
def read_texts(tarfname, dname):
    """Read the data from the homework data file.

    Given the location of the data archive file and the name of the
    dataset (one of brown, reuters, or gutenberg), this returns a
    data object containing train, test, and dev data. Each is a list
    of sentences, where each sentence is a sequence of tokens.
    """
    import tarfile
    tar = tarfile.open(tarfname, "r:gz", errors='replace')
    for member in tar.getmembers():
        if dname in member.name and ('train.txt') in member.name:
            print('\ttrain: %s' % (member.name))
            train_txt = unicode(tar.extractfile(member).read(),
                                errors='replace')
        elif dname in member.name and ('test.txt') in member.name:
            print('\ttest: %s' % (member.name))
            test_txt = unicode(tar.extractfile(member).read(),
                               errors='replace')
        elif dname in member.name and ('dev.txt') in member.name:
            print('\tdev: %s' % (member.name))
            dev_txt = unicode(tar.extractfile(member).read(), errors='replace')

    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    count_vect.fit(train_txt.split("\n"))
    tokenizer = count_vect.build_tokenizer()

    class Data:
        pass

    data = Data()
    data.train = []
    for s in train_txt.split("\n"):
        toks = tokenizer(s)
        if len(toks) > 0:
            data.train.append(toks)
    data.test = []
    for s in test_txt.split("\n"):
        toks = tokenizer(s)
        if len(toks) > 0:
            data.test.append(toks)
    data.dev = []
    for s in dev_txt.split("\n"):
        toks = tokenizer(s)
        if len(toks) > 0:
            data.dev.append(toks)
    data.portion = []
    for s in dev_txt.split("\n"):
        toks = tokenizer(s)
        if len(toks) > 0:
            data.portion.append(toks)

    print(dname, " read.", "train:", len(data.train), "dev:", len(data.dev),
          "test:", len(data.test))
    return data
Example #19
def test():
    vectorizer_1 = CountVectorizer()

    tokenizer = vectorizer_1.build_tokenizer()
    stemmer = PorterStemmer()
    matrix, groups_map = construct_matrix_and_group(vectorizer_1)
    print(matrix.shape)
    _, query_group = divide_data_in_group(groups_map, 100)
    for x in np.arange(0.1, 1.0, 0.1):
        print(calculate_precision_and_recall(query_group, matrix, x))
Example #20
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)
    # split the dataset: 80% for training, 20% for testing, keeping the matching class labels
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data,
        movie_reviews.target,
        test_size=0.2,
        random_state=0)

    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()

    # doc_terms_list_train: the tokenized word list for every document in the training set
    doc_terms_list_train = [
        word_tokenizer(doc_str) for doc_str in doc_str_list_train
    ]

    # doc_class_list_train: the class index for each document
    term_set_fs = FeatureSelections.feature_selection(doc_terms_list_train,
                                                      doc_class_list_train,
                                                      fs_method)[:fs_num]
    print "term_set_fs length %s " % (len(term_set_fs))

    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)
    # Naive Bayes classifier
    # clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # invoke the MultinomialNB classifier
    # doc_test_predicted = clf.predict(doc_test_vec)

    # SVM classifier
    svclf = SVC(kernel='linear')
    svclf.fit(doc_train_vec, doc_class_list_train)
    doc_test_predicted = svclf.predict(doc_test_vec)

    # KNN
    # knnclf = KNeighborsClassifier()  # default with k=5
    # knnclf.fit(doc_train_vec, doc_class_list_train)
    # doc_test_predicted = knnclf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)

    print 'Accuracy: ', acc

    from sklearn.metrics import classification_report
    print 'precision, recall, F1-score as follows:'
    print classification_report(doc_test_predicted, doc_class_list_test)

    return acc
Example #21
def ngrams(year_start=1990, year_end=2016, type = 'Q', side_question=None, side_answer=None, term=None,
           historian_name_last=None, document_type=None):


    side_question, side_answer = get_sides(side_question, side_answer)

    vectorizer = CountVectorizer(max_features= 100000)

    docs = document_iterator(type=type, side_question=side_question, format='docs_only')
    vectorizer.fit(docs)

    features =  vectorizer.get_feature_names()
    vocabulary = {features[i]:i for i in range(len(features))}

    word_counts = np.zeros(shape=(year_end - year_start + 1, len(vocabulary)), dtype=int)

    docs = document_iterator(type=type, side_question=side_question, historian_name_last=historian_name_last, document_type=document_type)
    tokenizer = vectorizer.build_tokenizer()
    for doc in docs:
        year = int(doc[0][:4])
        document = doc[1].lower()
        for token in tokenizer(document):
            word_counts[year-year_start, vocabulary[token]] += 1



    totals = np.sum(word_counts, axis=1)

    word_counts =  word_counts[:,vocabulary[term]]
    word_frequencies = 1.0 * word_counts / totals

    viz_formatting = {'Plaintiff': 's',
                      'Defendant': '^'}

    if type == 'A':
        label = '{} in Answers by {} Witnesses.'.format( term, side_answer)
        viz_format = 'b{}'.format(viz_formatting[side_answer])
    if type == 'Q':
        label = '{} in Questions by {} Lawyers.'.format(term, side_question)
        viz_format = 'r{}'.format(viz_formatting[side_question])

    return {
        'year_start': year_start,
        'year_end': year_end,
        'term': term,
        'word_counts': word_counts,
        'word_frequencies': word_frequencies,
        'label': label,
        'viz_format': viz_format,
        'side_question': side_question,
        'side_answer': side_answer,
        'type': type,
        'historian_name_last': historian_name_last
    }
Example #22
 def __init__(self, mask_dates, max_length=MAX_LENGTH):
     # Steal the default preprocessor and tokenizer from sklearn
     v = CountVectorizer()
     self.max_length = max_length
     self.dat = re.compile(r'\b\d{1,2}\-?[a-z]{3}\-?\d{2,4}\b')
     if mask_dates:
         self.preprocess = lambda x: self.dat.sub('<DATE>', str(x).lower())
     else:
         self.preprocess = v.build_preprocessor()
     self.tokenize = v.build_tokenizer()
     self.is_num = re.compile(r'\b\d+\b')  # isolated numbers
Example #23
def build_vectorizer(vectorizer_type, use_stemmer):
    origin = CountVectorizer(
    ) if vectorizer_type == "count" else TfidfVectorizer()
    if vectorizer_type == "count":
        if use_stemmer:
            tokenizer = create_tokenizer(origin.build_tokenizer(),
                                         PorterStemmer())
            return CountVectorizer(tokenizer=tokenizer)
        else:
            return CountVectorizer()

    if vectorizer_type == "tf-idf":
        if use_stemmer:
            tokenizer = create_tokenizer(origin.build_tokenizer(),
                                         PorterStemmer())
            return TfidfVectorizer(tokenizer=tokenizer)
        else:
            return TfidfVectorizer()

    return None
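create_tokenizer is referenced but not defined in this example; a minimal sketch of how such a helper could combine the base tokenizer with a stemmer (an assumed implementation, not the original):

from nltk.stem import PorterStemmer

def create_tokenizer(base_tokenizer, stemmer):
    """Return a callable that tokenizes with base_tokenizer and then stems each token."""
    def tokenize(text):
        return [stemmer.stem(token) for token in base_tokenizer(text)]
    return tokenize

# e.g. build_vectorizer("count", use_stemmer=True) would then count stemmed tokens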
Example #24
def ngrams(year_start=1990, year_end=2016, type = 'Q', side_question=None, side_answer=None, term=None):


    side_question, side_answer = get_sides(side_question, side_answer)

    vectorizer = CountVectorizer(max_features= 100000)

    docs = document_iterator(type=type, side_question=side_question, format='docs_only')
    vectorizer.fit(docs)

    features =  vectorizer.get_feature_names()
    vocabulary = {features[i]:i for i in range(len(features))}

    word_counts = np.zeros(shape=(year_end - year_start + 1, len(vocabulary)), dtype=int)

    docs = document_iterator(type=type, side_question=side_question)
    tokenizer = vectorizer.build_tokenizer()
    for doc in docs:
        year = int(doc[0][:4])
        document = doc[1].lower()
        for token in tokenizer(document):
            word_counts[year-year_start, vocabulary[token]] += 1



    totals = np.sum(word_counts, axis=1)

    word_counts =  word_counts[:,vocabulary[term]]
    word_frequencies = 1.0 * word_counts / totals

    viz_formatting = {'Plaintiff': 's',
                      'Defendant': '^'}

    if type == 'A':
        label = '{} in Answers by {} Witnesses.'.format( term, side_answer)
        viz_format = 'b{}'.format(viz_formatting[side_answer])
    if type == 'Q':
        label = '{} in Questions by {} Lawyers.'.format(term, side_question)
        viz_format = 'r{}'.format(viz_formatting[side_question])

    return {
        'year_start': year_start,
        'year_end': year_end,
        'term': term,
        'word_counts': word_counts,
        'word_frequencies': word_frequencies,
        'label': label,
        'viz_format': viz_format,
        'side_question': side_question,
        'side_answer': side_answer,
        'type': type
    }
Example #25
def tokenize_data_list(document_list):
    nltk.download('stopwords')
    vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
    tokenizer = vectorizer.build_tokenizer()
    document_tokens_list = []

    for i in range(0, len(document_list)):
        text = document_list[i]
        text = text.lower()
        tokens = tokenizer(text)
        document_tokens_list.append(tokens)

    return document_tokens_list
Example #26
def main():
    corpus = [
        'These are the first documents I, Corin Goldberg, work at Google.',
        'This is the second second document.',
        'And the third one.',
        'Is this the first document?',
    ]

    vectorizer = CountVectorizer(
        tokenizer=NamedEntityDetailedTokenizer(LanguageProcessor()))
    analyze = vectorizer.build_tokenizer()

    for cell in corpus:
        print(analyze(cell))
Example #27
class Vectorizer(object):
    def __init__(self):
        self.count_vec = TfidfVectorizer(binary = True,
                                         ngram_range = (1, 3),
                                         tokenizer = Tokenizer())

        self.last_vec = CountVectorizer(binary = True, ngram_range = (1, 1), tokenizer = Tokenizer())


    def collect_last_term(self, X):
        X_last = list()
        tokens = self.last_vec.build_tokenizer()
        _logger.debug("Extracting last term for each sentence")
        for sent in X:
            X_last.append(tokens(sent)[-1])
        _logger.debug("Fitting last-term vectorizer")
        return X_last
        

    def fit(self, X, y = None):
        _logger.debug("Fitting count vectorizer")
        self.count_vec.fit(X)
        X_last = self.collect_last_term(X)
        self.last_vec.fit(X_last)
        return self

    def transform(self, X, y = None):
        #return self.count_vec.transform(X)
        _logger.debug("Doing tfidf transform")
        Xc = self.count_vec.transform(X)

        X_last = self.collect_last_term(X)
        _logger.debug("Doing last term transform")
        Xl = self.last_vec.transform(X_last)
        _logger.debug("stacking features")
        ret = sparse.hstack([Xc, Xl])
        
        tokens = self.count_vec.build_tokenizer()
        l = list()
        for sent in X:
            terms = tokens(sent)
            l.append(1 if  ("__LOCATION__" in terms and "__ORGNIZATION__" in terms) else 0)

        l = np.array(l)
        l.shape = len(l), 1
        ret = sparse.hstack([ret, l])
        _logger.debug("vectorization transform done")

        return ret
Example #28
def parse_file(args: Dict) -> None:

    stemmer = SnowballStemmer("english")
    cv = CountVectorizer()
    tokenizer = cv.build_tokenizer()
    out_file_name = os.path.basename(args["lat_file_path"])
    out_file_path = os.path.join(args["out_dump_paths"]["snbl"], out_file_name)
    logger.info("parsing {}".format(args["lat_file_path"]))
    with open(args["lat_file_path"], "r") as ifp, open(out_file_path,
                                                       "w") as ofp:
        for line in ifp:
            page = json.loads(line)
            page = filter_sections(page)
            page = add_stems(page, stemmer, tokenizer)
            ofp.write("{}\n".format(json.dumps(page)))
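filter_sections and add_stems are project helpers that are not shown here; a hedged sketch of what add_stems might do with the stemmer and tokenizer it receives (the page schema is assumed, purely for illustration):

def add_stems(page: dict, stemmer, tokenizer) -> dict:
    """Attach a stemmed-token view of each text section to the page dict (assumed schema)."""
    for section in page.get("sections", []):
        tokens = tokenizer(section.get("text", ""))
        section["stems"] = [stemmer.stem(tok) for tok in tokens]
    return page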
Example #29
def main():
    cats = ['alt.atheism', 'soc.religion.christian']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
    vectorizer = CountVectorizer(lowercase=False)
    train_vectors = vectorizer.fit_transform(newsgroups_train.data)
    test_vectors = vectorizer.transform(newsgroups_test.data)
    terms = np.array(list(vectorizer.vocabulary_.keys()))
    data = newsgroups_train.data
    train_labels = newsgroups_train.target
    test_labels = newsgroups_test.target
    terms = np.array(list(vectorizer.vocabulary_.keys()))
    indices = np.array(list(vectorizer.vocabulary_.values()))
    inverse_vocabulary = terms[np.argsort(indices)]
    classifier = linear_model.LogisticRegression(fit_intercept=False)
    classifier.fit(train_vectors, train_labels)
    tokenizer = vectorizer.build_tokenizer()
    doc_file = open('docs.json', 'w')
    jsonz = {}
    jsonz['docs'] = []
    for i, doc in enumerate(newsgroups_test.data):
        temp = {}
        temp['text'] = ' \n '.join(
            map(lambda x: ' '.join(tokenizer(x)), doc.split('\n'))).split(' ')
        temp['true_class'] = int(test_labels[i])
        temp['prediction'] = round(
            classifier.predict_proba(test_vectors[i])[0][1], 2)
        jsonz['docs'].append(temp)
    #jsonz['docs'] = sorted(jsonz['docs'], key= lambda x:abs(x['true_class'] - x['prediction']), reverse=True)
    ww = {}
    for word, weight in zip(inverse_vocabulary, classifier.coef_[0]):
        ww[word] = weight
    jsonz['weights'] = ww
    jsonz['accuracy'] = round(
        metrics.accuracy_score(newsgroups_test.target,
                               classifier.predict(test_vectors)), 3)
    jsonz['feature_statistics'] = {}
    train_v = train_vectors.toarray()
    for word, index in vectorizer.vocabulary_.items():
        nz = train_v[:, index].nonzero()[0]
        prob = float(len(nz)) / train_labels.shape[0]
        if prob > .01:
            jsonz['feature_statistics'][word] = {}
            jsonz['feature_statistics'][word]['freq'] = round(prob, 2)
            jsonz['feature_statistics'][word]['distribution'] = round(
                np.mean(train_labels[nz]), 2)

    json.dump(jsonz, doc_file)
Example #30
def textToTokens(text):
    """Converts input string to a corpus of tokenized sentences.
    Assumes that the sentences are divided by newlines (but will ignore empty sentences).
    You can use this to try out your own datasets, but it is not needed for reading the homework data.
    """
    corpus = []
    sents = text.split("\n")
    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    count_vect.fit(sents)
    tokenizer = count_vect.build_tokenizer()
    for s in sents:
        toks = tokenizer(s)
        if len(toks) > 0:
            corpus.append(toks)
    return corpus
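A quick usage sketch:

corpus = textToTokens("The quick brown fox\n\njumps over the lazy dog")
print(corpus)
# [['The', 'quick', 'brown', 'fox'], ['jumps', 'over', 'the', 'lazy', 'dog']]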
Example #31
def getDatas(dataset_dir_name):
    movie_reviews = load_files(dataset_dir_name)

    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)

    # word_tokenizer splits English documents on word boundaries, turning each document into a term vector for the term-frequency matrix; Chinese text needs dedicated word segmentation instead (getChList below)
    vectorizer = CountVectorizer(binary = True, decode_error = u'ignore')
    word_tokenizer = vectorizer.build_tokenizer()


    # each document becomes a list of words
    doc_terms_list_train = list(getChList(doc_str) for doc_str in doc_str_list_train)
    doc_terms_list_test = list(getChList(doc_str) for doc_str in doc_str_list_test)


    return vectorizer, doc_str_list_train, doc_str_list_test,doc_class_list_train, doc_class_list_test, doc_terms_list_train
Example #32
def get_vectorizer():
    vectorizer = CountVectorizer(
        strip_accents='unicode',
        stop_words='english',
        ngram_range=(1, 1),  #default (1,1)
        token_pattern=r"(?u)\b(\d*[a-zA-Z]+\d*)+\b",  #default r"(?u)\b\w\w+\b"
        min_df=3)
    better_tokenizer = build_lemmatizing_tokenizer(
        vectorizer.build_tokenizer())
    better_vectorizer = CountVectorizer(
        tokenizer=better_tokenizer,
        strip_accents='unicode',
        stop_words='english',
        ngram_range=(1, 1),  #default (1,1)
        # token_pattern=r"(?u)\b[A-Za-z_][A-Za-z_]+\b", #tokenizer builtin
        min_df=3)
    return better_vectorizer
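build_lemmatizing_tokenizer is not included in the snippet; a hedged sketch of the kind of wrapper it suggests (an assumed implementation based on NLTK's WordNet lemmatizer):

from nltk.stem import WordNetLemmatizer

def build_lemmatizing_tokenizer(base_tokenizer):
    """Wrap a tokenizer so every token is lemmatized before counting."""
    lemmatizer = WordNetLemmatizer()  # requires the NLTK wordnet corpus
    def tokenize(text):
        return [lemmatizer.lemmatize(token) for token in base_tokenizer(text)]
    return tokenize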
Example #33
 def tokenize(self,
              pattern=u'(?u)\\b\\w+\\b',
              tokenizer_function=TreebankWordTokenizer().tokenize):
     vec = CountVectorizer(token_pattern=pattern,
                           tokenizer=tokenizer_function)
     vec.fit(self.corpus)
     toker = vec.build_tokenizer()
     tokenized_corpus = [toker(doc) for doc in self.corpus]
     tokenized_corpus = [[word.lower() for word in sent]
                         for sent in tokenized_corpus]
     self.vocabulary = vec.vocabulary_
     self.word_frequencies = freq_dist(tokenized_corpus, self.vocabulary)
     self.tokens = tokenized_corpus
     self.token_integers = [
         to_integer(sent, self.vocabulary) for sent in self.tokens
     ]
     self.token_integers = [np.add(seq, 1) for seq in self.token_integers]  # shift indices by one so 0 can serve as padding
Example #34
class Featurizer:
    def __init__(self):
        self.vectorizer = CountVectorizer(stop_words="english")
        self.filename = "feature.txt"

    def train_feature(self, examples):
        d_new = defaultdict(int)
        
        """
        d = self.features(''.join(examples))
        with open(self.filename,'wb') as fp:
            pickle.dump(d,fp)
        with open(self.filename,'rb') as fp:
            d = pickle.load(fp)
        fp.close()
        """
        d = self.features(''.join(examples))
        for i,j in d.items():
            if isinstance(i, str):
                d_new.update({i:j})
            else:
                d_new.update({''.join(i):j})
      
        return self.vectorizer.fit_transform(d_new)
    
    def features(self, text):
        d = defaultdict(int)
        tokenize = self.vectorizer.build_tokenizer()
        for ii in tokenize(text):
            d[lancaster_stem(ii)]+=1
        d.update(self.bigram_feature(text))
        return d
    
    def bigram_feature(self, text):
        bigram_colloc_finder = BigramCollocationFinder.from_words(text)
        bigrams = bigram_colloc_finder.nbest(BigramAssocMeasures.chi_sq,100)
        return dict([(bigram, True) for bigram in itertools.chain(text, bigrams)])

    def test_feature(self, examples):
        return self.vectorizer.transform(examples)

    def show_top10(self, classifier, categories):
        feature_names = np.asarray(self.vectorizer.get_feature_names())
        for i, category in enumerate(categories):
            top10 = np.argsort(classifier.coef_[i])[-10:]
            print("%s: %s" % (category, " ".join(feature_names[top10])))
Example #35
def gettags(x):

    st = PorterStemmer()
    x = unicode(x, errors='ignore')
    x = x.lower()
    vectorizer = CountVectorizer(ngram_range=(1, 1),
                                 stop_words=None,
                                 tokenizer=TreebankWordTokenizer().tokenize)
    #^([a-zA-Z]*|\d+|\W)$
    tokenize = vectorizer.build_tokenizer()
    tokenList = tokenize(x)
    tokenList = [token for token in tokenList if re.match('[a-zA-Z]+', token)]
    lmtzr = WordNetLemmatizer()
    #x=[ for i in tokenList]
    #print [i for i in tokenList]
    tags = nltk.pos_tag([lmtzr.lemmatize(i) for i in tokenList])
    return tags
Example #36
def run():
    ''' create a product dictionary based on all tokens in the best buy product corpus '''

    soup = BeautifulSoup(open(constants.BESTBUY_PRODUCT_CORPUS_FILE, 'rb'), 'html.parser')
    vectorizer = CountVectorizer(strip_accents='ascii')
    
    tokenizer = vectorizer.build_tokenizer()
    preprocessor = vectorizer.build_preprocessor()

    tokens = set()

    for item in tokenizer(soup.get_text()):
        tokens.add(preprocessor(item))

    with codecs.open(constants.PERSONAL_WORD_DICTIONARY_FILE, mode='wb', encoding='utf-8') as f:
        for token in tokens:
            f.write(token + '\n')
Example #37
def get_stems(text):
    # initializes snowball stemmer from nltk
    snowball = SnowballStemmer("english")

    # initializes count vectorizer from sklearn, creating a bag of character n-grams (ngram_range could use further testing)
    ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 6))

    # delineates words by empty character " " and converts to list format
    tokenized = (ngram_vectorizer.build_tokenizer()(text))

    # applies snowball stemmer so we can account for grammatical discrepancies
    for i in range(len(tokenized)):
        tokenized[i] = snowball.stem(tokenized[i])

    # convert list back to string and return
    text = (" ".join(tokenized))
    return text
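Rough usage sketch (stem outputs assumed from the standard NLTK Snowball stemmer):

print(get_stems("The runners were running quickly"))
# e.g. 'the runner were run quick'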
Example #38
    def create_feature_matrix_token_counts(self):
        '''
        Create a n by m matrix of n twitter messages with m features representing
        count of preprocessed, stemmed, tokenized words
        :return: n by m feature matrix of n twitter messages and m features (i.e. word tokens)
        '''

        #Create the basic count vectorizer so that we can copy its preprocessor and tokenizer
        basic_vectorizer = CountVectorizer(stop_words='english')
        preprocessor = basic_vectorizer.build_preprocessor()
        tokenizer = basic_vectorizer.build_tokenizer()

        #Create a stemmer for additional processing after preprocessing and tokenizer
        stemmer = EnglishStemmer()

        #Custom analyzer for Count Vectorizer which stems tokens after preprocessing
        def stemming_analyzer(document):

            if self.filter_numbers:
                return [token for token in map(stemmer.stem, tokenizer(preprocessor(document))) if not vec_tools.number_pattern().search(token)]
            else:
                return map(stemmer.stem, tokenizer(preprocessor(document)))

        if self.uni_bi_gram:
            vectorizer = CountVectorizer(stop_words='english', min_df=2, analyzer="char_wb", ngram_range=(1,2))
        else:
            vectorizer = CountVectorizer(stop_words='english', min_df=self.min_df, analyzer=stemming_analyzer)


        all_twitter_msg_text = [t.msg_text for t in self.twitter_messages]
        all_twitter_msg_polarity = [t.polarity for t in self.twitter_messages]

        if self.filter_url_hashtag_username:
            vec_tools.filter_url_username_hashtag(all_twitter_msg_text)

        self.feature_matrix_token_counts = vectorizer.fit_transform(all_twitter_msg_text)

        if self.select_k_best:
            self.feature_matrix_token_counts = SelectKBest(chi2,self.k).fit_transform(self.feature_matrix_token_counts, all_twitter_msg_polarity)
            self.token_feature_names = [i for i in range(self.feature_matrix_token_counts.shape[1])]
            self.amount_of_token_features = self.feature_matrix_token_counts.shape[1]
        else:
            self.token_feature_names = vectorizer.get_feature_names()
            self.amount_of_token_features = len(self.token_feature_names)

        return self.feature_matrix_token_counts
Example #39
def read_texts(tarfname, dname):
    """Read the data from the homework data file.

    Given the location of the data archive file and the name of the
    dataset (one of brown, reuters, or gutenberg), this returns a
    data object containing train, test, and dev data. Each is a list
    of sentences, where each sentence is a sequence of tokens.
    """
    import tarfile
    tar = tarfile.open(tarfname, "r:gz", errors='replace')
    train_mem = tar.getmember(dname + ".train.txt")
    train_txt = unicode(tar.extractfile(train_mem).read(), errors='replace')
    test_mem = tar.getmember(dname + ".test.txt")
    test_txt = unicode(tar.extractfile(test_mem).read(), errors='replace')
    dev_mem = tar.getmember(dname + ".dev.txt")
    dev_txt = unicode(tar.extractfile(dev_mem).read(), errors='replace')

    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer(ngram_range=(2, 2))
    count_vect.fit(train_txt.split(
        "\n"))  # each sentence in the corpus is divided by '\n'
    tokenizer = count_vect.build_tokenizer()

    class Data:
        pass

    data = Data()
    data.train = []
    for s in train_txt.split("\n"):
        toks = tokenizer(s)
        if len(toks) > 0:
            data.train.append(toks)
    data.test = []
    for s in test_txt.split("\n"):
        toks = tokenizer(s)
        if len(toks) > 0:
            data.test.append(toks)
    data.dev = []
    for s in dev_txt.split("\n"):
        toks = tokenizer(s)
        if len(toks) > 0:
            data.dev.append(toks)
    print(dname, " read.", "train:", len(data.train), "dev:", len(data.dev),
          "test:", len(data.test))
    return data
Example #40
def main():
  cats = ['alt.atheism', 'soc.religion.christian']
  newsgroups_train = fetch_20newsgroups(subset='train',categories=cats)
  newsgroups_test = fetch_20newsgroups(subset='test',categories=cats)
  vectorizer = CountVectorizer(lowercase=False)
  train_vectors = vectorizer.fit_transform(newsgroups_train.data)
  test_vectors = vectorizer.transform(newsgroups_test.data)
  terms = np.array(list(vectorizer.vocabulary_.keys()))
  data = newsgroups_train.data
  train_labels = newsgroups_train.target
  test_labels = newsgroups_test.target
  terms = np.array(list(vectorizer.vocabulary_.keys()))
  indices = np.array(list(vectorizer.vocabulary_.values()))
  inverse_vocabulary = terms[np.argsort(indices)]
  classifier = linear_model.LogisticRegression(fit_intercept=False)
  classifier.fit(train_vectors,train_labels)
  tokenizer = vectorizer.build_tokenizer()
  doc_file = open('docs.json',  'w')
  jsonz = {}
  jsonz['docs'] = []
  for i, doc in enumerate(newsgroups_test.data):
    temp = {}
    temp['text'] = ' \n '.join(map(lambda x: ' '.join(tokenizer(x)), doc.split('\n'))).split(' ')
    temp['true_class'] = int(test_labels[i])
    temp['prediction'] = round(classifier.predict_proba(test_vectors[i])[0][1], 2)
    jsonz['docs'].append(temp)
  #jsonz['docs'] = sorted(jsonz['docs'], key= lambda x:abs(x['true_class'] - x['prediction']), reverse=True)
  ww = {}
  for word, weight in zip(inverse_vocabulary, classifier.coef_[0]):
    ww[word] = weight
  jsonz['weights'] = ww
  jsonz['accuracy'] = round(metrics.accuracy_score(newsgroups_test.target, classifier.predict(test_vectors)), 3)
  jsonz['feature_statistics'] = {}
  train_v = train_vectors.toarray()
  for word, index in vectorizer.vocabulary_.items():
    nz = train_v[:,index].nonzero()[0]
    prob = float(len(nz)) / train_labels.shape[0]
    if prob > .01:
      jsonz['feature_statistics'][word] = {}
      jsonz['feature_statistics'][word]['freq'] = round(prob, 2)
      jsonz['feature_statistics'][word]['distribution'] = round(np.mean(train_labels[nz]), 2)

  json.dump(jsonz, doc_file)
Example #41
def dump_reviews():
    download()
    print 'making dataset'
    vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    preprocess = vectorizer.build_preprocessor()
    tokenize = vectorizer.build_tokenizer()
    
    def dumbascii(thing):
        try:
            thing.encode('ascii', 'replace')
            return True
        except UnicodeDecodeError:
            return False

    def words(doc):
        p = preprocess(doc)
        return ' '.join(t.encode('ascii', 'replace') for t in tokenize(p) if dumbascii(t))
    
    doccount = 0
    vocab = set()
   
    with open('reviews.txt', 'w') as fout:
        for topicdir in DIRS:
            with open(os.path.join(topicdir, POSREV), 'r') as f:
                text = f.read()
            for doc in REVREGEX.findall(text):
                w = words(doc)
                print >> fout, 'positive'
                print >> fout, w
                doccount += 1
                vocab.update(w.split(' '))

            with open(os.path.join(topicdir, NEGREV), 'r') as f:
                text = f.read()
            for doc in REVREGEX.findall(text):
                w = words(doc)
                print >> fout, 'negative'
                print >> fout, w
                doccount += 1
                vocab.update(w.split(' '))
    
    print 'Number of documents:', doccount
    print 'Number of unique words:', len(vocab)
Example #42
def processFolder(type, name):
	
	categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
	twenty_train = fetch_20newsgroups(subset=type, shuffle=True, random_state=42, categories=categories, remove=('headers', 'footers', 'quotes'))
	

	#count_vect = CountVectorizer(stop_words="english")
	count_vect = CountVectorizer()
	tokenize = count_vect.build_tokenizer()
	sentences = [tokenize(data) for data in twenty_train.data]
	model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=1, workers=4)

	iter = 0
	dataSetOutFile = open(name, 'wb')	
	for sentence in sentences:
		currVec = numpy.zeros((200))
		minVec = numpy.zeros((200))
		maxVec = numpy.zeros((200))
		minVecSum = numpy.inf
		maxVecSum = 0
		for word in sentence:
			if word in model:
				currVecSum = numpy.sum(model[word])
				if currVecSum < minVecSum:
					minVec = model[word]
				if currVecSum > maxVecSum:
					maxVec = model[word]
				currVec = currVec + model[word]/len(sentence)
		currVec = numpy.append(currVec, minVec)
		currVec = numpy.append(currVec, maxVec)

		labelInner = twenty_train.target[iter]
		#print labelInner
		contentLabel = (currVec, labelInner)
		pickle.dump(contentLabel, dataSetOutFile)
		iter += 1
	dataSetOutFile.close()
	return #(numpy.array(contentOuter), numpy.array(labelOuter))
Example #43
class TextTransformer(object):

    from re import sub

    def __init__(self):

        #from nltk.stem.lancaster import LancasterStemmer
        from sklearn.feature_extraction.text import CountVectorizer

        import enchant

        #self.stemmer = LancasterStemmer()
        self._vectorizer = CountVectorizer(strip_accents='ascii')
        self.tokenizer = self._vectorizer.build_tokenizer()
        self.preprocessor = self._vectorizer.build_preprocessor()
        self.spellchecker = enchant.DictWithPWL("en_US",
            pwl=constants.PERSONAL_WORD_DICTIONARY_FILE)


    def transform_text(self, raw_text):
    
        tokens = []
        for token in self.tokenizer(raw_text):
            clean_token = self.preprocessor(token)
            if not self.spellchecker.check(clean_token):
                corrections = self.spellchecker.suggest(clean_token)
                if len(corrections) > 0:
                    clean_token = corrections[0]

            tokens.append(clean_token)

        return ' '.join(tokens)


    def sub_numbers(self, text):
        from re import sub  # class-level imports are not visible inside methods
        return sub("[0-9]+", " numbr ", text)
Example #44
import sys
import csv
import codecs
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary = True, decode_error = u'ignore', encoding='latin-1', lowercase=True)
word_tokenizer = vectorizer.build_tokenizer()

with codecs.open(sys.argv[1], encoding="latin-1") as cf, codecs.open(sys.argv[2], "w", encoding="latin-1") as of:
    CSV = csv.reader(cf, delimiter=',')
    for i in CSV: 
        try:
            string = word_tokenizer(i[1])
        except:
            continue

        of.write("%s\n" % " ".join(string).lower())
Example #45
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(100, args.fixk)

    fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    try:
        fixk_file = open(fixk_saved, "rb")
        data = pickle.load(fixk_file)
    except IOError:
        data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
        fixk_file = open(fixk_saved, "wb")
        pickle.dump(data, fixk_file)

    # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__


    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    print "\nStudent Classifier: %s" % clf

    #### EXPERT CLASSIFIER

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=.3)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print ("Cheating experiment - use full uncertainty query k words")
    t0 = time.time()
    ### experiment starts
    tx =[]
    tac = []
    tau = []
    for t in range(args.trials):
        trial_accu =[]

        trial_aucs = []

        trial_x_axis = []
        print "*" * 60
        print "Trial: %s" % t

        student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print "\nStudent: %s " % student
        train_indices = []
        train_x = []
        train_y = []
        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False

        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:

            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.fixk[query_index]  # query with k words

            query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]

            ground_truth = pool.target[query_index]
            #labels, spent = expert.label(unlabeled=query, target=ground_truth)
            if iteration == 0: ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth) ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)


            ## add data recent acquired to train
            ## CHANGE: if label is not useful, ignore and do not charge money for it
            useful_answers = np.array([[x, y, z] for x, y, z in zip(query_index, labels, spent) if y is not None])

            # train_indices.extend(query_index)
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  ## train with all the words

                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

                #count for cost
                ### accumulate the cost of the query
                # query_cost = np.array(spent).sum()
                # current_cost += query_cost
                query_cost = useful_answers[:, 2]
                query_cost = np.sum(query_cost)
                current_cost += query_cost

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train(train_x, train_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices),
                                                                                            accu,
                                                                                            auc, query_cost,
                                                                                            current_cost, spent))

            ## the results should be based on the cost of the labeling
            if iteration > 0:   # skip the bootstrap iteration

                student.budget -= query_cost ## Bootstrap doesn't count

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)

                ## partial trial results

                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])
            iteration += 1

        # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
    #end trial loop

    accuracies = extrapolate_trials(tac)
    aucs = extrapolate_trials(tau)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
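
The loop above mixes cost accounting, bootstrap handling, and evaluation, which obscures the underlying pattern: train on the labeled pool, score the unlabeled pool, query the least-confident instances, repeat. The sketch below is a stripped-down illustration of that pattern only; the synthetic data, the 20 rounds, and the step size of 5 are made-up values, not settings from the experiment above.

# Minimal uncertainty-sampling sketch (illustrative settings, synthetic data).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_pool, y_pool = X[:400], y[:400]
X_test, y_test = X[400:], y[400:]

rng = np.random.RandomState(0)
labeled = [int(i) for i in rng.choice(400, 10, replace=False)]   # bootstrap labels
remaining = set(range(400)) - set(labeled)

clf = LogisticRegression()
step_size, rounds = 5, 20
for it in range(rounds):
    clf.fit(X_pool[labeled], y_pool[labeled])
    rem = sorted(remaining)
    unc = 1.0 - clf.predict_proba(X_pool[rem]).max(axis=1)        # least-confident score
    picked = [rem[i] for i in np.argsort(unc)[::-1][:step_size]]  # most uncertain first
    labeled.extend(picked)                                        # the "expert" is the true label here
    remaining.difference_update(picked)
    auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print("round=%d labeled=%d AUC=%.3f" % (it, len(labeled), auc))
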
Example #46
0
 '''
 clf_7 = Pipeline([
     ('vect', TfidfVectorizer(
                 stop_words=stop_words,
                 token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",         
     )),
     ('clf', MultinomialNB(alpha=0.01)),
 ]) 
 
 evaluate_cross_validation(clf_7, news.data, news.target, 5)
 '''
 
 
 
 from sklearn.feature_extraction.text import TfidfTransformer
 transformer = TfidfTransformer()
 
 def my_tokenizer(s):
     return s.split()
 vectorizer = CountVectorizer(tokenizer=my_tokenizer)
 text = 'I am sure some bashers of Pens fans are pretty confused about the lack'
 print vectorizer.build_analyzer()(text)
 print vectorizer.build_tokenizer()(text)
 print vectorizer.build_preprocessor()(text)
 
 s1 = 'rạng sáng nay theo giờ hà_nội danh_hiệu cầu_thủ giá_trị mvp giải mls năm được công_bố tiền_đạo gốc việt_lee_nguyễn ứng_viên sáng_giá không kém đôi ngôi_sao đá giải ngoại_hạng robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders bình_chọn dựa số phiếu clb dự mls giới truyền_thông cầu_thủ robbie_keane người số phiếu trận chung_kết mls cup robbie_keane los_angeles_galaxy giành danh_hiệu cầu_thủ giá_trị mls lee_nguyễn được đánh_giá cao bình_chọn ảnh espn lee_nguyễn xếp thứ_ba bình_chọn đạt tổng_số phiếu mùa lee_nguyễn ghi bàn năm pha kiến_tạo cuối giải thi_đấu ấn_tượng vai_trò cầm_trịch lối chơi ghi_bàn cho new_england_revolution vòng play off mls cup tiền vệ_sinh năm ghi thêm hai bàn ba pha kiến_tạo đưa revolution đoạt vô_địch mls khu_vực miền đông giành vé dự chung_kết mls cup đối_đầu đội bóng keane la galaxy tháng lee_nguyễn được hlv jurgen_klinsmann triệu_tập trở_lại tuyển mỹ nhờ phong_độ ấn_tượng mls cựu inter_milan newcastle_utd obafemi_martins đứng thứ_hai số phiếu bầu cầu_thủ clb phiếu bầu clb phiếu bầu truyền thông phiếu bầu cầu thủ tổng robbie_keane la galaxy obafemi_martins seattle_sounders lee_nguyễn new england rev bradley_wright phillips ny  red_bulls tuấn'
 s2 = 'lee_nguyễn trải một năm thi_đấu hoàn_hảo ảnh usa today kết_quả được công_bố trang thông_tin chính_thức ban tổ_chức giải mls phần bình_luận tiền_vệ công lee_nguyễn đoạn lọt danh_sách bầu_chọn cuối_cùng cho danh_hiệu cầu_thủ giá_trị mls cho thấy lee_nguyễn một bước đột_phá sự_nghiệp nơi đanh ghi bàn đứng thứ_tư danh_sách vua_phá_lưới mùa vừa_qua tiền_vệ ghi_bàn cao lịch_sử mls chân chuyền đứng thứ_hai new_england năm pha kiến_tạo thành_công lee_nguyễn hoàn_toàn xứng_đáng lần đầu_tiên được lọt vào đội_hình tiêu_biểu mùa pha lập_công kiến_tạo lối chơi sáng_tạo ổn_định lee_nguyễn góp_phần quan_trọng làm_nên mùa giải thành_công rực_rỡ new_england_revolution họ nhì mls miền đông khi đăng_quang mls cup khu_vực đồng_nghĩa một suất vào chung_kết mls cup toàn_quốc nhờ lọt vào danh_sách rút_gọn cuối_cùng cho đua cầu_thủ giá_trị mvp robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders bàn thắng gỡ hòa 1-1 vào lưới houston_dynamo tuần ngôi_sao sinh năm lọt danh_sách bốn bàn thắng đẹp mls sau bảy năm được gọi trở_lại đội_tuyển mỹ đội_hình tiêu_biểu mùa vừa_qua los_angles_galaxy đóng_góp nhiều ba cá_nhân chia đều hàng thủ đến hàng công đội bóng đối_thủ cạnh_tranh vô_địch mls cup lee_nguyễn revolution sân stubhub_center california ngày tới đội_hình tiêu_biểu mls mùa thủ_môn bill_hamid dc united hậu_vệ bobby_boswell dc united omar_gonzalez los_angeles_galaxy chad_marshall seattle_sounders tiền_vệ landon_donovan los_angeles_galaxy thierry_henry new_york_red_bulls lee_nguyễn new_england_revolution diego_valeri portland_timbers tiền_đạo robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders fc bradley_wright phillips new_york_red_bulls đông_anh'
 s3 = 'thành_lương đỏ làm_nên tuyệt_phẩm trận đấu cuối_cùng bảng philippines ảnh giang_huy malaysia tập_trung hôm_qua để chuẩn_bị cho trận đấu tuyển việt_nam ngày sân_nhà shah_alam sau khi lách khe cửa hẹp để giành vị_trí thứ_hai bảng tay đội singapore thầy_trò salleh háo_hức muốn được kết_quả thật tốt một lời xin_lỗi để cđv nhà thất_vọng thời_gian gì phát_biểu có_thể thấy salleh nghiên_cứu kỹ báo_cáo hlv_u2 ong_kim_swee người được liên_đoàn bóng_đá malaysia fam cử sang hà_nội theo_dõi đối_thủ bảng trọng_tâm tuyển việt_nam đá giao_hữu tuyển việt_nam giải đấu nên phần_nào biết làm gì để kiềm_chế sức_mạnh họ salleh tiết_lộ báo_giới malaysia chúng tô đặc_biệt cẩn_trọng số nguyễn_văn_quyết số phạm_thành_lương cầu_thủ nguy_hiểm ong_kim_swee cho biết như_thế cầu_thủ văn_quyết đỏ chưa ghi_bàn được đối_thủ đánh_giá cao lối chơi ảnh giang_huy cá_nhân ong_kim_swee đưa nhận_xét tuyển việt_nam sau một thời_gian do_thám đội bóng xây_dựng được một phong_cách hoàn_toàn khác_biệt thời hlv người nhật_bản_toshiya_miura họ cầm bóng tốt không_bao_giờ chuyền bóng ngược sau luôn hướng lên phía miura sở_hữu cầu_thủ kỹ_thuật cá_nhân tốt malaysia cảnh_giác mỗi khi đối_phương bóng sát vòng cấm_địa việt_nam ghi hai bàn vào lưới philippines cú sút xa khi được hỏi điểm yếu tuyển việt_nam ong_kim_swee người giúp u23 malaysia vô_địch sea games tỏ bí_hiểm gì thấy một tập_thể gắn_kết mỗi vị_trí đều điểm yếu họ để thủng lưới ba lần điểm yếu có_thể tận_dụng khai_thác hlv salleh đen âm_thầm chuẩn_bị kế_hoạch gây bất_ngờ tuyển việt_nam sân_nhà ảnh ts bên_cạnh việc tìm cách phong_tỏa hai ngòi_nổ tuyển việt_nam salleh cố_gắng giải_quyết khoảng_trống shukor_adan mohd_amri_yahya để hai cầu_thủ trụ_cột đều vắng_mặt trận lượt_đi án treo_giò indra_putra_mahyuddin kunanlan manaf_mamat đều có_thể được tung vào sân_sau khi minh_chứng được khả_năng buổi tập safiq_rahim mohd_muslim có_thể đá vị_trí tiền_vệ trụ thay_thế cho shukor_adan salleh tiết_lộ ít_nhiều khung đội_hình thi_đấu cuối tuần người thay_thế amri_yahya trận đấu kulanan hoặc manaf_mamat tuấn'
 corpus = [s1, s2, s3]
 
 
 print 'Done'
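
One thing Example #46 relies on but never shows explicitly: when tokenizer=my_tokenizer is passed, build_tokenizer() returns that callable and the default token_pattern is ignored, while build_preprocessor() still lowercases. The comparison below is a side sketch on a made-up string, not part of the original example:

from sklearn.feature_extraction.text import CountVectorizer

sample = 'Rang sang nay theo gio ha_noi MVP a 2014'   # made-up, word-segmented style

default_vec = CountVectorizer()                                # default token_pattern: 2+ word chars
custom_vec = CountVectorizer(tokenizer=lambda s: s.split())    # whitespace split, keeps everything

print(default_vec.build_analyzer()(sample))   # lowercased, single-char token 'a' dropped
print(custom_vec.build_analyzer()(sample))    # lowercased by the preprocessor, 'a' kept
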
Example #47
0
"""
Created on Sat Oct  6 22:00:19 2012
"""
from common import *
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import dok_matrix, csr_matrix, lil_matrix
import cPickle

import nltk

TRAIN_FILE = DATA_PATH + "train.csv"
TEST_FILE = DATA_PATH + "public_leaderboard.csv"
STOP_WORDS = set(nltk.corpus.stopwords.words('english'))

vectorizer = CountVectorizer(stop_words="english")
tok = vectorizer.build_tokenizer()

def exclude_code(body_text):
    return '\n'.join([line for line in body_text.split('\n') if not line.startswith("    ")])

once_set = set()
words_set = set()

print "building words set"
for i, question in enumerate(get_reader(TRAIN_FILE)):
    title = question[6]
    body = exclude_code(question[7])
    #tags = [question[i] for i in [8,9,10,11,12] if type(question[i]) is str]
    words = tok('. '.join([title, body]).lower())
    for word in (w for w in words if w not in STOP_WORDS):
        if word in once_set:
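
Example #47 is cut off right at the membership test. The once_set / words_set pair is a common idiom for keeping only words seen at least twice; the completion below is an assumption about the author's intent, shown on toy data, not the original code:

# Hypothetical completion of the once_set / words_set idiom (keep words seen >= 2 times).
once_set = set()
words_set = set()
for word in ['cat', 'dog', 'cat', 'bird', 'dog', 'cat']:
    if word in once_set:
        words_set.add(word)    # already seen once -> keep it
    else:
        once_set.add(word)     # first sighting
print(sorted(words_set))       # ['cat', 'dog']
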
Example #48
0
#print len(test_data)
#test_data = df1.iloc[:,1]
vctr = CountVectorizer(stop_words='english', min_df=1)
vctr2 = HashingVectorizer(stop_words='english')
vctr1 = TfidfVectorizer(stop_words='english')
count_pos = 0
count_neg = 0

######################################################################################################
train = []
test = []
for i in range(len(train_data)):
    string = train_data[i,0]
    #print string,i
    string = vctr.build_preprocessor()(string.lower()) 
    string = vctr.build_tokenizer()(string.lower())
    train.append(' '.join(string))

for i in range(len(test_data)):
    string = test_data[i,0]
    string = vctr.build_preprocessor()(string.lower()) 
    string = vctr.build_tokenizer()(string.lower())
    test.append(' '.join(string)) 
#print "len of the normalized test data obtained"    
#print len(test)  
######################################################################################################
train_data = vctr.fit_transform(train).toarray()
#print vctr1.inverse_transform(train_data)
y_train = np.asarray(label_train, dtype="|S6")
clf1 =   GradientBoostingClassifier(n_estimators = 660)
clf2 =   AdaBoostClassifier(n_estimators = 660)
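
The normalization pass in Example #48 runs build_preprocessor() and build_tokenizer() by hand and then re-joins the tokens before calling fit_transform, which is essentially the work the vectorizer's analyzer does internally anyway. The check below, on made-up sentences, shows the two routes producing the same vocabulary; treat it as a sketch, since unusual token patterns or preprocessors could behave differently:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["The Cat sat on the mat!", "Dogs and cats, CATS and dogs."]

base = CountVectorizer(stop_words='english', min_df=1)
pre, tok = base.build_preprocessor(), base.build_tokenizer()

# route 1: normalize by hand, then vectorize the re-joined tokens
normalized = [' '.join(tok(pre(d))) for d in docs]
vocab_manual = CountVectorizer(stop_words='english', min_df=1).fit(normalized).vocabulary_

# route 2: let fit() apply the same preprocessing and tokenization internally
vocab_direct = CountVectorizer(stop_words='english', min_df=1).fit(docs).vocabulary_

print(vocab_manual == vocab_direct)   # True for these inputs
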
Example #49
0
def main():
  parser = argparse.ArgumentParser(description='Visualize some stuff')
  parser.add_argument('-json', '-j', type=str, help='generate json file')
  parser.add_argument('-loadjson', '-l', type=str, help='load json file')
  parser.add_argument('-dataset', '-d', type=str, help='2ng for Christianity vs Atheism, 3ng for Windows misc, IBM hardware and Windows X', default='2ng')
  parser.add_argument('-classifier', '-c', type=str, help='logistic for logistic regression, svm for svm', default='logistic')
  args = parser.parse_args()
  global train_vectors, train_labels, test_vectors, test_labels, classifier, tokenizer, parsed_train, parsed_test, json_map, class_names, inverse_vocabulary
  train_data, train_labels, test_data, test_labels, class_names = LoadDataset(args.dataset)
  dataset_json = {'2ng' : '2ng.json', '3ng':'3ng.json', 'r8': 'r8.json', 'r52':'r52.json', 'webkb' : 'webkb.json'}
  vectorizer = CountVectorizer(lowercase=False)
  if args.classifier == 'logistic':
    classifier = linear_model.LogisticRegression(fit_intercept=True)
  elif args.classifier == 'svm':
    classifier = svm.SVC(probability=True)
  else:
    print 'ERROR: classifier must be logistic or svm'
    quit()
  print 'Starting... done'
  train_vectors = vectorizer.fit_transform(train_data)
  test_vectors = vectorizer.transform(test_data)
  print 'Fitting classifier...',
  classifier.fit(train_vectors, train_labels)
  print 'done'
  terms = np.array(list(vectorizer.vocabulary_.keys()))
  indices = np.array(list(vectorizer.vocabulary_.values()))
  inverse_vocabulary = terms[np.argsort(indices)]
  tokenizer = vectorizer.build_tokenizer()
  print 'parsing train, test...',
  parsed_train = GetParsedDocuments(train_data, tokenizer)
  parsed_test = GetParsedDocuments(test_data, tokenizer)
  print 'done'
  print 'generating (or loading) json...',
  json_map = {}
  if args.loadjson:
    json_map = json.load(open(args.loadjson))
  else:  
    json_map = GenerateJSON(class_names, train_data, train_vectors, train_labels, test_data, test_vectors, test_labels, classifier, vectorizer)
  print 'done'
  if args.json:
    json.dump(json_map, open(args.json, 'w'))
  else:
    @route('/get_json', method=['OPTIONS', 'POST', 'GET'])
    @enable_cors
    def json_fun():
      global json_map
      return json_map
    @route('/predict', method=['OPTIONS', 'POST', 'GET'])
    @enable_cors
    def predict_fun():
        global train_vectors, train_labels, test_vectors, test_labels, classifier, tokenizer, parsed_train, parsed_test, json_map, class_names, inverse_vocabulary
        #print request.json
        ret = {}
        ex = ''
        if request.json['features']:
          ex = ' '.join(request.json['features'])
        sentence_explanation = request.json['sentence_explanation']
        v = vectorizer.transform([ex])
        #print 'Example:', ex
        #print 'Pred:'
        #print classifier.predict_proba(v)[0]
        ret['predict_proba'] = ListifyVector(classifier.predict_proba(v)[0])
        print ret['predict_proba']
        ret['prediction'] = classifier.predict(v)[0]
        #ret['feature_weights'] = WordImportance(classifier, v, inverse_vocabulary)
        if sentence_explanation:
          ret['feature_weights'] = WordImportanceSentenceGreedy(classifier, ex, vectorizer, inverse_vocabulary)
        else:
          ret['feature_weights'] = WordImportanceGreedy(classifier, ex, vectorizer, inverse_vocabulary)
        make_map = lambda x:{'feature':x[0], 'weight' : x[1]['weight'], 'class': x[1]['class']}
        ret['sorted_weights'] = map(make_map, sorted(ret['feature_weights'].iteritems(), key=lambda x:x[1]['weight'], reverse=True))
        return ret
    @route('/regex', method=['OPTIONS', 'POST', 'GET'])
    @enable_cors
    def regex_fun():
        global train_vectors, train_labels, test_vectors, test_labels, classifier, tokenizer, parsed_train, parsed_test, json_map, class_names, inverse_vocabulary
        ret = {}
        ex = ''
        print request.json
        if request.json['regex']:
          regex = re.sub(r'\\\\', r'\\', request.json['regex'])
        reg = re.compile(regex, re.DOTALL | re.MULTILINE)
        ret['train'] = {}
        for i, doc in enumerate(parsed_train):
          print i
          iterator = reg.finditer(doc)
          for m in iterator:
            if i not in ret['train']:
              ret['train'][i] = []
            ret['train'][i].append(m.span())
        ret['test'] = {}
        for i, doc in enumerate(parsed_test):
          print i
          iterator = reg.finditer(doc)
          for m in iterator:
            if i not in ret['test']:
              ret['test'][i] = []
            ret['test'][i].append(m.span())
        print 'Regex', regex
        return ret
    @route('/run_regex', method=['OPTIONS', 'POST', 'GET'])
    @enable_cors
    def regex_run():
        global train_vectors, train_labels, test_vectors, test_labels, classifier, tokenizer, parsed_train, parsed_test, json_map, class_names, inverse_vocabulary
        ex = ''
        print request.json
        regex_terms = set()
        if request.json['regex']:
          regexes = [(re.compile(re.sub(r'\\\\', r'\\', x.split('/')[1]), re.DOTALL | re.MULTILINE), x.split('/')[2]) for x in request.json['regex']]
        else:
          return json_map
        temp = []
        print 'Applying to train'
        for doc in parsed_train:
          d = doc
          for reg in regexes:
            instances = reg[0].findall(d)
            for instance in instances:
              map(lambda x: regex_terms.add(x), tokenizer(instance))
            d = re.sub(reg[0], reg[1], d)
          temp.append(d.strip(' '))
        parsed_train = temp
        temp = []
        print 'Applying to test'
        for doc in parsed_test:
          d = doc
          for reg in regexes:
            instances = reg[0].findall(d)
            for instance in instances:
              map(lambda x: regex_terms.add(x), tokenizer(instance))
            d = re.sub(reg[0], reg[1],d)
          temp.append(d.strip(' '))
        parsed_test = temp
        if len(regex_terms) > 100:
          regex_terms = set(vectorizer.vocabulary_.keys())
        # TODO: this could be much more efficient if I use a trie
        else:
          to_add = set()
          for w, i in vectorizer.vocabulary_.iteritems():
            for z in regex_terms:
              if w.startswith(z) or w.endswith(z):
                to_add.add(w)
          regex_terms = regex_terms.union(to_add)

        train_vectors = vectorizer.fit_transform(parsed_train)
        test_vectors = vectorizer.transform(parsed_test)
        classifier.fit(train_vectors, train_labels)
        terms = np.array(list(vectorizer.vocabulary_.keys()))
        indices = np.array(list(vectorizer.vocabulary_.values()))
        inverse_vocabulary = terms[np.argsort(indices)]
        # TODO: redoing some work here
        tokenizer = vectorizer.build_tokenizer()
        if request.json['regex']:
          for r in request.json['regex']:
            map(lambda x: regex_terms.add(x), tokenizer(r.split('/')[2]))
        print regex_terms
        print 'Updating Json'
        UpdateJSON(json_map, regex_terms, class_names, parsed_train, train_vectors, train_labels, parsed_test, test_vectors, test_labels, classifier, vectorizer)
        print 'Returning'
        return json_map
    @route('/')
    def root_fun():
        return template('template')
    @route('/<filename>')
    def server_static(filename):
        return static_file(filename, root='./static/')
    run(host='localhost', port=8870, debug=True)
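
The inverse_vocabulary construction in Example #49 (sorting the vocabulary terms by their column indices) recovers the column-index-to-term mapping, which is the same ordering get_feature_names() returns on a fitted vectorizer. A tiny stand-alone check on toy documents:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(lowercase=False)
vectorizer.fit(['red green blue', 'green blue blue'])

terms = np.array(list(vectorizer.vocabulary_.keys()))
indices = np.array(list(vectorizer.vocabulary_.values()))
inverse_vocabulary = terms[np.argsort(indices)]

print(list(inverse_vocabulary))         # column index -> term
print(vectorizer.get_feature_names())   # same ordering (get_feature_names_out() in newer scikit-learn)
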
Example #50
0
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    # The CountVectorizer below is immediately replaced by the TfidfVectorizer; kept here for reference.
    # vct = CountVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=True, ngram_range=(1, 1),
    #                       token_pattern='\\b\\w+\\b')#, tokenizer=StemTokenizer())

    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b')#, tokenizer=StemTokenizer())


    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10 # max(10, args.fixk)

    # if args.fixk < 0:
    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## convert <br> to "." so it is recognized as an end of sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)

    labels, sent_train = split_data_sentences(data.train, sent_detector)

    data.train.data = sent_train
    data.train.target = np.array(labels)

    labels, sent_train = split_data_sentences(data.test, sent_detector)
    data.test.data = sent_train
    data.test.target = np.array(labels)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)


    #### EXPERT CLASSIFIER

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert


    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    # clf = set_classifier(args.classifier)

    student = structured.AALStructured(model=clf, accuracy_model=None, budget=args.budget, seed=args.seed, vcn=vct,
                                       subpool=250, cost_model=cost_model)
    student.set_score_model(exp_clf)



    print "\nStudent Classifier: %s" % clf


    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print ("Sentence Classification")
    t0 = time.time()

    # prediction = exp_clf.predict(data.train.bow)

    predictions = exp_clf.predict_proba(data.train.bow)
    unc = np.min(predictions, axis=1)

    coef = exp_clf.coef_[0]
    dc = diags(coef, 0)
    # ind = np.argsort(coef)
    fn = np.array(vct.get_feature_names())
    # print fn[ind[:10]]
    # print fn[ind[-10:]]
    print "Features:%s " % len(fn)
    auc = metrics.roc_auc_score(data.train.target, predictions[:, 1])

    pred_y = exp_clf.classes_[np.argmax(predictions, axis=1)]

    accu = metrics.accuracy_score(data.train.target, pred_y)

    most_post = np.argsort(predictions[:, 0])

    print
    print "\n".join('%s/%.2f' % (fn[j], coef[j]) for j in np.argsort(coef)[::-1] if coef[j] != 0)
    print"*"*80
    print("AUC:{}, Accu:{}".format(auc, accu))
    print ("Size of predictions {} - {}".format(most_post.shape[0], predictions.shape[0]))



    # print "*"*50
    # print "Positive"
    # for d in most_post[:10]:
    #     print d,
    #     print predictions[d],
    #     print data.train.target[d],
    #     print data.train.data[d]
    #
    #     mm = data.train.bow[d] * dc  # sentences feature vectors \times diagonal of coeficients. sentences by features
    #     print "\n".join("%.3f / %s" % (v, n) for v, f, n in zip(mm.A[0], data.train.bow[d].A[0,:], fn) if f > 0)
    #     print "-"*20
    #
    # print "*"*50
    # print "Negative"
    # for d in most_post[-10:]:
    #     print d,
    #     print predictions[d],
    #     print data.train.target[d],
    #     print data.train.data[d]
    #     mm = data.train.bow[d] * dc  # sentences feature vectors \times diagonal of coeficients. sentences by features
    #     # print mm[mm > 0]
    #
    #     print "\n".join("%.3f / %s" % (v, n) for v, f, n in zip(mm.A[0], data.train.bow[d].A[0,:], fn) if f > 0)
    #     print "-"*20
    #
    # print "*"*50
    # print "Middle"
    # m = len(most_post) / 2
    # for d in most_post[m-50:m+50]:
    #     print d,
    #     print predictions[d],
    #     print data.train.target[d],
    #     print data.train.data[d]
    #     mm = data.train.bow[d] * dc  # sentences feature vectors \times diagonal of coeficients. sentences by features
    #     # print mm[mm > 0]
    #
    #     print "\n".join("%.3f / %s" % (v, n) for v, f, n in zip(mm.A[0], data.train.bow[d].A[0,:], fn) if f > 0)
    #     print "-"*20



    print("Elapsed time %.3f" % (time.time() - t0))
Example #51
0
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(100, args.fixk)

    if args.fixk < 0:
        args.fixk = None

    fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    try:
        print "Loading existing file... %s " % args.train
        fixk_file = open(fixk_saved, "rb")
        data = pickle.load(fixk_file)
        fixk_file.close()
        vectorizer = open("{0}vectorizer.p".format(args.train), "rb")
        vct = pickle.load(vectorizer)
        vectorizer.close()
    except (IOError, ValueError):
        print "Loading from scratch..."
        data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
        fixk_file = open(fixk_saved, "wb")
        pickle.dump(data, fixk_file)
        fixk_file.close()
        vectorizer = open("{0}vectorizer.p".format(args.train), "wb")
        pickle.dump(vct, vectorizer)
        vectorizer.close()

    # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    # clf = set_classifier(args.classifier)
    print "\nStudent Classifier: %s" % clf

    #### EXPERT CLASSIFIER

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print ("Anytime active learning experiment - use objective function to pick data")
    t0 = time.time()
    tac = []
    tau = []
    ### experiment starts
    for t in range(args.trials):
        trial_accu = []

        trial_aucs = []

        print "*" * 60
        print "Trial: %s" % t
        if args.student in "anyunc":
            student = randomsampling.AnytimeLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                    subpool=250, cost_model=cost_model)
        elif args.student in "lambda":
            student = randomsampling.AnytimeLearnerDiff(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                    subpool=250, cost_model=cost_model, lambda_value=args.lambda_value)
        elif args.student in "anyzero":
            student = randomsampling.AnytimeLearnerZeroUtility(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                    subpool=250, cost_model=cost_model)
        else:
            raise ValueError("Oops! We do not know that anytime strategy. Try again.")

        print "\nStudent: %s " % student
        train_indices = []
        neutral_text = []  # save the raw text of the queries
        neutral_data = []  # save the xik vectors
        train_x = []
        train_y = []
        neu_x = [] # data to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.text = data.train.data
        # pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        # pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False

        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            util = []
            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                query = pool.data[query_index]
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                # print "pick instance"

                ## chose returns: index, k
                ## util returns: utility, k, unc
                query_chosen, util = student.pick_next(pool=pool, step_size=step_size)
                query_index = [a for a, b in query_chosen]
                query_size = [b for a, b in query_chosen]

                # query = pool.fixk[query_index]  # query with k words
                qk = []
                for q, k in query_chosen:
                    qk.append(" ".join(vct_analizer(pool.text[q])[0:int(k)]))
                query = vct.transform(qk)

            # query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]

            ground_truth = pool.target[query_index]
            #labels, spent = expert.label(unlabeled=query, target=ground_truth)
            if iteration == 0: ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth) ## bootstrap cost is ignored
            else:
                # print "ask labels"
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost
            # print query_index
            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])

            # print labels
            # print "label\tutility\tk\tunc"
            # print format_query(zip(labels, util))

            ## add data recent acquired to train
            if useful_answers.shape[0] != 0:
                # print "get training"
                # train_indices.extend(query_index)
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  # # train with all the words

                # update labels with the expert labels
                #train_y = pool.target[train_indices]
                train_y.extend(useful_answers[:, 1])

            if neutral_answers.shape[0] != 0:
                # current query neutrals
                qlbl = []

                for xik, lbl in zip(query, labels):
                    # neutral_data.append(xik)
                    if isinstance(neutral_data, list):
                        neutral_data = xik
                    else:
                        neutral_data = vstack([neutral_data, xik], format='csr')
                    qlbl.append(neutral_label(lbl))

                ## append the labels of the current query
                neu_y = np.append(neu_y, qlbl)
                neu_x = neutral_data
                # end neutral_answers block


            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            # current_model = student.train(train_x, train_y)
            # print "train models"
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)
            # print "evaluate"
            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}\tneu:{6}\t{7}".format(
                len(train_indices),
                accu,
                auc, query_cost,
                current_cost,
                format_spent(spent),
                len(neutral_answers), neu_y.shape[0]))

            ## the results should be based on the cost of the labeling
            if iteration > 0:   # skip the bootstrap iteration

                student.budget -= query_cost ## Bootstrap doesn't count

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        #end trial loop
    if args.cost_function not in "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
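
A side note on the neutral_data bookkeeping in Example #51: growing a csr matrix one row at a time with vstack copies the accumulated matrix on every append. When that becomes a bottleneck, the usual alternative is to collect the rows in a list and stack them once; a minimal sketch with made-up rows:

from scipy.sparse import csr_matrix, vstack

rows = []                                   # collect 1-row sparse matrices as they arrive
for i in range(5):
    rows.append(csr_matrix([[i, 0, i + 1]]))

neutral_data = vstack(rows, format='csr')   # one stacking pass instead of one per append
print(neutral_data.shape)                   # (5, 3)
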