Example no. 1
def get_topic_term_tfidf(topic_texts, min_df=1):
    vector = TfidfVectorizer(ngram_range=(1, 1),
                             stop_words='english',
                             min_df=min_df)
    vector.build_analyzer()
    tfidf = vector.fit_transform(topic_texts)
    return tfidf.toarray().sum(axis=0), vector
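A small usage sketch for the function above (a hypothetical call; it assumes TfidfVectorizer has been imported from sklearn.feature_extraction.text), pairing the summed column weights with the learned vocabulary:

weights, vec = get_topic_term_tfidf(["cats chase dogs", "dogs bark at cats"])
terms = vec.get_feature_names_out()  # vec.get_feature_names() on older scikit-learn
ranked = sorted(zip(terms, weights), key=lambda t: t[1], reverse=True)
print(ranked[:5])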
Example no. 2
def tfidf(wordlist):
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf_dict = {}
    tfidf = TfidfVectorizer(stop_words='english',
                            analyzer='word',
                            ngram_range=(1, 2))
    tfidf.build_analyzer()
    response = tfidf.fit_transform(wordlist)
    feature_names = tfidf.get_feature_names()
    # record the tf-idf weight of every non-zero (document, term) entry
    for row, col in zip(*response.nonzero()):
        tfidf_dict[feature_names[col]] = response[row, col]

    return tfidf_dict
Example no. 3
 def __init__(self, n_features, voc_file):
     self.n_features = n_features
     self.voc_file = voc_file
     self.word_clusters, self.grouped_words = self.read_word_cluster(
         voc_file)
     tfidf = TfidfVectorizer(encoding='iso-8859-1', stop_words='english')
     self.vectorize = tfidf.build_analyzer()
Example no. 4
def get_vocab(texts, rate):
    vectorizer = TfidfVectorizer(min_df=5, stop_words='english')
    features = vectorizer.fit_transform(texts).tocsc()

    vocab = vectorizer.get_feature_names()
    analyzer = vectorizer.build_analyzer()

    df = 1. / np.exp(vectorizer.idf_ - 1) * (len(texts) + 1) - 1

    word_value_list = []
    for i, word in enumerate(vocab):
        assert len(features[:, i].data) == int(round(df[i]))
        word_value_list.append(
            [word,
             np.mean(features[:, i].data),
             len(features[:, i].data)])
    word_value_list.sort(key=lambda t: t[1], reverse=True)

    total = sum([len(analyzer(text)) for text in texts])
    word_counter = {word: 0 for word in vocab}
    for text in texts:
        for word in analyzer(text):
            if word in word_counter:
                word_counter[word] += 1

    cnt = 0
    result_list = []
    for i, (word, _, df) in enumerate(word_value_list):
        result_list.append(word)
        cnt += word_counter[word]
        if cnt / total > rate:
            print(f'{i+1} words take {cnt / total} content.')
            break

    return result_list, analyzer
Example no. 5
def consider_glove():
    textual = TfidfVectorizer()
    tokenizer = textual.build_analyzer()

    def prepare(df: pd.DataFrame,
                fit: bool = False) -> Tuple[np.ndarray, np.ndarray]:
        y = np.array(df.label.values)
        N = len(y)
        D = len(glove["the"])
        X = np.zeros((N, D))
        for i, example in enumerate(df.text):
            count = 0
            for word in tokenizer(example):
                if word in glove:
                    count += 1
                    X[i] += glove[word]
            if count > 0:
                X[i] /= count
        return (y, X)

    y_train, X_train = prepare(train_f, fit=True)
    y_vali, X_vali = prepare(vali_f)
    y_test, X_test = prepare(test_f)

    m = SGDClassifier()
    m.fit(X_train, y_train)
    print("glove-Train-Acc: {:.3}".format(m.score(X_train, y_train)))
    print("glove-Vali-Acc: {:.3}".format(m.score(X_vali, y_vali)))
Example no. 6
def analyze_corpus(images):
    "Preprocess the corpus."
    vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w\\w+\\b')
    documents = [' '.join(descriptions) for descriptions in images]
    vectorizer.fit(documents)
    analyzer = vectorizer.build_analyzer()
    return vectorizer, analyzer
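A hypothetical usage sketch for analyze_corpus, where each image is represented by a list of description strings (TfidfVectorizer imported as above is assumed):

images = [["a red car", "car parked on the road"], ["two dogs playing fetch"]]
vectorizer, analyzer = analyze_corpus(images)
print(analyzer("two dogs playing fetch"))  # tokens of three or more characters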
Example no. 7
def ida(articles):
    stopwords = []
    doc_terms = []
    with open('ch_stopwords.txt', 'r') as f:
        stopwords = set(f.read().lower().split('\n'))

    #print('stopwords', stopwords[:10])

    trigram_vectorizer = TfidfVectorizer(
        ngram_range=(2, 3),
        token_pattern=r'([\u4e00-\u9fa5]{1}|)',
        min_df=10,
        max_df=20,
        stop_words=stopwords,
        analyzer='word')
    analyzer = trigram_vectorizer.build_analyzer()
    '''
	for article in articles:
		terms = map(lambda x: x.replace(' ', ''), analyzer(article.Content)) #get rid of spaces
		#terms = set(map(lambda x: x.replace(' ', ''), analyzer(article.Content))) #get rid of spaces
		#terms = list(terms-stopwords)
		doc_terms.append(list(terms))
	'''
    article_contents = map(lambda x: x.Content, articles)
    doc_terms = trigram_vectorizer.fit_transform(article_contents)
    tf_feature_names = trigram_vectorizer.get_feature_names()
    print(len(tf_feature_names), tf_feature_names[100:200])

    vocab = trigram_vectorizer.vocabulary_

    joblib.dump(vocab, 'lda-vocab.pkl', compress=1)
    #print_top_words(lda, tf_feature_names, 10)

    #print(doc_terms.get_feature_names())
Example no. 8
    def __init__(self,
                 filename,
                 doc_text_header=None,
                 doc_id_header=None,
                 num_phrases=10):
        # init basic variables
        self.time = time.time()
        self.num_phrases = num_phrases
        self.filename_full = filename.split('.')[0]
        self.filename = os.path.basename(filename).split('.')[0]
        self.time_filename = '{}_streaming_time.txt'.format(self.filename)
        self.ngrams = (3, 5)
        self.index_to_docid = Counter()
        self.docid_to_index = Counter()

        self.data = pandas.read_csv(filename, lineterminator='\n')
        self.determine_header_names(doc_text_header, doc_id_header)

        self.num_ads = len(self.data.index)
        self.cluster_graph = nx.Graph()

        # setup tfidf - we want to keep emojis and capitalization
        tfidf = TfidfVectorizer(token_pattern=r'[^\s]+',
                                lowercase=False,
                                ngram_range=self.ngrams,
                                sublinear_tf=True)
        self.tokenizer = tfidf.build_analyzer()
        self.data[self.description] = self.data.apply(lambda r: filter_text(
            '{} {}'.format(r['title'], r[self.description])),
                                                      axis=1)
        self.tfidfs = tfidf.fit_transform(self.data[self.description])
        self.tfidf_indices = tfidf.get_feature_names()
        print('done with tfidf', time.time() - self.time)
Example no. 9
	def build_analyzer(self):
		analyzer = TfidfVectorizer.build_analyzer(self)
		NoPunctuation = lambda q: ''.join([x for x in q if x not in punctuation])

		def innerFx(sentence):
			sentence = NoPunctuation(sentence)
			return analyzer(sentence)
		return innerFx
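A minimal self-contained sketch of how an override like the one above is typically wired up, assuming the method lives in a TfidfVectorizer subclass and punctuation comes from the standard string module:

from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer

class NoPunctuationTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        # Reuse the stock analyzer, but strip punctuation characters first.
        analyzer = super().build_analyzer()
        strip = lambda text: ''.join(ch for ch in text if ch not in punctuation)
        return lambda doc: analyzer(strip(doc))

vec = NoPunctuationTfidfVectorizer()
X = vec.fit_transform(["Hello, world!", "No punctuation; just words."])
print(vec.get_feature_names_out())  # vec.get_feature_names() on older scikit-learn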
Example no. 10
def tfidf_vector():
    # feature extraction, method 3
    tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    tfidf_train_2 = tv.fit_transform(newsgroup_train.data)
    tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
    tfidf_test_2 = tv2.fit_transform(newsgroups_test.data)
    print("the shape of train is " + repr(tfidf_train_2.shape))
    print("the shape of test is " + repr(tfidf_test_2.shape))
    analyze = tv.build_analyzer()
    tv.get_feature_names()
    return tfidf_train_2, tfidf_test_2
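The pattern above (also used in the method3 and tfidf_vectorize examples further down) fits a second vectorizer on the test data with the training vocabulary, which recomputes IDF weights on the test set; a minimal alternative sketch that keeps the fitted training weights, assuming scikit-learn's bundled 20 newsgroups loader, is:

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
tfidf_train = tv.fit_transform(train.data)  # learn vocabulary and IDF on the training text only
tfidf_test = tv.transform(test.data)        # reuse them for the test set
print(tfidf_train.shape, tfidf_test.shape)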
Example no. 11
def learn_vocabulary(docs, only_noun_phrases=True):
    first_occurrence_all = []
    entropy_all = []
    #docs = [doc.decode('utf8', 'ignore') for doc in docs]
    '''
    noun_phrases = set()
    if only_noun_phrases:
        for i, doc in enumerate(docs):
            print "--extracting NP from doc", i
            #doc = doc.decode('utf8', 'ignore')
            noun_phrases.update([lemmatize(phrase) for phrase in extract_candidate_chunks(doc)])

    with open('./semeval_train_docs_noun_phrases.set', 'w') as f:
        pickle.dump(noun_phrases, f)
    '''

    print "loading pre-extracted set of noun_phrases"
    noun_phrases = set()
    with open('./semeval_train_docs_noun_phrases.set', 'r') as f:
        noun_phrases = pickle.load(f)

    vectorizer = TfidfVectorizer(decode_error='ignore',
                                 preprocessor=preprocess,
                                 ngram_range=(1, 3),
                                 tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    vocab = set()
    print "--learning vocabulary"
    for i, doc in enumerate(docs):
        print "--learning doc", i
        first_occurrence = {}
        entropy = {}

        phrases = analyzer(doc)  # all phrases from doc
        doc = preprocess(doc)
        doc_length = len(doc)
        chunks = get_chunks(doc)
        for i, phrase in enumerate(phrases):
            if valid_ngram(phrase,
                           noun_phrases) and phrase not in first_occurrence:
                try:
                    pos = doc.find(phrase)
                except ValueError:
                    print "--phrase: '{}' not found".format(phrase)
                    continue
                first_occurrence[phrase] = pos / doc_length
                # calculate entropy
                entropy[phrase] = get_entropy(phrase, chunks)
                vocab.add(phrase)
        first_occurrence_all.append(first_occurrence)
        entropy_all.append(entropy)
    print "--size of vocabulary: ", len(vocab)
    return vocab, first_occurrence_all, entropy_all
Example no. 12
def method3(newsgroup_train, newsgroups_test):
    print('*************************\nTfidfVectorizer\n*************************')
    from sklearn.feature_extraction.text import TfidfVectorizer
    tv = TfidfVectorizer(sublinear_tf=True,
                         max_df=0.5,
                         stop_words='english')
    tfidf_train_2 = tv.fit_transform(newsgroup_train.data)
    tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
    tfidf_test_2 = tv2.fit_transform(newsgroups_test.data)
    print("the shape of train is " + repr(tfidf_train_2.shape))
    print("the shape of test is " + repr(tfidf_test_2.shape))
    analyze = tv.build_analyzer()
    tv.get_feature_names()  # statistical features/terms
Example no. 13
class GraphsizePretrained(BaseEstimator, TransformerMixin):
    def __init__(self, w=2, pretrained_vec='glove.6B.100d', verbose=False):
        super(GraphsizePretrained, self).__init__()
        self.w = w
        self.pretrained_vec = pretrained_vec
        self.embeddings_dict = {}
        
        if not verbose:
            self.progress_bar = lambda x: x
        else:
            from tqdm import tqdm
            self.progress_bar = tqdm
            
        with open(self.pretrained_vec, 'r') as f:
            for line in self.progress_bar(f):
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                self.ndim = len(vector)
                self.embeddings_dict[word] = vector
        self.vocab = { word: i for (i,word) in enumerate( self.embeddings_dict.keys() ) }
        
        self.analyzer = TfidfVectorizer(preprocessor=preprocessor)
        
    def fit(self, X, y=None):
        self.N = len(X)
        return self
   
    def transform(self, text):
        docs = list(map(self.analyzer.build_analyzer(), self.progress_bar(text)))
        result = list(map(self._build_graph_, self.progress_bar(docs)))
        return result
    
    def _build_graph_(self, doc):
        terms    = list(filter( lambda x: x in self.embeddings_dict, doc))
        sorted_terms = sorted(list(set(terms)))

        cooccur_count = Counter()
        for i,idt in enumerate(terms):
            terms_to_add = terms[ max(i-self.w, 0):i ]
            terms_to_add = list(zip(terms_to_add, repeat(idt)))
            terms_to_add = list(map(sorted,terms_to_add))
            terms_to_add = list(map(tuple,terms_to_add))
            cooccur_count.update( terms_to_add )
        
        G = nx.Graph()
        G.add_nodes_from( sorted_terms )
        w_edges = [ (s,t,w) for ((s,t),w) in cooccur_count.items() ]
        G.add_weighted_edges_from( w_edges, weight='freq' )
        
        return G, np.array([ self.embeddings_dict[term] for term in sorted_terms ])
Example no. 14
def vectorize(u_plus_v, batch_size, list_documents, labels, args):
    #print(i)

    #d, y = args
    #print(args)
    start_index = args

    vectorizer = TfidfVectorizer()
    analyze = vectorizer.build_analyzer()

    start = datetime.datetime.now()
    for d, y in zip(
            list_documents[batch_size * start_index:batch_size *
                           (start_index + 1)],
            labels[batch_size * start_index:batch_size * (start_index + 1)]):

        document = []
        i = 0
        for w in analyze(d):

            try:
                if u_plus_v:
                    glove = (global_V['u'][global_D[w]] +
                             global_V['v'][global_D[w]]) / 2
                else:
                    glove = global_V['u'][global_D[w]]
                document.append(glove)
            except KeyError as e:
                i = i + 1

            #if i>0:
            #    print("missing words " + str(i) + " / " + str(len(d)))

            #if len(mean_d == 0):
            #    raise Exception("Empty mean_d")

        if len(document) > 0:
            mean_d = np.mean(document, axis=0)
            #if X is None:
            #    X = mean_d
            #else:
            #    #pdb.set_trace()
            #    X = np.vstack([X, mean_d])

            #with lock:
            global_X.append(mean_d)
            global_Y.append(y)

    end = datetime.datetime.now()
    delta = end - start
    print("process in " + str(delta.total_seconds()) + "s")
Example no. 15
class Analyzer(object):
    def __init__(self):
        self.tfidf = TfidfVectorizer(min_df=1, binary=False, ngram_range=(1, 3), tokenizer=Tokenizer())
        self.tokens = self.tfidf.build_tokenizer()
        self.ngram = self.tfidf.build_analyzer()

    def __call__(self, sentence):
        ret = self.ngram(sentence)
        terms = self.tokens(sentence)
        for term in terms:
            cate = term_category(term)
            if term != cate:
                ret.append(cate)
        return ret
Example no. 16
def learn_vocabulary(docs, only_noun_phrases=True):
    first_occurrence_all = []
    entropy_all = []
    #docs = [doc.decode('utf8', 'ignore') for doc in docs]

    '''
    noun_phrases = set()
    if only_noun_phrases:
        for i, doc in enumerate(docs):
            print "--extracting NP from doc", i
            #doc = doc.decode('utf8', 'ignore')
            noun_phrases.update([lemmatize(phrase) for phrase in extract_candidate_chunks(doc)])

    with open('./semeval_train_docs_noun_phrases.set', 'w') as f:
        pickle.dump(noun_phrases, f)
    '''

    print "loading pre-extracted set of noun_phrases"
    noun_phrases = set()
    with open('./semeval_train_docs_noun_phrases.set', 'r') as f:
        noun_phrases = pickle.load(f)

    vectorizer = TfidfVectorizer(decode_error='ignore', preprocessor=preprocess, ngram_range=(1, 3), tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    vocab = set()
    print "--learning vocabulary"
    for i, doc in enumerate(docs):
        print "--learning doc", i
        first_occurrence = {}
        entropy = {}

        phrases = analyzer(doc) # all phrases from doc
        doc = preprocess(doc)
        doc_length = len(doc)
        chunks = get_chunks(doc)
        for i, phrase in enumerate(phrases):
            if valid_ngram(phrase, noun_phrases) and phrase not in first_occurrence:
                try:
                    pos = doc.find(phrase)
                except ValueError:
                    print "--phrase: '{}' not found".format(phrase)
                    continue
                first_occurrence[phrase] = pos / doc_length
                # calculate entropy
                entropy[phrase] = get_entropy(phrase, chunks)
                vocab.add(phrase)
        first_occurrence_all.append(first_occurrence)
        entropy_all.append(entropy)
    print "--size of vocabulary: ", len(vocab)
    return vocab, first_occurrence_all, entropy_all
Example no. 17
def feed(param):
    values=[]
    result={}
    tweetdata = rawtweets.find()
    json_str =json_util.dumps(tweetdata)
    tweetdata =json_util.loads(json_str)
    path = os.path.dirname(os.path.realpath(__file__))
    texts = []
    for tweetlist in tweetdata:
        tweet = tweetlist["text"]
        print(tweet)
        #d = datetime.strptime(tweetlist["_id"], '%Y/%m/%d/%H')
        text = unicodedata.normalize('NFKD', tweet).encode('ascii','ignore').decode('utf-8')
        texts.append(text)
    vectorizer = TfidfVectorizer(
        analyzer='char',
        #token_pattern=r'[a-z]{4,}',
        #use_idf=True,
        #strip_accents='unicode',
        #sublinear_tf=False
        )
    print(len(texts))
    vectorizer.build_analyzer()
    idf = vectorizer.fit_transform(texts)
    feature_names = np.asarray(vectorizer.get_feature_names())
    #print(idf.todense().T)
    #print((idf * idf.T).A)
    #print(idf.data)
    print("len ",(feature_names))
    z = (zip(feature_names,idf.data))
    
    d = {}
    for t in z:
        #print(t[0],t[1])
        d[t[0]] = t[1] 
    #print(d)
    return d
Example no. 18
def create_analyser(data, col, type_ngrams='words'):
    if type_ngrams == 'words':
        k1 = 1
        k2 = 1
    elif type_ngrams == 'N_grams':
        k1 = 1
        k2 = 3
    elif type_ngrams == 'Only_N_grams':
        k1 = 2
        k2 = 3
    vectorizer = TfidfVectorizer(ngram_range=(k1, k2),
                                 lowercase=False,
                                 stop_words=None)
    vectorizer.fit(list(data[col]))
    analyser = vectorizer.build_analyzer()

    return analyser
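A hypothetical usage sketch, assuming data is a pandas DataFrame with a text column and TfidfVectorizer is already imported:

import pandas as pd

data = pd.DataFrame({"text": ["The Quick Brown Fox", "jumps over the lazy dog"]})
analyser = create_analyser(data, "text", type_ngrams="N_grams")
print(analyser("The Quick Brown Fox"))  # unigrams through trigrams, case preserved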
Example no. 19
def extract_candidates_doc(doc, phrase_list, idf_vec, training_size=450):

    #vocab = set(phrase_list)
    idf_dic = {}
    #print "phrase list len", len(phrase_list)
    #print "len idf_vec", len(idf_vec)
    for i, phrase in enumerate(phrase_list):
        idf_dic[phrase] = idf_vec[i]
    noun_phrases = set()
    print "--extracting NP"
    noun_phrases = set(
        [lemmatize(phrase) for phrase in extract_candidate_chunks(doc)])

    vectorizer = TfidfVectorizer(decode_error='ignore',
                                 preprocessor=preprocess,
                                 ngram_range=(1, 3),
                                 tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    phrases = list(
        set([
            phrase for phrase in analyzer(doc)
            if valid_ngram(phrase, noun_phrases)
        ]))
    doc = preprocess(doc)

    #print "candidate phrases", phrases
    #tfidf = []
    #first_occurrence = []
    #entropy = []
    #length = []
    doc_len = len(doc)

    entropy = get_entropy_doc(doc, phrases)
    # get feature vectors
    features = []
    for i, phrase in enumerate(phrases):
        first_occurrence = doc.find(phrase) / doc_len
        tf = doc.count(phrase)
        if phrase in idf_dic:
            tfidf = tf * idf_dic[phrase]
        else:
            tfidf = tf * log10(training_size)
        feature_vec = get_feature_vector(phrase, tfidf, first_occurrence,
                                         entropy[i])
        features.append(feature_vec)
    return phrases, features
Example no. 20
def tfidf_vectorize(train_words, test_words):
    #method 2:TfidfVectorizer
    print(
        '*************************\nTfidfVectorizer\n*************************'
    )
    from sklearn.feature_extraction.text import TfidfVectorizer
    tv = TfidfVectorizer(sublinear_tf=True)  # ,  max_df = 0.5

    tfidf_train_2 = tv.fit_transform(train_words)
    # obtain the tf-idf matrix
    tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
    tfidf_test_2 = tv2.fit_transform(test_words)
    print("the shape of train is " + repr(tfidf_train_2.shape))
    print("the shape of test is " + repr(tfidf_test_2.shape))
    analyze = tv.build_analyzer()
    tv.get_feature_names()  #statistical features/terms
    return tfidf_train_2, tfidf_test_2
Example no. 21
 def find_tfidf(self):
     ''' pre-calculate tfidf '''
     print('Finding tfidf...')
     # set.update() returns None, so build the union explicitly
     stop_words = set(stopwords.words('english')) | set(stopwords.words('italian'))
     vectorizer = TfidfVectorizer(lowercase=True,
                                  ngram_range=self.ngrams,
                                  norm='l2',
                                  smooth_idf=True,
                                  stop_words=stop_words,
                                  min_df=2,
                                  max_df=0.8)
     self.data[self.description] = self.data[self.description].apply(
         self.filter_text)
     self.tfidf = vectorizer.fit_transform(self.data[self.description])
     self.tfidf_indices = vectorizer.get_feature_names()
     self.tokenizer = vectorizer.build_analyzer()
Example no. 22
def create_char_vectorizer(sentences):
    #Create TF-IDF object
    tfidf_char_vectorizer = TfidfVectorizer(analyzer='char_wb',
                                            max_df=0.90,
                                            max_features=200000,
                                            min_df=0.05,
                                            use_idf=True,
                                            ngram_range=(1, 3))
    tfidf_char_vectorizer = tfidf_char_vectorizer.fit(sentences)
    tfidf_matrix = tfidf_char_vectorizer.transform(sentences)
    print(tfidf_matrix)
    dense_matrix = tfidf_matrix.todense()
    print(dense_matrix)
    print(tfidf_char_vectorizer.get_feature_names())
    analyze = tfidf_char_vectorizer.build_analyzer()
    print(analyze("To Sherlock Holmes she is always _the_ woman."))
    return (tfidf_char_vectorizer, tfidf_matrix)
Example no. 23
def generate_sentences():
    print('Generating Clause Set')
    tf = TfidfVectorizer(token_pattern=r'(?u)\b[a-zA-Z]{2,}\b', max_df=1)
    analyser = tf.build_analyzer()
    all_sections = session.query(Section).filter(Section.source_id.isnot(None))
    docs = []

    for s in all_sections:
        for c in s.clauses:
            if c.cleaned is not None and 'deleted' not in c.cleaned.lower():
                docs.append(analyser(c.header))

                sentences = re.split(
                    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', c.cleaned)

                docs.extend([analyser(sent) for sent in sentences])

    return docs
Example no. 24
def old_vectorizer_glove(list_documents, labels, D, V, u_plus_v=False):
    X = None
    Y = []
    # see https://scikit-learn.org/stable/modules/feature_extraction.html
    vectorizer = TfidfVectorizer()
    analyze = vectorizer.build_analyzer()

    j = 0
    for d, y in zip(list_documents, labels):

        document = []
        i = 0
        for w in analyze(d):
            try:
                if u_plus_v:
                    glove = (V['u'][D[w]] + V['v'][D[w]]) / 2
                else:
                    glove = V['u'][D[w]]
                document.append(glove)
            except KeyError as e:
                i = i + 1

        #if i>0:
        #    print("missing words " + str(i) + " / " + str(len(d)))

        #if len(mean_d == 0):
        #    raise Exception("Empty mean_d")

        if len(document) > 0:
            mean_d = np.mean(document, axis=0)
            if X is None:
                X = mean_d
            else:
                #pdb.set_trace()
                X = np.vstack([X, mean_d])
            Y.append(y)

        j = j + 1
        if j % 1000 == 0:
            print(j)

    return X, np.array(Y)
Example no. 25
    def train(self, segments, ignore_before=4, ignore_after=4):
        '''
        This uses the 20newsgroups dataset for idf
        
        Parameters:
        :segments: list of strings where each string is a segment
        '''        
        data = fetch_20newsgroups(subset='train').data
        stripped_data = []
        
        for d in data:
            lines = d.split('\n')
            if len(lines)>ignore_before+ignore_after:
                stripped_data.append('\n'.join(lines[ignore_before:-ignore_after]))
        
        txt = ''.join(segments)
        stripped_data.append(txt)
        
        # Train corpus tf-idf
        tfidf_corpus = TfidfVectorizer(stop_words='english')
        tfidf_corpus.fit(stripped_data)
        book_scores = tfidf_corpus.transform([txt])
        print 'Learned {} features CORPUS'.format(len(tfidf_corpus.get_feature_names()))
        
        # Train document segment-wise tf-idf 
        tfidf_book = TfidfVectorizer(vocabulary=tfidf_corpus.vocabulary_)
        segment_scores = tfidf_book.fit_transform(segments)
        print 'Learned {} features BOOK'.format(len(tfidf_book.get_feature_names()))

        # Now get word scores in each segment
        final_scores = book_scores.multiply(segment_scores)

        idx_to_word = tfidf_corpus.get_feature_names()
        word_scores = []
        for i, segment_scores in enumerate(final_scores):
            scores = {}
            for j in segment_scores.indices:        
                scores[idx_to_word[j]] = segment_scores[0, j]
            word_scores.append(scores)
            
        self.word_scores = word_scores
        self.analyze = tfidf_corpus.build_analyzer()
Example no. 26
def keyword_extractor_tfidf(corpus_list,is_stop_words_allowed,n_gram_min,n_gram_max):
    if n_gram_min > n_gram_max:
        raise Exception('Invalid input n_gram_min should be <= n_gram_max')
    corpus = []
    for doc in corpus_list:
        text = ''
        for word in doc:
            text = text +' '+ word
        corpus.append(text)
    if is_stop_words_allowed == False:     
        vectorizer = TfidfVectorizer(ngram_range=(n_gram_min, n_gram_max),stop_words='english')
    else:
        vectorizer = TfidfVectorizer(ngram_range=(n_gram_min, n_gram_max))
    analyzer = vectorizer.build_analyzer()
    analyzer(corpus[0])
    features_array = vectorizer.fit_transform(corpus).toarray()
    features_transform_list = features_array.tolist()[0]
    features_dictionary = dict(zip(vectorizer.get_feature_names(),features_transform_list))
    sorted_features_dictionary = OrderedDict(sorted(features_dictionary.items(),key=itemgetter(1)))
    return sorted_features_dictionary  
Example no. 27
def tfidf(text):
    vectorizer = TfidfVectorizer()
    transformer = TfidfTransformer()
    countVector = vectorizer.fit_transform(text)
    mat = transformer.fit_transform(countVector.toarray()).toarray()
    analyze = vectorizer.build_analyzer()

    threshold = 0.0001
    key_dict = {}
    for i in range(len(text)):
        tokens = analyze(text[i])
        for j in range(len(tokens)):
            if mat[i][j] > threshold:
                key_dict[tokens[j]] = mat[i][j]
    l1, l2 = [], []
    s = [(k, key_dict[k]) for k in sorted(key_dict, key=key_dict.get)]
    for k, v in s:
        l1.append(k)
        l2.append(v)
    return l1, l2
Example no. 28
 def fit(self, templates):
     if self.vocabulary and self.analyser:
         pass
     else:
         vectorizer = TfidfVectorizer(
             ngram_range=(self.config_dict["min_n_gram"],
                          self.config_dict["max_n_gram"]),
             lowercase=True,
             stop_words=None,
             min_df=1)
         vectorizer.fit(templates)
         self.analyser = vectorizer.build_analyzer()
         self.vocabulary = vectorizer.vocabulary_
         save_object(
             os.path.join(self.feature_extraction_folder,
                          "analyzer.pickle"), self.analyser)
         save_object(
             os.path.join(self.feature_extraction_folder,
                          "vocabulary.pickle"), self.vocabulary)
         inputs = self.feature_engineering(templates)
         self.maxlen = max(max(len(x) for x in inputs), self.maxlen)
Example no. 29
def preprocessAll(filename, percent):
    vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words=None)
    analyzer = vectorizer.build_analyzer()
    all_text = []
    line_cnt = 0
    with open(filename, 'r') as f:
        for line in f:
            if line_cnt % 10000 == 0:
                sys.stdout.flush()
                sys.stdout.write(" " * 25 + '\r')
                sys.stdout.flush()
                sys.stdout.write(str(line_cnt) + " lines processed.\r")
            line_cnt += 1
            # if line_cnt >= 2000000:
            #     break
            preprocessed = preprocess(line, analyzer)
            all_text.append(preprocessed)
    
    line_cnt = int(len(all_text)*percent)

    return all_text[0:line_cnt]
Example no. 30
def ida(articles):
    stopwords = []
    doc_terms = []
    with open('ch_stopwords.txt', 'r') as f:
        stopwords = set(f.read().lower().split('\n'))

    #print('stopwords', stopwords[:10])

    trigram_vectorizer = TfidfVectorizer(
        ngram_range=(2, 3),
        token_pattern=r'([\u4e00-\u9fa5]{1}|)',
        min_df=10,
        max_df=20,
        stop_words=stopwords,
        analyzer='word')
    analyzer = trigram_vectorizer.build_analyzer()
    '''
	for article in articles:
		terms = map(lambda x: x.replace(' ', ''), analyzer(article.Content)) #get rid of spaces
		#terms = set(map(lambda x: x.replace(' ', ''), analyzer(article.Content))) #get rid of spaces
		#terms = list(terms-stopwords)
		doc_terms.append(list(terms))
	'''
    article_contents = map(lambda x: x.Content, articles)
    doc_terms = trigram_vectorizer.fit_transform(article_contents)
    tf_feature_names = trigram_vectorizer.get_feature_names()
    print(len(tf_feature_names), tf_feature_names[100:200])

    lda = LatentDirichletAllocation(n_components=8,  # 'n_topics' in old scikit-learn releases
                                    max_iter=200,
                                    evaluate_every=10,
                                    n_jobs=-1,
                                    verbose=1,
                                    learning_method='online')
    lda.fit(doc_terms)
    joblib.dump(lda, 'lda-n8-2.pkl', compress=1)
    print_top_words(lda, tf_feature_names, 10)

    #print(doc_terms.get_feature_names())
Example no. 31
def sentence_tokenizer(dataset_name="pascal"):
    """
    Parameters
    ----------
    dataset_name : string
        'memorability' or 'pascal' or 'clipart'

    Returns
    -------
    analyze : object
        breaks sentences into words using scikit-learn tokenizer
    vectorizer : object of class TfidfVectorizer
        see scikit-learn documentation
    """

    if dataset_name == "memorability":
        mat = scipy.io.loadmat("../../data/sentences/memorability_888_img_5_sent.mat")
        sentences = mat["memorability_sentences"]

    elif dataset_name == "pascal":
        mat = scipy.io.loadmat("../../data/sentences/pascal_1000_img_50_sent.mat")
        sentences = mat["pascal_sentences"]

    elif dataset_name == "clipart":
        mat = scipy.io.loadmat("../../data/sentences/clipart_500_img_48_sent.mat")
        sentences = mat["clipart_sentences"]

    # Build corpus
    corpus = list()
    for sent_group in sentences:
        corpus.append(" ".join([sent[0] for sent in sent_group]))

    ### Build tf-idf vectorizer ###

    # at-least three letters in word
    vectorizer = TfidfVectorizer(token_pattern="(?u)\\b\\w\\w\\w+\\b")
    vectorizer.fit(corpus)
    analyze = vectorizer.build_analyzer()

    return analyze, vectorizer
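A hypothetical usage sketch (it assumes the .mat files referenced above are present on disk):

analyze, vectorizer = sentence_tokenizer("pascal")
print(analyze("A dog chases a ball across the yard."))
# -> ['dog', 'chases', 'ball', 'across', 'the', 'yard'] (lowercased words of three or more letters)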
Example no. 32
def preprocess(raw_docs,
               stopwords,
               min_df=3,
               min_term_length=2,
               ngram_range=(1, 1),
               apply_tfidf=True,
               apply_norm=True,
               tokenizer=custom_tokenizer):
    """
	Preprocess a list containing text documents stored as strings.
	"""
    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords,
                            lowercase=True,
                            strip_accents="unicode",
                            tokenizer=tokenizer,
                            use_idf=apply_tfidf,
                            norm=norm_function,
                            min_df=min_df,
                            ngram_range=ngram_range)
    X = tfidf.fit_transform(raw_docs)

    analyze = tfidf.build_analyzer()

    docs = [analyze(doc) for doc in raw_docs]

    terms = []
    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms, tfidf, docs)
Example no. 33
def create_vectorizer(sentences):
    #Create TF-IDF object
    stopword_list = read_in_csv(stopwords_file_path)
    stemmed_stopwords = [
        tokenize_and_stem(stopword)[0] for stopword in stopword_list
    ]
    stopword_list = stopword_list + stemmed_stopwords
    tfidf_vectorizer = TfidfVectorizer(max_df=0.90,
                                       max_features=200000,
                                       min_df=0.05,
                                       stop_words=stopword_list,
                                       use_idf=True,
                                       tokenizer=tokenize_and_stem,
                                       ngram_range=(1, 3))
    tfidf_vectorizer = tfidf_vectorizer.fit(sentences)
    tfidf_matrix = tfidf_vectorizer.transform(sentences)
    print(tfidf_matrix)
    dense_matrix = tfidf_matrix.todense()
    print(dense_matrix)
    print(tfidf_vectorizer.get_feature_names())
    analyze = tfidf_vectorizer.build_analyzer()
    print(analyze("To Sherlock Holmes she is always _the_ woman."))
    return (tfidf_vectorizer, tfidf_matrix)
Example no. 34
    def __init__(self, data, n_features=10, preprocess=False, jobs=1, verbose=True):
        self._clusters = None
        self._labels = []
        self._data = data
        self._verbose = verbose

        self._n_features = n_features
        if preprocess:
            analyzer = TfidfVectorizer().build_analyzer()  # build_analyzer needs an instance, not the class
            ipp = InputPreprocessor(None)
            def preprocess(doc):
                return [ipp.normalise(word) for word in analyzer(doc)]

            vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features,
                                         min_df=2, stop_words='english',
                                         use_idf=True, analyzer=preprocess)

        else:
            vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=True)
        self._preprocessed_data = vectorizer.fit_transform(self._data)
        self._jobs = jobs
Example no. 35
def extract_candidates_doc(doc, phrase_list, idf_vec, training_size=450):

    #vocab = set(phrase_list)
    idf_dic = {}
    #print "phrase list len", len(phrase_list)
    #print "len idf_vec", len(idf_vec)
    for i, phrase in enumerate(phrase_list):
        idf_dic[phrase] = idf_vec[i]
    noun_phrases = set()
    print "--extracting NP"
    noun_phrases = set([lemmatize(phrase) for phrase in extract_candidate_chunks(doc)])

    vectorizer = TfidfVectorizer(decode_error='ignore', preprocessor=preprocess, ngram_range=(1, 3), tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    phrases = list(set([phrase for phrase in analyzer(doc) if valid_ngram(phrase, noun_phrases)]))
    doc = preprocess(doc)

    #print "candidate phrases", phrases
    #tfidf = []
    #first_occurrence = []
    #entropy = []
    #length = []
    doc_len = len(doc)

    entropy = get_entropy_doc(doc, phrases)
    # get feature vectors
    features = []
    for i, phrase in enumerate(phrases):
        first_occurrence = doc.find(phrase) / doc_len
        tf = doc.count(phrase)
        if phrase in idf_dic:
            tfidf = tf * idf_dic[phrase]
        else:
            tfidf = tf * log10(training_size)
        feature_vec = get_feature_vector(phrase, tfidf, first_occurrence, entropy[i])
        features.append(feature_vec)
    return phrases, features
Example no. 36
sentences = scipy.io.loadmat('../../data/sentences/memorability_888_img_5_sent.mat')
sentences = sentences['memorability_sentences']

f = open('../../automated_specificity.txt', 'w')

sent_pairs, scores_w = list(), list()

vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w\\w+\\b')
corpus = list()

# Build corpus
for sent_group in sentences:
    corpus.append(' '.join([sent[0] for sent in sent_group]))

vectorizer.fit(corpus)
analyze = vectorizer.build_analyzer()

specificity_max, specificity_w = list(), list()
for im_idx, sentence_group in enumerate(sentences):

    similarity_max, similarity_w = list(), list()
    for (sent1, sent2) in combinations(sentence_group, 2):

        words1, words2 = analyze(sent1[0]), analyze(sent2[0])

        sent1_weights = [vectorizer.transform(sent1).toarray()[0][vectorizer.vocabulary_.get(w)] for w in words1]
        sent2_weights = [vectorizer.transform(sent2).toarray()[0][vectorizer.vocabulary_.get(w)] for w in words2]

        print >> f, [w.encode('utf-8') for w in words1]
        print >> f, [PrettyFloat(w) for w in sent1_weights]
        print >> f, [w.encode('utf-8') for w in words2]
Example no. 37
data_full.append(SiteData('fb/srsplit/fullfbsearch_results_combined{i:02d}'.format(i=file_counter), categories, full_candidate_dict))
"""
data_train = fetch_20newsgroups(subset='train', categories=categories,
                               shuffle=True, random_state=42)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                              shuffle=True, random_state=42)
"""
print 'data loaded'
import conversions as conv
from ersatzpg.utffile import utffile
special_terms = []
vocabulary = []
basic_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, use_idf=False,
                             stop_words='english')
basic_analyze = basic_vectorizer.build_analyzer()
with utffile('searchterms.csv') as f:
    for s in f:
        if s.startswith('<'):
            special_terms.append(s.strip('<>'))
        else:
            vocabulary.append(s.decode('utf-8').strip())
fb_page_data = {}
with open('fb/facebookpolsurls_bkp.csv') as f:
    csvr = csv.DictReader(f)
    for l in csvr:
        fb_page_data.update({l['url']:{'fans':l['Fan Count'].replace(',',''),'authentic':l['Authentic Category']}})

def analyze(s):
    d=eval(s)
    special_keys = []
Example no. 38
 def build_analyzer(self):
     analyzer = TfidfVectorizer.build_analyzer(self)
     english_stemmer = SnowballStemmer('english')
     return lambda doc:(english_stemmer.stem(w) for w in analyzer(doc))
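A self-contained sketch of the same stemming idea (a hypothetical subclass, not the exact classes used in these snippets), assuming NLTK's SnowballStemmer is installed; the override returns a generator, which the vectorizer consumes without issue:

from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        stemmer = SnowballStemmer('english')
        # Stem every token produced by the stock analyzer.
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

vec = StemmedTfidfVectorizer(stop_words='english')
X = vec.fit_transform(["running runners run", "the runner was running"])
print(vec.get_feature_names_out())  # stemmed vocabulary, e.g. ['run', 'runner']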
Example no. 39
	def __init__(self, n_features, voc_file):
		self.n_features = n_features
		self.voc_file = voc_file
		self.word_clusters, self.grouped_words = self.read_word_cluster(voc_file)
		tfidf = TfidfVectorizer(encoding = 'iso-8859-1', stop_words='english')
		self.vectorize = tfidf.build_analyzer()
Example no. 40
class WeightedEmbeddingSearch:
    def __init__(self):
        print("Loading data csv")
        #fun_fact_title_data = pd.read_csv(FUN_FACT_TITLE_CSV).dropna(subset=REQUIRED_COLUMNS)
        til_title_data = pd.read_csv(TIL_TITLE_CSV).dropna(
            subset=REQUIRED_COLUMNS)
        #ysk_title_data = pd.read_csv(YSK_TITLE_CSV).dropna(subset=REQUIRED_COLUMNS)

        self.title_data = pd.concat(
            [
                #fun_fact_title_data,
                til_title_data,
                #ysk_title_data,
            ],
            join='inner').reset_index(drop=True)

        print("Computing tf-idf matrix")
        self.vectorizer = TfidfVectorizer(stop_words='english',
                                          dtype=np.float32)
        tfidf_matrix = self.vectorizer.fit_transform(self.title_data["title"])

        print("Loading spacy")
        self.nlp = spacy.load('en_core_web_lg')

        print("Computing weighted embeddings")
        features = self.vectorizer.get_feature_names()
        self.f_vectors = np.array([self.nlp.vocab[f].vector for f in features])
        weighted_embeddings = tfidf_matrix.dot(self.f_vectors)
        assert weighted_embeddings.shape == (len(self.title_data.index), 300)
        self.n_weighted_embeddings = weighted_embeddings / (
            np.linalg.norm(weighted_embeddings, axis=1)[:, np.newaxis] + EPS)

        #print("Compressing pandas dataframe into index")
        #self.index = list(title_data.itertuples())

        print("Done loading {} rows".format(len(self.title_data.index)))

    def search(self, query, method='similarity', top=10):
        query_tfidf = self.vectorizer.transform([query])
        if query_tfidf.count_nonzero() > 0:
            query_weighted = query_tfidf.dot(self.f_vectors).flatten()
        # average word embeddings if query words don't exist in our corpus (tfidf matrix)
        else:
            tokens = self.vectorizer.build_analyzer()(query)
            # query was all stopwords, so we'll have to manually tokenize
            if not tokens:
                tokens = query.lower().split()
            query_weighted = np.average(
                [self.nlp.vocab[t].vector for t in tokens], axis=0).flatten()

        # if we have no embeddings for the given query, we're out of luck
        if np.count_nonzero(query_weighted) == 0:
            return []

        n_query_weighted = query_weighted / (np.linalg.norm(query_weighted) +
                                             EPS)
        rankings = self.n_weighted_embeddings.dot(n_query_weighted)
        rankings_index = np.argsort(-rankings)
        ranked_df = self.title_data.loc[rankings_index]
        ranked_titles = list(ranked_df['title'])
        ranked_scores = list(ranked_df['score'])
        top_ranked_em = self.n_weighted_embeddings[rankings_index]
        ranked_rankings = rankings[rankings_index]
        results = self.kMeans(ranked_titles, ranked_scores, ranked_rankings,
                              top_ranked_em, method)

        #         index = list(ranked_df.itertuples())
        results = [{
            "type": "submission",
            "title": ranked_df.iloc[d]["title"],
            "subreddit": ranked_df.iloc[d]['subreddit'],
            "permalink": ranked_df.iloc[d]['permalink'],
            "score": ranked_df.iloc[d]['score']
        } for d in [i[1][0] for i in results]]
        return results

    def kMeans(self, titles, scores, rankings, embeddings, method):
        TOP_HITS_KMEANS = max(40, np.sum(scipy.stats.zscore(rankings) > 3.5))
        if TOP_HITS_KMEANS > 200:
            TOP_HITS_KMEANS = 200
        kmeans = KMeans(n_clusters=20,
                        random_state=0).fit(embeddings[:TOP_HITS_KMEANS])

        counter = collections.Counter(kmeans.labels_)
        most_common = counter.most_common(10)
        most_common = set([i[0] for i in most_common])
        results = self.topSimOfEachCluster(kmeans.labels_, 10, most_common)
        self.topScoreOfEachCluster(results, 4, scores)
        results = self.topResultsSorted(results, rankings, scores, method)
        return results

    # cluster number to top num based on similarity
    def topSimOfEachCluster(self, cluster_labels, num, most_common):
        res = {}
        clusters_included = set(most_common)
        for i, el in enumerate(cluster_labels):
            if el not in clusters_included:
                continue
            if el not in res:
                res[el] = [i]
            elif len(res[el]) < num:
                res[el].append(i)
        return res

    #takes topOfEachCluster and gets the top num by score
    def topScoreOfEachCluster(self, sim_results, num, scores):
        for key in sim_results:
            sim_results[key].sort(key=lambda x: scores[x], reverse=True)
            sim_results[key] = sim_results[key][:num]

    #sort results by method
    def topResultsSorted(self, results, rankings, scores, method='similarity'):
        if method == 'similarity':
            for key in results:
                results[key].sort(key=lambda x: rankings[x],
                                  reverse=True)  #sorts within a cluster
                sorted_results = sorted(results.items(),
                                        key=lambda x: rankings[x[1][0]],
                                        reverse=True)  #sorts all clusters
        elif method == 'score':
            for key in results:
                results[key].sort(key=lambda x: scores[x], reverse=True)
                sorted_results = sorted(results.items(),
                                        key=lambda x: scores[x[1][0]],
                                        reverse=True)
        return sorted_results
Example no. 41
def main():

    reload(sys)
    sys.setdefaultencoding('utf-8')

    pprint(LemmaTokenizer()("this is testing the stemming functionality"))


    param_grid = [
        {'C': [.125, .25, .5, 1, 10, 100, 1000]},
        { 'penalty': ('l1','l2')}
    ]

    svm_param_grid = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]

    lines = [line for line in fileinput.input()]

    sentences = map(lambda x: x.split('\t')[1], lines)
    Y =  map(lambda x: int(x.split('\t')[0]), lines)

    vectorizer = TfidfVectorizer(min_df=1,
                                 tokenizer=POSTokenizer(),
                                 preprocessor=preprocess_sentence,
                                 ngram_range=(2,2),
                                 stop_words='english')

    pipeline = Pipeline([
        ('vect', vectorizer),
        ('clf', SGDClassifier()),
    ])

    # pprint(parameters)
    # t0 = time()
    # grid_search.fit(sentences, Y)
    # print("done in %0.3fs" % (time() - t0))
    # print()

    # print("Best score: %0.3f" % grid_search.best_score_)

    X = vectorizer.fit_transform(sentences)
    num_samples = len(Y)
    num_train = int(num_samples * .8)
    print "Num training: %d" % num_train
    X_train = X[0:num_train]
    Y_train = Y[0:num_train]
    X_test  = X[num_train:]
    Y_test = Y[num_train:]
    analyze = vectorizer.build_analyzer()

    for sentence in sentences[0:10]:
        print preprocess_sentence(sentence)
        print analyze(sentence)
        print "LemmaTokenizer" +  str(LemmaTokenizer()(sentence))
        print StemmingTokenizer()(sentence)

    # tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    # tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    # chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    logistic = linear_model.LogisticRegression(C=.5, class_weight=None, dual=False,
                                               fit_intercept=True, intercept_scaling=1, max_iter=100,
                                               multi_class='ovr', penalty='l2', random_state=None,
                                               solver='liblinear', tol=0.0001, verbose=0)

    # grid_search = GridSearchCV(SVC(), svm_param_grid, n_jobs=-1, verbose=1)
    # grid_search.fit(X_train, Y_train)
    # print grid_search.score(X_test, Y_test)
    # best_parameters = grid_search.best_estimator_.get_params()
    # print best_parameters

    # grid_search = GridSearchCV(logistic, param_grid, n_jobs=-1, verbose=1)
    # grid_search.fit(X_train, Y_train)
    # print grid_search.score(X_test, Y_test)
    # best_parameters = grid_search.best_estimator_.get_params()
    # print best_parameters

    print logistic.fit(X_train,Y_train).score(X_test,Y_test)

    show_most_informative_features(vectorizer, logistic, 25)

    num_errors = 0

    feature_names = vectorizer.vocabulary_
    feature_index = inv_map = {v: k for k, v in feature_names.items()}
    y_pred = []
    for (i,x) in enumerate(X_test):
        y_hat = logistic.predict(x)
        y_pred.append(y_hat)
        if y_hat != Y_test[i]:
            num_errors += 1
            print "\n\nError predicting sentence: " + sentences[i + num_train]
            print print_features(x, feature_index)
            print "Label: " + str(Y_test[i])
    error_rate = float(num_errors) / len(Y_test)
    print "Accuracy : " + str(1 - error_rate)
Example no. 42
def main():
    global X
    logging.info('Started')
    # pulling primary bill sponsor to match with party information 
    sponsors_query = db.bills_details.find({},
        {'_id': 1,'sponsors.leg_id':1,'sponsors.type':1,'sponsors.name':1, 
                  'action_dates.signed': 1}).limit(25) #able to limit number of records for testing

    sponsors = list(sponsors_query)
    bill_party = []
    # sponsors[0]['sponsors'][0]
    # Creates list of dict: bill database ID, passed status, legislator ID and party 
    for i in range(len(sponsors)):
        bill_dbid = sponsors[i]['_id']
        leg_id = sponsors[i]['sponsors'][0]['leg_id']
       
        if leg_id == None: 
            leg_id = 'CA0000'
            party = sponsors[i]['sponsors'][0]['name']
        else: 
            party = GetParty(leg_id)
            if party == None:
                party = sponsors[i]['sponsors'][0]['name']
       
        if sponsors[i]['action_dates']['signed'] == None:
            bill_signed = False
        else:
            bill_signed = True

        k = ['id', 'leg_id', 'party','passed']
        v = [bill_dbid, leg_id, party, bill_signed]
        bill_party.append(dict(zip(k,v)))

    logging.info('populated list of sponsor and party')    
    # note to self/presentation: show number of bills sponsored by non-legislators
    # graph bills by party that passed .....     

    # Do I need to create/ update a dictionary? This pulls MongoDB_Id and texts
    # all_legtext = list(db.legtext.find({}, {'text': 1}).limit(25))

    #adds vectorized features of bigrams using function
    # for i in range(len(bill_party)):
    #     vec = GetBigramsVector(bill_party[i]['id'])
    #     bill_party[i]['vec'] = vec
    # logging.info('loaded vectorized bigrams')

    bigram_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), token_pattern=r'\b\w+\b', min_df =1)
    analyze = bigram_vectorizer.build_analyzer()

    for i in range(len(bill_party)):
        #oid = bill_party[i]['id']
        #print "Getting text for item", i, bill_party[i]['id']
        leg_text = list(db.legtext.find({'_id': bill_party[i]['id']}, {'text': 1}))[0]['text']
        raw = nltk.clean_html(leg_text)
        # bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
        bigram_features = analyze(raw)
        bill_party[i]['features'] = bigram_features
        bill_party[i]['raw'] = raw
        # bill_party[i]['vec'] = bigram_vectorizer.fit_transform(bigram_features).toarray()
    
    party_options = {'democratic': 0, 'republican': 1}
    X = bigram_vectorizer.fit_transform([x['raw'] for x in bill_party if x['party'].lower() in party_options])
    print bigram_vectorizer
    logging.info('loaded tfidf vectorized bigrams')

    # Creates numpy arrays, results = party and features = vectorized words  
    # party only = democrat or republican and vectorized text
    bp_target = []
    bp_data = []
    for i in range(len(bill_party)):
        if bill_party[i]['party'].lower() in ('democratic', 'republican'): 
            bp_target.append( party_options[bill_party[i]['party'].lower()] )            
        else:
            continue

    targets = np.array(bp_target)
    data = X.toarray()
    
    #=====================================================================================
    # Train different models - Linear, Logistic, Random Linear
    #=====================================================================================

    #  Supported Vector Classification
    logging.info('Linear Support Vector Classification')
    clf = LinearSVC(loss='l2')
    print clf
    clf = clf.fit(data,targets)
    print 'LinearSVC Coef', clf.coef_
    print 'LinearSVC Intercept', clf.intercept_
    print 'LinearSVC Score/R2', clf.score(data,targets)

    with open('party_linearSVC.pkl', 'wb') as mclf:
        pickle.dump(clf, mclf)
    logging.info('output LinearSVC to party_linearSVC.pkl')

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, targets, test_size=0.4, random_state=0)
    clfCV = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
    print clfCV
    print 'training shape', X_train.shape, y_train.shape
    print 'testing shape', X_test.shape, y_test.shape
    print 'Test Score', clfCV.score(X_test, y_test)
    print 'Train Score', clfCV.score(X_train, y_train)

    # Logistic Regression 
    logging.info('Logistic Regression')
    # Insert GridSearch Here
    logreg_l1 = linear_model.LogisticRegression(C=1.0, penalty='l1')
    logreg_l2 = linear_model.LogisticRegression(C=1.0, penalty='l2')
    logreg_l1.fit(data,targets)
    logreg_l2.fit(data,targets)

    print logreg_l1
    print logreg_l2
    print 'Pseudo-R2 penalty l1', logreg_l1.score(data,targets)
    print 'Pseudo-R2 penalty l2', logreg_l2.score(data,targets)
    print 'LogReg l1 Coef', logreg_l1.coef_
    print 'LogReg l1 Intercept', logreg_l1.intercept_

    with open('party_logreg_l1.pkl', 'wb') as lr1:
        pickle.dump(logreg_l1, lr1)
    logging.info('output Logistic regression to party_logreg_l1.pkl')

    with open('party_logreg_l2.pkl', 'wb') as lr2:
        pickle.dump(logreg_l2, lr2)
    logging.info('output Logistic regression to party_logreg_l2.pkl')

    # Random Forests
    # See other python file

    logging.info('Finished')
Example no. 43
    def build_analyzer(self):
        analyzer = TfidfVectorizer.build_analyzer(self)

        return lambda doc: (StemmedTfidfVectorizer.english_stemmer.stem(w) for w in analyzer(doc))
Example no. 44
def main():
    global X
    logging.info('Started')
    # pulling primary bill sponsor to match with party information 
    sponsors_query = db.bills_details.find({},
        {'_id': 1,'sponsors.leg_id':1,'sponsors.type':1,'sponsors.name':1, 
                  'action_dates.signed': 1}) #able to limit number of records for testing

    sponsors = list(sponsors_query)
    bill_party = []
    # sponsors[0]['sponsors'][0]
    # Creates list of dict: bill database ID, passed status, legislator ID and party 
    for i in range(len(sponsors)):
        bill_dbid = sponsors[i]['_id']
        leg_id = sponsors[i]['sponsors'][0]['leg_id']
       
        if leg_id == None: 
            leg_id = 'CA0000'
            party = sponsors[i]['sponsors'][0]['name']
        else: 
            party = GetParty(leg_id)
            if party == None:
                party = sponsors[i]['sponsors'][0]['name']
       
        if sponsors[i]['action_dates']['signed'] == None:
            bill_signed = False
        else:
            bill_signed = True

        k = ['id', 'leg_id', 'party','passed']
        v = [bill_dbid, leg_id, party, bill_signed]
        bill_party.append(dict(zip(k,v)))

    logging.info('populated list of sponsor and party')    
    # note to self/presentation: show number of bills sponsored by non-legislators
    # graph bills by party that passed .....     

    # Do I need to create/ update a dictionary? This pulls MongoDB_Id and texts
    # all_legtext = list(db.legtext.find({}, {'text': 1}).limit(25))

    #adds vectorized features of bigrams using function
    # for i in range(len(bill_party)):
    #     vec = GetBigramsVector(bill_party[i]['id'])
    #     bill_party[i]['vec'] = vec
    # logging.info('loaded vectorized bigrams')

    bigram_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), token_pattern=r'\b\w+\b', min_df =1)
    analyze = bigram_vectorizer.build_analyzer()

    for i in range(len(bill_party)):
        #oid = bill_party[i]['id']
        #print "Getting text for item", i, bill_party[i]['id']
        leg_text = list(db.legtext.find({'_id': bill_party[i]['id']}, {'text': 1}))[0]['text']
        raw = nltk.clean_html(leg_text)
        # bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
        bigram_features = analyze(raw)
        bill_party[i]['features'] = bigram_features
        bill_party[i]['raw'] = raw
        # bill_party[i]['vec'] = bigram_vectorizer.fit_transform(bigram_features).toarray()
    
    party_options = {'democratic': 0, 'republican': 1}
    X = bigram_vectorizer.fit_transform([x['raw'] for x in bill_party if x['party'].lower() in party_options])
    print bigram_vectorizer
    logging.info('loaded tfidf vectorized bigrams')

    # Creates numpy arrays, results = party and features = vectorized words  
    # party only = democrat or republican and vectorized text
    bp_target = []
    bp_data = []
    for i in range(len(bill_party)):
        if bill_party[i]['party'].lower() in ('democratic', 'republican'): 
            bp_target.append( party_options[bill_party[i]['party'].lower()] )            
        else:
            continue

    targets = np.array(bp_target)
    data = X.toarray()

    #====================================================================================
    # Random Forests Modeling and Plotting
    #====================================================================================
    
    # Parameters (the plotting-related ones are only used by the plotting code commented out below)
    n_classes = 2
    n_estimators = 30
    plot_colors = "ryb"
    cmap = pl.cm.RdYlBu
    plot_step = 0.02  # fine step width for decision surface contours
    plot_step_coarser = 0.5  # step widths for coarse classifier guesses
    RANDOM_SEED = 9  # fix the seed so every model sees the same shuffle

    plot_idx = 1

    models = [DecisionTreeClassifier(max_depth=None),
              RandomForestClassifier(n_estimators=n_estimators),
              ExtraTreesClassifier(n_estimators=n_estimators),
              AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                 n_estimators=n_estimators)]

   
    for model in models:
        # Unlike the scikit-learn forest example, which plots pairs of features, we use all the features
        X = data
        y = targets

        # Shuffle
        idx = np.arange(X.shape[0])
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # Standardize
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        X = (X - mean) / std

        # Train: fit a fresh clone so the template estimators in models stay untouched
        clf = clone(model)
        clf = clf.fit(X, y)

        scores = clf.score(X, y)  # note: scored on the same data the model was fit on
        # Create a title for each column and the console by using str() and
        # slicing away useless parts of the string
        model_title = str(type(model)).split(".")[-1][:-2][:-len("Classifier")]
        model_details = model_title
        if hasattr(model, "estimators_"):
            model_details += " with {} estimators".format(len(model.estimators_))
        print model_details + " with all features has a score of", scores

    ###################### Commented out plotting ############################################
    #     pl.subplot(3, 4, plot_idx)
    #     if plot_idx <= len(models):
    #         # Add a title at the top of each column
    #         pl.title(model_title)

    #     # Now plot the decision boundary using a fine mesh as input to a
    #     # filled contour plot
    #     x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    #     y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    #     xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
    #                          np.arange(y_min, y_max, plot_step))

    #     # Plot either a single DecisionTreeClassifier or alpha blend the
    #     # decision surfaces of the ensemble of classifiers
    #     if isinstance(model, DecisionTreeClassifier):
    #         Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    #         Z = Z.reshape(xx.shape)
    #         cs = pl.contourf(xx, yy, Z, cmap=cmap)
    #     else:
    #         # Choose alpha blend level with respect to the number of estimators
    #         # that are in use (noting that AdaBoost can use fewer estimators
    #         # than its maximum if it achieves a good enough fit early on)
    #         estimator_alpha = 1.0 / len(model.estimators_)
    #         for tree in model.estimators_:
    #             Z = tree.predict(np.c_[xx.ravel(), yy.ravel()])
    #             Z = Z.reshape(xx.shape)
    #             cs = pl.contourf(xx, yy, Z, alpha=estimator_alpha, cmap=cmap)

    #     # Build a coarser grid to plot a set of ensemble classifications
    #     # to show how these are different to what we see in the decision
    #     # surfaces. These points are regularly space and do not have a black outline
    #     xx_coarser, yy_coarser = np.meshgrid(np.arange(x_min, x_max, plot_step_coarser),
    #                                          np.arange(y_min, y_max, plot_step_coarser))
    #     Z_points_coarser = model.predict(np.c_[xx_coarser.ravel(), yy_coarser.ravel()]).reshape(xx_coarser.shape)
    #     cs_points = pl.scatter(xx_coarser, yy_coarser, s=15, c=Z_points_coarser, cmap=cmap, edgecolors="none")

    #     # Plot the training points, these are clustered together and have a
    #     # black outline
    #     for i, c in zip(xrange(n_classes), plot_colors):
    #         idx = np.where(y == i)
    #         pl.scatter(X[idx, 0], X[idx, 1], c=c, label=iris.target_names[i],
    #                    cmap=cmap)

    #     plot_idx += 1  # move on to the next plot in sequence

    # pl.suptitle("Classifiers on feature subsets of the Party Words dataset")
    # pl.axis("tight")

    # pl.show()

    logging.info('Finished')
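# The loop above scores each model on the same data it was trained on, which overstates accuracy.
# A minimal sketch of a held-out evaluation reusing the names built in main() (data, targets, models,
# clone, RANDOM_SEED); the 25% split size and the older-scikit-learn import path are assumptions:
from sklearn.cross_validation import train_test_split  # newer releases: sklearn.model_selection

X_tr, X_te, y_tr, y_te = train_test_split(data, targets, test_size=0.25, random_state=RANDOM_SEED)
for model in models:
    held_out_clf = clone(model).fit(X_tr, y_tr)  # fit on the training split only
    # if standardizing, compute the mean/std on X_tr alone and apply them to both splits
    print type(model).__name__, "held-out accuracy:", held_out_clf.score(X_te, y_te)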
Esempio n. 45
0
tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train)
tfidf_test = tfidftransformer.fit(counts_test).transform(counts_test)  # note: re-fitting here gives the test set its own idf weights

# Alternatively, let the two tf-idf matrices share the same vocabulary
# method 2: TfidfVectorizer
print '*************************\nTfidfVectorizer\n*************************'
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(sublinear_tf=True,
                     max_df=0.5,
                     stop_words='english')
tfidf_train_2 = tv.fit_transform(newsgroup_train.data)
tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)  # the second vectorizer reuses the vocabulary learned on the training set
tfidf_test_2 = tv2.fit_transform(newsgroups_test.data)
print "the shape of train is " + repr(tfidf_train_2.shape)
print "the shape of test is " + repr(tfidf_test_2.shape)
analyze = tv.build_analyzer()
tv.get_feature_names()  # the learned feature terms

# F1 combines precision and recall: 2 * (precision * recall) / (precision + recall)
def calculate_result(actual, pred):
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    print 'predict info:'
    print 'precision:{0:.3f}'.format(m_precision)
    print 'recall:{0:0.3f}'.format(m_recall)
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual, pred))

# Alternatively, use scikit-learn's pre-vectorized feature loader, fetch_20newsgroups_vectorized
print '*************************\nfetch_20newsgroups_vectorized\n*************************'
from sklearn.datasets import fetch_20newsgroups_vectorized
tfidf_train_3 = fetch_20newsgroups_vectorized(subset='train')
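# A more common alternative to method 2's shared-vocabulary trick is to fit the vectorizer once on
# the training texts and reuse the fitted object on the test texts; a minimal sketch under the same
# variable names as above (kept as-is, including the newsgroup_train / newsgroups_test spelling):
tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
tfidf_train_2 = tv.fit_transform(newsgroup_train.data)  # learns the vocabulary and idf weights from the training split
tfidf_test_2 = tv.transform(newsgroups_test.data)       # reuses them, so train and test columns line up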
Esempio n. 46
0
data_train = fetch_20newsgroups(subset='train', categories=categories,
                               shuffle=True, random_state=42)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                              shuffle=True, random_state=42)
"""
print 'data loaded'
import conversions as conv
from ersatzpg.utffile import utffile
special_terms = []
vocabulary = []
basic_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                   max_df=0.5,
                                   use_idf=False,
                                   stop_words='english')
basic_analyze = basic_vectorizer.build_analyzer()
with utffile('searchterms.csv') as f:
    for s in f:
        if s.startswith('<'):
            special_terms.append(s.strip().strip('<>'))  # strip whitespace/newline as well as the angle brackets
        else:
            vocabulary.append(s.decode('utf-8').strip())


def analyze(s):
    d = eval(s)  # parses the serialized record into a dict; ast.literal_eval would be safer if the strings are plain literals
    special_keys = []
    name = d['name']
    electoral_district_type = d['electoral_district_type']
    electoral_district_name = d['electoral_district_name']
    state = d['state']
Esempio n. 47
0
    def build_analyzer(self):  # comment this method out to get a plain TfidfVectorizer again
        # analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        analyzer = TfidfVectorizer.build_analyzer(self)
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
Esempio n. 48
0
y_test = np.array(nyt_labels[trainset_size + 1:len(nyt_labels)])

#print(X_train)

vectorizer = TfidfVectorizer(min_df=2,
                             ngram_range=(1, 2),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')

test_string = unicode(nyt_data[0])

print "Example string: " + test_string
print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string)
print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string))
print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string))


X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

svm_classifier = LinearSVC().fit(X_train, y_train)

y_svm_predicted = svm_classifier.predict(X_test)
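# The snippet stops at the prediction step; a minimal sketch of scoring those predictions against the
# held-out labels (assuming y_test is aligned with X_test as constructed above):
from sklearn import metrics

print "accuracy: {0:.3f}".format(metrics.accuracy_score(y_test, y_svm_predicted))
print metrics.classification_report(y_test, y_svm_predicted)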

from sklearn.metrics.pairwise import cosine_similarity
# A short example using the sentences above
words_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.2, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(1, 3))

words_matrix = words_vectorizer.fit_transform(sents)  # fit the vectorizer to the example sentences

# the shape is (number of sentences, number of vocabulary terms): one row per sentence, one column per term
print(words_matrix.shape)
print(words_matrix)

# this is how we get the 18 terms
analyze = words_vectorizer.build_analyzer()
print(analyze("Today (May 19, 2016) is his only daughter's wedding."))
print(analyze("Vito Corleone is the Godfather."))
print(analyze("Vito's youngest son, Michael, in a Marine Corps uniform, introduces his girlfriend, Kay Adams, to his family at the sprawling reception."))
all_terms = words_vectorizer.get_feature_names()
print(all_terms)
print(len(all_terms))

# sentences 1 and 2 have similarity 0; sentences 1 and 3 share "his"; sentences 2 and 3 share "Vito" - try changing "Vito's" in sentence 3 to "His" and watch the similarity matrix change
example_similarity = cosine_similarity(words_matrix)
example_similarity


# Now onto the fun part. Using the tf-idf matrix, you can run a slew of clustering algorithms to better understand the hidden structure within the synopses. I first chose k-means. K-means initializes with a pre-determined number of clusters (I chose 5). Each observation is assigned to a cluster (cluster assignment) so as to minimize the within cluster sum of squares. Next, the mean of the clustered observations is calculated and used as the new cluster centroid. Then, observations are reassigned to clusters and centroids recalculated in an iterative process until the algorithm reaches convergence.
# 
# I found it took several runs for the algorithm to reach a good optimum, as k-means is susceptible to getting stuck in local optima. Within a single run, scikit-learn declares convergence once the centroids move by less than the tol threshold (or max_iter iterations are reached); the local-optimum problem is usually handled by restarting from several random initializations (the n_init parameter) and keeping the run with the lowest within-cluster sum of squares (inertia).
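# A minimal sketch of that first clustering step; tfidf_matrix is an illustrative name for the
# synopsis-level tf-idf matrix described above, which is not shown in this excerpt:
from sklearn.cluster import KMeans

num_clusters = 5
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)  # restarts guard against local optima
clusters = km.fit_predict(tfidf_matrix)  # one cluster label per synopsis
print(km.inertia_)  # within-cluster sum of squares of the best restart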