Example no. 1
    def genfeature(self, ls_x):
        '''
        a. Shallow features
            1. number of words in the sentence (normalized)
            2. average number of characters in the words
            3. percentage of stop words
            4. minimum, maximum and average inverse document frequency
        :param ls_x: sentences X without label
        :return:
        '''

        vectorizer = TfidfVectorizer(stop_words='english',smooth_idf=True, sublinear_tf=False,
                 use_idf=True)
        tfidf = vectorizer.fit_transform(ls_x)
        array = tfidf.toarray()
        X = []
        append = X.append
        maxtoken = 0
        for idx,l in enumerate(ls_x):
            ws = l.split()
            maxtoken = max(len(ws),maxtoken)
            stops = 0.0  # fraction of stop words; stays 0.0 if the computation below fails
            try:
                stops = round(reduce(lambda x, y: x + 1 if y in self.tweetmanager.stop else x, ws, 0) / (len(ws) + 1e-10), 2)
            except Exception:
                pass

            append([len(ws),self.avgch(ws), stops,
                    min(array[idx]), max(array[idx]), sum(array[idx])/len(array[idx])])

        return [[round(x[0]*1.0/maxtoken,2)] + x[1:]  for x in X]
Example no. 2
def cosine_similarity(city1_content, city2_content):
    """Determines the tf-idf (term frequency-inverse document frequency) and then
    calculates the cosine similarity between between the two Wikipedia pages."""

    vectorizer = TfidfVectorizer(tokenizer=generate_stemmed_tokens, stop_words='english')
    tfidf = vectorizer.fit_transform([city1_content, city2_content])
    return ((tfidf * tfidf.T).A)[0, 1]
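Since TfidfVectorizer l2-normalizes every row by default, the dot product (tfidf * tfidf.T).A[0, 1] above is exactly the cosine similarity of the two documents. A minimal self-contained sketch (the two sample strings below are placeholders, not Wikipedia content) checking this against sklearn's own cosine_similarity:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["london is the capital of england", "paris is the capital of france"]
tfidf = TfidfVectorizer(stop_words='english').fit_transform(docs)

dot_product = (tfidf * tfidf.T).A[0, 1]                 # rows are l2-normalized, so this is a cosine
explicit = cosine_similarity(tfidf[0], tfidf[1])[0, 0]  # the same value computed explicitly
assert abs(dot_product - explicit) < 1e-9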
Example no. 3
File: lr.py Project: chen33/nlp
def getData():
	train_data= load_files('dataset/train')    
	test_data=load_files("dataset/test")
	count_Vec=TfidfVectorizer(min_df=1,decode_error="replace")
	doc_train=count_Vec.fit_transform(train_data.data)
	doc_test=count_Vec.transform(test_data.data)
	return doc_train.toarray(),train_data.target,doc_test.toarray(),test_data.target
def trainClassifier(classifier, X, y):
	vectorizer = TfidfVectorizer(analyzer='char', use_idf=True, sublinear_tf=True, stop_words='english', ngram_range=(1,3), lowercase=True)
	# vectorizer = TfidfVectorizer()
	X = vectorizer.fit_transform(X).toarray()
	if classifier == "SVC":
		clf = LinearSVC()
		# parameters = {'kernel':['linear', 'rbf'], 'C':[0.1, 1, 10]}
		# clf = grid_search.GridSearchCV(clf, parameters)
		clf.fit(X, y)
		# print clf.best_params_
		clf_vect = [clf, vectorizer]
		f = open('svm.pkl', 'wb')
		pickle.dump(clf_vect, f)
		return clf, vectorizer
	elif classifier == "RF":
		clf = RandomForestClassifier()
		clf.fit(X, y)
		return clf, vectorizer
	elif classifier == "MNB":
		clf = MultinomialNB()
		clf.fit(X, y)
		return clf, vectorizer
	elif classifier == "LDA":
		clf = LDA()
		clf.fit(X, y)
		return clf, vectorizer
	elif classifier == "KNN":
		clf = KNeighborsClassifier()
		clf.fit(X, y)
		return clf, vectorizer
def tfidf_vectorize(corpus) :    
    ## This object does all the job. For more information about
    ## the semantics of the arguments, please read the documentation at
    ## http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    ## TfidfVectorizer takes care of the stop-words.
    corpus_specific_stopwords.extend(ENGLISH_STOP_WORDS)
    vectorizer = TfidfVectorizer(max_df = 0.5, 
                                 token_pattern='[A-Za-z]{3,}', # we restrict to words with 3 or more characters
                                 min_df = 50, # words occurring less than this number of times are discarded
                                 stop_words= corpus_specific_stopwords, # We use a standard set of stop words for the English language plus some words we already identified
                                 use_idf = True # Use tf-idf, i.e., term frequency weighted by the inverse of the term's document frequency
                                 )    
    ## We store the documents and the text of the articles in different lists.
    texts = []
    documents = []

    ## Iterating the corpus
    for doc in corpus :
        texts.append(doc.text)
        ## We store the title and the category as a pair
        ## The category is the topic of the article. We will use
        ## it as ground truth when calculating purity and entropy
        documents.append(doc)
        
    ## This call constructs the document-term matrix. It returns a sparse matrix of
    ## type http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.html
    doc_term_matrix = vectorizer.fit_transform(texts);
    ## We return the ground truth clustering, the document-term matrix and the vectorizer object
    return documents, doc_term_matrix, vectorizer
    def get_data(self, abstract=False):
        data = self.mongo.get_all(order_by='id_doc')
        data = [doc for doc in data]

        if abstract:
            only_text = self.get_data_with_abstract(data)
        else:
            only_text = [doc['text'] for doc in data]

        only_labels = [doc['label'] for doc in data]
        tfidf_vectorizer = TfidfVectorizer(max_df=0.5,
                                           max_features=200000,
                                           min_df=2,
                                           stop_words='english',
                                           strip_accents='unicode',
                                           use_idf=True,
                                           ngram_range=(1, 1),
                                           norm='l2',
                                           tokenizer=TextUtils.tokenize_and_stem)
        tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
        print 'After tfidf vectorizer: found %s documents and %s terms' \
              % (tfidf_matrix.shape[0], tfidf_matrix.shape[1])
        dict_out = {}
        for l in sorted(set(only_labels)):
            dict_out[l] = {
                'docs': [],
                'fscore': ''
            }
        for doc in data:
            dict_out[doc['label']]['docs'].append(doc['id_doc'])

        return tfidf_matrix, dict_out
def preprocess(article_file, lable_file, k):

    features = pickle.load(open(article_file))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    lables = pickle.load(open(lable_file))
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)
    # print le.inverse_transform([0])

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector : SelectPercentile
    selector = SelectPercentile(f_classif, percentile=k)
    selector.fit(features_train_transformed, lables)

    # selector : chi2
    # selector = SelectPercentile(score_func=chi2)
    # selector.fit(features_train_transformed, lables)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
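Because the fitted vectorizer, selector and label encoder are returned, unseen articles can be pushed through the same chain. A hedged usage sketch (the file paths, the sample string and the name X_new are assumptions, not part of the original project):

features_train, labels_train, vectorizer, selector, le, raw_features = preprocess("articles.pkl", "labels.pkl", k=10)

new_docs = ["some unseen article text"]
X_new = selector.transform(vectorizer.transform(new_docs)).toarray()
# numeric predictions from a classifier trained on features_train could then be mapped
# back to the original label names with le.inverse_transform(...)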
Example no. 8
def tfidf(synopses):
    tfidf_vectorizer=TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2, stop_words='english', use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
    terms=tfidf_vectorizer.get_feature_names()
    print("terms:",terms)
    print(tfidf_matrix.shape)
    return terms, tfidf_matrix  # return the tf-idf matrix
Example no. 9
def compute_tf_idf_vectorizer(data_path="/Users/HyNguyen/Documents/Research/Data/stories", save_path="exsum/tf_idf_vectorizer_200_05.pickle", min_df = 200, max_df = 0.5):
    """
    Detail:
    Params:
        data_path: data directory
        save_path: where the fitted vectorizer is saved; the suffix 200_05 means min_df=200, max_df=0.5
        min_df: lower bound
        max_df: upper bound
    """
    dataset = loadData(data_path)
    documents = []
    for counter, sample in enumerate(dataset):
        filename, contents, highlights = sample
        content_str = ""
        for content in contents:
            if content[-1] != ".":
                content += "."
            content_str += " " + content
        documents.append(content_str)

    tf_idf_vectorizer = TfidfVectorizer(max_df=max_df,min_df=min_df,stop_words=stopwords.words('english'))
    tf_idf_vectorizer.fit(documents)

    with open(save_path, mode="wb") as f:
        pickle.dump(tf_idf_vectorizer,f)

    print ("Tf-idf Vectorizer: length of vocabulary: ", len(tf_idf_vectorizer.vocabulary))
Example no. 10
def t_test_accuracy(topic_id, n_runs, estimator_params_votes_per_doc_tuples):
  """ Test if accuracy for estimators with given parameters is
      significantly better than that of the first estimator in the tuple
  """
  texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id]
  vectorizer = TfidfVectorizer()
  text_similarity = cosine_similarity(vectorizer.fit_transform(texts))

  accuracy_arrays = []
  for estimator, args, votes_per_doc in estimator_params_votes_per_doc_tuples:
    stop_idx = votes_per_doc * len(texts)
    # Now get n_runs accuracies and put them into numpy arrays
    accuracies = Parallel(n_jobs=4)( delayed(get_accuracy_sequence)(estimator, stop_idx, texts, 
        vote_lists, truths, text_similarity, idx, True, *args) for idx in xrange(n_runs) )
    accuracy_arrays.append( np.array( filter(lambda x: x is not None, accuracies) ) )

  # Baseline
  result_row = []
  result_row.append( "%0.2f" % np.mean(accuracy_arrays[0]) )
  # T-tests
  for accuracy_array in accuracy_arrays[1:]:
    _, pval = ttest_ind(accuracy_array, accuracy_arrays[0], equal_var=False)
    significance_indicator = lambda p: "*" if p < 0.01 else " "
    is_better = "$" if np.mean(accuracy_array) > np.mean(accuracy_arrays[0]) else " "
    result_row.append( "%0.2f %s %s" % (np.mean(accuracy_array), significance_indicator(pval), is_better))

  return "|".join(result_row)
Example no. 11
def MMR(docs, count):
    # Setup
    select_lst = [docs.pop(0)]
    candidates = []
    tfidf_vectorizer = TfidfVectorizer()
    relevance_weight = 0.9

    # Start recalculating scores
    while len(select_lst) != len(docs):
        select_sen = []
        for i in select_lst:
            select_sen.append(i.sentence)

        for candidate in docs:
            old_score = candidate.rating

            stemmed_sen = stemming([candidate])
            stemmed_lst = stemming(select_lst)
            tfidf_matrix = tfidf_vectorizer.fit_transform(stemmed_lst)
            target = tfidf_vectorizer.transform(stemmed_sen)
            similarities = cosine_similarity(target,tfidf_matrix).flatten()
            similarities.sort()
            similarity = similarities[-1]
            
            new_score = old_score * relevance_weight - similarity * (1 - relevance_weight)

            candidate.rating = new_score
            
        docs = sorted(docs, key=attrgetter("rating"), reverse=True)
        select_lst.append(docs.pop(0))

    return select_lst
Example no. 12
    def get_features(vocab):
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        # calculates the n most important topics of the bodies. Each topic contains all words, ordered by importance.
        # The more of a topic's important words a body contains, the higher its value for that topic.
        lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation_cos: fit and transform body")
        t0 = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - t0))

        print("latent_dirichlet_allocation_cos: transform head")
        # use the lda trained for body topics on the headlines => if the headlines and bodies share topics
        # their vectors should be similar
        lda_head_matrix = lda_body.transform(X_train_head)

        #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

        print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
        # calculate cosine distance between the body and head
        X = []
        for i in range(len(lda_head_matrix)):
            X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1)) #1d array is deprecated
            X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X
Example no. 13
def kmeans(content_list):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenize, \
    lowercase=False)
    '''
    tokenizer: the tokenization function to use
    lowercase: whether to lowercase all text before tokenizing; since this involves
    Chinese text processing, it is best set to False
    '''
    tfidf_matrix = tfidf_vectorizer.fit_transform(content_list)
    num_clusters = 20
    km_cluster = KMeans(n_clusters=num_clusters, max_iter=300, n_init=8, \
                        init='k-means++',n_jobs=8)
    '''
    n_clusters: the value of K
    max_iter: maximum number of iterations for a single initialization
    n_init: how many times the initial centroids are re-chosen
    init: the algorithm used to choose the initial centroids
    n_jobs: number of processes; -1 means use all CPUs
    Note that the computation for a single initialization always runs in a single process;
    parallelism only applies across different initializations. For example, with n_init=10 and n_jobs=40
    on a server with 20 CPUs that could start 40 processes, only 10 processes will actually be started.
    '''
    # return the cluster index assigned to each text
    result = km_cluster.fit_predict(tfidf_matrix)
    print "Predicting result: ", result
    return result
def read_examples(filename, sparm):
    """Parses an input file into an example sequence."""
    # This reads example files of the type read by SVM^multiclass.
    examples = []
    text = []
    count = 0
    # Open the file and read each example.
    for line in file(filename):
        # Get rid of comments.
        if '#' in line: line = line[:line.find('#')]
        target, tokens = line.split('::')[0], line.split('::')[1:]
        # If the line is empty, who cares?
        if not tokens: continue
        # Get the target.
        text.append(target)
        # Get the features.
        tokens = [t.split(':') for t in tokens]
        features = [(0,1)]+[(int(k),float(v)) for k,v in tokens]
        # Add the example to the list
        examples.append((svmapi.Sparse(features), count))
        count += 1
    # Print out some very useful statistics.
    vectorizer = TfidfVectorizer(stop_words='english')
    global tf_idf_transformed_matrix
    tf_idf_transformed_matrix = vectorizer.fit_transform(text)
    print len(examples),'examples read'
    return examples
Example no. 15
def createTDIDF():
    ## Bag of words
    with open("./data/movies.csv") as f:
        train_set1 = [line.lower().rstrip() for line in f]
    with open("./data/dvd.csv") as f:
        train_set2 = [line.lower().rstrip() for line in f]

    train_set = sorted(list(set(train_set1 + train_set2)))
    # Create dictionary to find movie
    dictTrain = dict()
    for i,movie in enumerate(train_set):
        dictTrain[movie] = i

    # Find weights
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)

    ## Tri-grams
    lenGram  = 3
    train_setBigrams = []
    for mov in train_set:
        temp = [mov[i:i+lenGram] for i in range(len(mov)-1)]
        temp = [elem for elem in temp if len(elem) == lenGram]
        train_setBigrams.append(' '.join(temp))

    train_setBigrams = sorted(list(set(train_setBigrams)))
    dictTrainBigrams = dict()
    for i,movie in enumerate(train_setBigrams):
        dictTrainBigrams[movie] = i
    tfidf_vectorizerBigrams = TfidfVectorizer()
    tfidf_matrix_trainBigrams = tfidf_vectorizerBigrams.fit_transform(train_setBigrams)

    return [tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram]
class MedicalKeywordTfIdf(BaseEstimator, TransformerMixin):
    MEDICAL_KEYWORDS = ["Medical_Keyword_" + str(i) for i in range(1, 49)]

    def __init__(self):
        self._vec = TfidfVectorizer(max_df=0.95, min_df=2)

    def get_feature_names(self):

        return [x + "_TFIDF" for x in self._vec.get_feature_names()]

    def get_data_array(self, df):

        return df[self.MEDICAL_KEYWORDS] \
            .apply(lambda x: " ".join(x[x == 1].index), axis=1).values

    def fit(self, df, y=None):
        data_arr = self.get_data_array(df)
        self._vec.fit(data_arr)

        return self

    def transform(self, df):
        data_arr = self.get_data_array(df)

        return self._vec.transform(data_arr).toarray()
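Because the class follows the scikit-learn fit/transform convention, it can be used on its own or inside a Pipeline. A hedged, self-contained sketch with a tiny made-up frame of binary Medical_Keyword_* columns (the data below is an assumption, not the original dataset, and get_feature_names assumes the older scikit-learn API the class itself relies on):

import pandas as pd

cols = ["Medical_Keyword_" + str(i) for i in range(1, 49)]
df = pd.DataFrame(0, index=range(4), columns=cols)
df.loc[0, ["Medical_Keyword_1", "Medical_Keyword_2"]] = 1
df.loc[1, ["Medical_Keyword_1", "Medical_Keyword_3"]] = 1
df.loc[2, ["Medical_Keyword_2", "Medical_Keyword_3"]] = 1   # keywords 1-3 each occur in two rows, satisfying min_df=2

vec = MedicalKeywordTfIdf().fit(df)
print(vec.get_feature_names())    # ['medical_keyword_1_TFIDF', 'medical_keyword_2_TFIDF', 'medical_keyword_3_TFIDF']
print(vec.transform(df).shape)    # (4, 3)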
def preprocess(word_data, targets):
    print("\n### PREPROCESSING DATA ###")

    # vectorize
    print("-- Vectorization")
    vectorizer = TfidfVectorizer(sublinear_tf=True)  # , stop_words='english'
    data_transformed = vectorizer.fit_transform(word_data)

    # feature selection
    print("-- Feature Selection")
    selector = SelectPercentile(percentile=5)
    data_selected = selector.fit_transform(data_transformed, targets)
    if data_selected.shape[1] == 0:
        data_selected = data_transformed
    else:
        print("Top {} features were selected".format(data_selected.shape[1]))

        # print top features
        nr_features = 30
        i = selector.scores_.argsort()[::-1][:nr_features]
        top_features = np.column_stack((np.asarray(vectorizer.get_feature_names())[i],
                                        selector.scores_[i],
                                        selector.pvalues_[i]))
        print("\nTop %i Features:" % nr_features)
        print(pd.DataFrame(top_features, columns=["token", "score", "p-val"]), "\n")

    features_train, features_test, labels_train, labels_test = \
        train_test_split(data_selected, targets, test_size=0.2, stratify=targets)

    return features_train, features_test, labels_train, labels_test
Example no. 18
def test_tfidfvectorizer_invalid_idf_attr():
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(JUNK_FOOD_DOCS)
    copy = TfidfVectorizer(vocabulary=vect.vocabulary_, use_idf=True)
    expected_idf_len = len(vect.idf_)
    invalid_idf = [1.0] * (expected_idf_len + 1)
    assert_raises(ValueError, setattr, copy, 'idf_', invalid_idf)
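The test above asserts that assigning an idf_ vector whose length does not match the supplied vocabulary raises ValueError. The counterpart, sketched below with made-up documents, is the transfer the setter is meant to allow: copying a fitted vocabulary together with an idf_ vector of the matching length, so the copy can reuse the fitted idf weights without being refitted (a hedged sketch, not part of the original test file):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["pizza pizza beer", "pizza beer copyright", "the the copyright"]
vect = TfidfVectorizer(use_idf=True)
vect.fit(docs)

copy = TfidfVectorizer(vocabulary=vect.vocabulary_, use_idf=True)
copy.idf_ = vect.idf_   # same length as the vocabulary, so the assignment is accepted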
Example no. 19
def readFile(filename):
    
    global vectorizer
    
    train_data = pd.read_csv(filename, header=0, delimiter='\t', quoting=3)
    train_size = train_data.shape[0]
    
    
    
    clean_train = []
    for i in xrange(0,train_size):
        clean_train.append(filter(train_data['review'][i]))
        #if i%1000 ==0:
        #    print '%d reviews processed...' %i
   
    
    #vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
    if vectorizer==None:
        vectorizer = TfidfVectorizer(sublinear_tf=True,max_df=0.5, max_features = 50000)
        train_data_feature = vectorizer.fit_transform(clean_train)
    else:
        vec = TfidfVectorizer(vocabulary=vectorizer.vocabulary_)
        train_data_feature = vec.fit_transform(clean_train)
        

    print train_data_feature.shape
    if 'test' in filename:
        return train_data['id'], train_data_feature
    else:
        return train_data['id'], train_data_feature, train_data['sentiment']
Example no. 20
def get_peronalpreference_vectors(vocab, user_pref_values):
    vectorizer = TfidfVectorizer(vocabulary=vocab, lowercase=False)
    vectors = vectorizer.fit_transform(user_pref_values).toarray()
    words = vectorizer.get_feature_names()
    # idf = vectorizer.idf_
    # print dict(zip(vectorizer.get_feature_names(), idf))
    return words, vectors
Example no. 21
def main():
    if os.path.exists(args.out_svd_result_matrix):
        print("Loading SVD matrix from file")
        X = np.load(args.out_svd_result_matrix)
        print("Loading corpus")
        _, file_index = LoadCorpus(args.training_dir)
    else:
        print("Loading corpus")
        corpus, file_index = LoadCorpus(args.training_dir)
        print("Building TF-IDF")
        tf_idf = TfidfVectorizer(input="content", lowercase=False)
        X = tf_idf.fit_transform(corpus)
        del corpus
        print("Running LSA")
        svd = TruncatedSVD(args.dimentionality)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)
        print("Saving SVD results")
        np.save(args.out_svd_result_matrix, X)
    if (
        os.path.exists(args.out_inv_idx)
        and os.path.exists(args.out_unique_kmeans_labels)
        and os.path.exists(args.out_idx)
    ):
        print("Loading labels")
        unique_labels = np.load(args.out_unique_kmeans_labels)
        inv_idx = np.load(args.out_inv_idx)
        idx = np.load(args.out_idx)
        unique_X = X[idx]
    else:
        print("Unique matrix")
        b = np.ascontiguousarray(X).view(np.dtype((np.void, X.dtype.itemsize * X.shape[1])))
        _, idx, inv_idx = np.unique(b, return_index=True, return_inverse=True)
        print("Saving inv_idx")
        np.save(args.out_inv_idx, inv_idx)
        print("Saving idx")
        np.save(args.out_idx, idx)
        unique_X = X[idx]
        print("Running K-Means")
        unique_labels, _ = KMeans(unique_X)
        print("Save unique K-Means labels")
        np.save(args.out_unique_kmeans_labels, unique_labels)
    print("Re-label non-unique")
    labels = unique_labels[inv_idx]

    for l in range(unique_labels.max() + 1):
        out_filename = args.out_unique_distance_matrix_prefix + str(l) + ".npy"
        if os.path.exists(out_filename):
            continue
        print("Calculating distance matrix for label:", l)
        D = CalcDistances(unique_labels, l, unique_X)
        print("Saving to distance matrix to file")
        np.save(out_filename, D)

    if not os.path.exists(args.out_corpus_index):
        print("Calculating corpus index")
        corpus_index = GetCorpusIndex(file_index, labels, unique_labels, inv_idx)
        print("Saving corpus index")
        json.dump(corpus_index, open(args.out_corpus_index, "w"))
class RecommenderNB:
	min_score = None
	stop_words = ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","knows","known","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than"
,"thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves"]
	
	def __init__(self, num_hashtags=40):
		RecommenderNB.min_score = float(1/(float(num_hashtags)-1.0))
		self.tl = TweetLib()
		print "Generating classifier ... "
		documents = self.tl.get_hashtag_documents(num_hashtags)
		corpus = [b for a, b in documents]
		self.hashtags = [a for a,b in documents]
		all_classes = range(len(documents))
		self.vectorizer = TfidfVectorizer(stop_words='english')
		self.xtrain = self.vectorizer.fit_transform(corpus)
		self.ytrain = all_classes
		self.parameters = {'alpha': 0.01}
		self.clf = MultinomialNB(**self.parameters).partial_fit(self.xtrain, self.ytrain, self.ytrain)
		print "Classifier has been generated..."

	def recommend(self, tweet):
		tweet = " ".join([w.lower() for w in tweet.split() if not w.lower() in RecommenderNB.stop_words])
		xtest = self.vectorizer.transform([tweet])
		pred = self.clf.predict_proba(xtest)[0]
		sorted_pred = sorted(enumerate(pred), key=lambda x:x[1])
		max_score = max([b for a,b in sorted_pred])
		if max_score < RecommenderNB.min_score:
			return None
		else:
			return list(reversed([self.hashtags[i[0]] for i in sorted_pred]))
Example no. 23
    def gen_data(self, fname):
        """
        :fname : input file, every line means a single data
        :rtype : List[List[float]]: data matrix
        """
        
        lines = [ self.langConvHandler.convert(line.strip().lower()) for line in codecs.open(fname, "rb","utf-8") if len(line) > 6]
        # lines = list(set(lines))  # remove duplicates
        
        
        logging.info("number of data %d " % len(lines))
        cut_lines = [" ".join(jieba.cut(line)) for line in lines]

        # transform to tfidfVec
        tfidfVec = TfidfVectorizer(max_features = 3000)
        tfidf_data = tfidfVec.fit_transform(cut_lines)
        tfidf_data = tfidf_data.toarray()
       
        # save origin text
        with open("./output/origin_lines.txt", "wb") as fw:
            json.dump(lines, fw)
        
        # save vectorize data
        np.save("./output/tfidf.corpus.npy", tfidf_data)
        
        self.lines = lines
        self.tfidf_data = tfidf_data
Example no. 24
  def get_top_terms(self, stops=STOPS):

    # vectorize using word 1- to 3-grams
    vectorizer = TfidfVectorizer(stop_words=stops, ngram_range=(1,3))
    tfidf = vectorizer.fit_transform(self.docs)

    # enumerate feature names, i.e. the actual words
    self.feature_names = vectorizer.get_feature_names()

    # convert to dense array
    dense = tfidf.todense()

    # container for top terms per doc
    self.features = []

    for doc in dense:
      doc = doc.tolist()[0]

      # creates a list of tuples, (term_id, score)
      phrase_scores = [pair for pair in zip(range(0, len(doc)), doc) if pair[1] > 0]
      # feature_ids = sorted(phrase_scores, key=lambda t: t[1] * -1)
      doc_features = []

      for f_ in phrase_scores:
        fname = self.feature_names[f_[0]]
        fscore = f_[1]
        doc_features.append((fscore, fname))

      top_terms = sorted(doc_features, reverse=True) #[:n_terms]
      # top_terms = ",".join([ x[1] for x in top_terms ])
      self.features.append(top_terms)
Example no. 25
def tfidf_ize(train, test, node_info):
    vectorizer = TfidfVectorizer(ngram_range=(1,1))
    vectorizer.fit(node_info.abstract.as_matrix())
    
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp22'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1) # Intersection kernel
        table.loc[:, 'temp23'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp24'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
                        + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    
    vectorizer = TfidfVectorizer(ngram_range=(2,2))
    vectorizer.fit(node_info.abstract.as_matrix())
    
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp27'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1) # Intersection kernel
        table.loc[:, 'temp28'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp29'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
                        + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    
    return train, test
def classify(clf, chapter_contents_train, y_train, chapter_contents_test,k=20):
    # convert the training data text to features using TF-IDF vectorization
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
    X_train = vectorizer.fit_transform(chapter_contents_train)
    # X_train_array = X_train.toarray()
    # print "tfidf vector length: ", len(X_train_array) #dbg
    # print "X_train_array[0] length: ", len(X_train_array[0]) #dbg

    # use only the best k features according to chi-sq selection
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)

    # determine the actual features used after best-k selection
    feature_names = np.asarray(vectorizer.get_feature_names())
    chisq_mask = ch2.get_support()
    features_masks = zip(feature_names,chisq_mask)
    selected_features = [z[0] for z in features_masks if z[1]]

    # train the classifier
    clf.fit(X_train, y_train)

    # convert the test data text into features using the same vectorizer as for training
    X_test = vectorizer.transform(chapter_contents_test)
    X_test = ch2.transform(X_test)

    # obtain binary class predictions for the test set
    preds = clf.predict(X_test)
    return preds, selected_features, clf
Example no. 27
def train_classifier(download=True, parameters=None, ngram_range=(1, 1)):
    """Train the intent classifier."""
    if download:
        download_wiki()

    path = os.path.join(l.TOPDIR, 'train.json')
    training_set = json.load(open(path))
    path = os.path.join(l.TOPDIR, 'wiki.json')
    wiki_set = json.load(open(path))

    target_names = list(set([i['unit'] for i in training_set + wiki_set]))
    train_data, train_target = [], []
    for example in training_set + wiki_set:
        train_data.append(clean_text(example['text']))
        train_target.append(target_names.index(example['unit']))

    tfidf_model = TfidfVectorizer(sublinear_tf=True,
                                  ngram_range=ngram_range,
                                  stop_words='english')

    matrix = tfidf_model.fit_transform(train_data)

    if parameters is None:
        parameters = {'loss': 'log', 'penalty': 'l2', 'n_iter': 50,
                      'alpha': 0.00001, 'fit_intercept': True}

    clf = SGDClassifier(**parameters).fit(matrix, train_target)
    obj = {'tfidf_model': tfidf_model,
           'clf': clf,
           'target_names': target_names}
    path = os.path.join(l.TOPDIR, 'clf.pickle')
    pickle.dump(obj, open(path, 'w'))
Example no. 28
    def fit(self, docs, clean=False):
        '''
        pipeline: clean, tokenize, tfidf, nmf, kmeans
        '''

        if clean:
            print 'cleaning raw docs ......'
            clean_docs = self.clean(docs)
        else:
            clean_docs = docs

        print 'running tfidf ......'
        if 'tokenizer' not in self.kw_tfidf:
            self.tfidf = TfidfVectorizer(tokenizer=self.tokenize,
                                         **self.kw_tfidf)
        else:
            self.tfidf = TfidfVectorizer(**self.kw_tfidf)
        X = self.tfidf.fit_transform(clean_docs)

        print 'running NMF ......'
        self.nmf = NMF(**self.kw_nmf)
        H = self.nmf.fit_transform(X)
        W = self.nmf.components_

        print 'fetching top 50 words for each topic ......'
        self.top_n_words(50, W)

        return X, H, W
Example no. 29
def tfidf_covariance(texts, savepath):
    if not savepath.endswith("/"):
        savepath = savepath + "/"
    if os.path.exists(savepath + "__linkage_average.npy"):
        Z = np.load(savepath + "__linkage_average.npy")
    else:
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(input = str,
                                 strip_accents = 'ascii',
                                 analyzer ='word',
                                 max_features=5000)
        y = vectorizer.fit_transform(" ".join(text) for text in texts)
        Z = linkage(y.todense(), method='average', metric='euclidean')
        np.save(savepath + "__linkage_average.npy", Z)

    if os.path.exists(savepath + "__covariance__.npy"):
        Cov = np.load(savepath + "__covariance__.npy")
        observables = HierarchicalObservation(Cov)
    else:
        root, nodes = to_tree(Z, rd=True)
        assign_parents(root)
        adj_mat = get_adjacency_matrix(nodes)
        deg_mat = get_degree_matrix(nodes)
        sigma = 5
        laplacian = np.diag(deg_mat) - adj_mat + 1/(sigma**2) * np.eye(len(deg_mat))
        Cov = np.linalg.inv(laplacian)[:len(texts), :len(texts)]
        np.save(savepath + "__covariance__.npy", Cov)
        observables = HierarchicalObservation(Cov)
    return observables
Example no. 30
def simple_tfidf_alldocs():
	qs = Posts.objects.all()
	docs,post_index_map = vectorize_docs(n_samples=n_samples,log_batch_size=log_batch_size, qs=qs) #Get the doc bodies
	tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features = n_features_init,ngram_range=(1,n_gram),max_df=0.8)
	tfidf_matrix_raw = tfidf_vectorizer.fit_transform(docs) #docs x n-gram-features
	tfidf_matrix_scaled = scale(tfidf_matrix_raw, with_mean = False) #Can't use sparse matrices unless with_mean=False
	return tfidf_matrix_scaled, post_index_map
Example no. 31
labels_for_removal=[k for i,k in enumerate(ngramMat.columns) if "cwjobscouk" in k] + \
       [k for i,k in enumerate(ngramMat.columns) if "such" in k] + \
       [k for i,k in enumerate(ngramMat.columns) if "please" in k] + \
       [k for i,k in enumerate(ngramMat.columns) if "job" in k] + \
       [k for i,k in enumerate(ngramMat.columns) if "london" == k] + \
       [k for i,k in enumerate(ngramMat.columns) if "be" == k] + \
       [k for i,k in enumerate(ngramMat.columns) if "is" == k] + \
       [k for i,k in enumerate(ngramMat.columns) if "are" == k] + \
       [k for i,k in enumerate(ngramMat.columns) if "more" == k]

ngramMat.drop(labels_for_removal, inplace=True, axis=1)

#%% add R and C++. python filters out single-character words and punctuation.
num_docs = ngramMat.shape[0]

ngram_vectorizer = TfidfVectorizer(analyzer='char',ngram_range=(1, 3), min_df=1,sublinear_tf=True,lowercase=False)
tf = ngram_vectorizer.fit_transform(job_docs)
fnames = ngram_vectorizer.get_feature_names()
dense = tf.todense()

Cpp = [i for i,k in enumerate(fnames) if "C++" == k]
new=np.reshape(np.array(dense[:,Cpp]),num_docs)
ngramMat['C++'] = pd.Series(new,index=ngramMat.index)

R = [i for i,k in enumerate(fnames) if " R" == k]
new=np.reshape(np.array(dense[:,R]),num_docs)
ngramMat['R'] = pd.Series(new,index=ngramMat.index)

#%% remove duplicate docs

DM_docs = cosine_distances(ngramMat)
Example no. 32
        continue
    count+=1
    if count==185:
        break
#     post=col[5]+" "+col[6]+" "+col[7]+" "+col[8]+" "+col[9]+" "+col[10]+" "+col[11]+" "+col[12]+" "+col[13]+" "+col[14]
#     trainData.append(post)
    for i in range(10):
        trainData.append(col[i+5])
        if col[3]=='Female':
            t=str(col[4])+"0"
        else:
            t=str(col[4])+"1"
        trainTarget.append(int(t))

# Creating input feature vector using TfidfVectorizer
vectorizer=TfidfVectorizer(use_idf=True, token_pattern='[^ \n,".\':()ঃ‘?’।“”!;a-zA-Z0-9#০১২৩৪৫৬৭৮৯*&_><+=%$-`~|^·]+') #০১২৩৪৫৬৭৮৯
trainData=vectorizer.fit_transform(trainData)
features=vectorizer.get_feature_names()

# Initializing the Support Vector Machine model
model = svm.SVC(kernel='linear', C=1, gamma=1)

# Analyzing with 5 iteration
for i in range(5):
    x_train, x_test, y_train, y_test = train_test_split(trainData, trainTarget, test_size=0.3)
    # Fitting Support Vector Machine model with trainData and trainTarget
    model.fit(x_train, y_train)
    predicted2 = model.predict(x_test)
    count2 = 0
    for i in range(len(predicted2)):
        if (predicted2[i]-y_test[i])==0:
Example no. 33
#Word Cloud (not working)
wordcloud = WordCloud(max_font_size=40).generate(text)

plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

#topic modeling
#maybe use the final list as it is already encoded
#Vectorize the text and
#Make pairwise document distance based on TF-IDF
#check unique words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
dtm = vectorizer.fit_transform(final)
#print(dtm.shape)
vocab = vectorizer.get_feature_names()  # list of unique vocab, we will use this later
print(len(vocab), '# of unique words')
#print vocab[-10:]
#print vocab[:10]

#NMF Decomposition using term-document matrix
from sklearn import decomposition

#print 'num of documents, num of unique words'
#print dtm.shape

num_topics = 5
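The snippet stops right before the factorization itself. A hedged sketch of how the NMF step is typically completed, reusing dtm, vocab and num_topics from above (the names nmf and doctopic and the choice of ten top words are illustrative assumptions, not the original author's code):

nmf = decomposition.NMF(n_components=num_topics, random_state=1)
doctopic = nmf.fit_transform(dtm)        # documents x topics

# ten highest-weighted words per topic
for topic_idx, topic in enumerate(nmf.components_):
    top_words = [vocab[i] for i in topic.argsort()[:-11:-1]]
    print("Topic %d: %s" % (topic_idx, ", ".join(top_words)))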
Example no. 34
def create_bow_model(wdir, corpusdir, outfile, **kwargs):
    """
	Creates a bow model (a matrix of token counts) from a collection of full text files.
	
	Arguments:
	
	wdir (str): path to the working directory
	corpusdir (str): relative path to the input directory (the collection of text files)
	outfile (str): relative path to the output file (the bow matrix)
	
	optional:
	mfw (int): how many of the most frequent terms to use, if this is 0, all the terms are used 
	mode (str): should the counts be normalized? options: "count" (default), "tf-idf"
	vocab_file (bool): if True, the vocabulary of the corpus is stored as a list in a text file
	stopword_file (str): relative path to a file containing a list of stop words
	"""

    print("creating bow model...")

    mfw = kwargs.get("mfw", 0)
    mode = kwargs.get("mode", "count")
    vocab_file = kwargs.get("vocab_file", False)
    stopword_file = kwargs.get("stopword_file")

    if stopword_file:
        stopwords = pd.read_csv(join(wdir, stopword_file), header=None)
        stopwords = list(stopwords.iloc[:, 0])

    if mode == "tf-idf":
        if mfw == 0:
            if stopword_file:
                vectorizer = TfidfVectorizer(input='filename',
                                             stop_words=stopwords)
            else:
                vectorizer = TfidfVectorizer(input='filename')
        else:
            if stopword_file:
                vectorizer = TfidfVectorizer(input='filename',
                                             max_features=mfw,
                                             stop_words=stopwords)
            else:
                vectorizer = TfidfVectorizer(input='filename',
                                             max_features=mfw)
    else:
        if mfw == 0:
            if stopword_file:
                vectorizer = CountVectorizer(input='filename',
                                             stop_words=stopwords)
            else:
                vectorizer = CountVectorizer(input='filename')
        else:
            if stopword_file:
                vectorizer = CountVectorizer(input='filename',
                                             max_features=mfw,
                                             stop_words=stopwords)
            else:
                vectorizer = CountVectorizer(input='filename',
                                             max_features=mfw)

    # possible parameters and attributes for the CountVectorizer:
    # lowercase by default
    # stop_words: for a list of stop words
    # token_pattern: regex denoting what constitutes a token
    # ngram_range: tuple (min_n,max_n)
    # analyzer: word, char, char_wb
    # max_df: default 1.0, float in range 0.0-1.0 or integer (absolute counts), ignore terms that have a document frequency higher than this
    # min_df: default 1, float or integer (absolute counts), ignore terms that have a document frequency lower than this, "cut-off"
    # max_features: only top max features ordered by term frequency across the corpus
    # vocabulary
    # attributes:
    # vocabulary_: a mapping of terms to feature indices
    # stop_words_: terms that were ignored because of max_features, max_df or min_df

    # possible parameters and attributes for the TfidfVectorizer:
    # see also above
    # use_idf: Enable inverse-document-frequency reweighting. Default: true
    # smooth_idf: Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions. Default: true
    # sublinear_tf: Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
    # idf_: The inverse document frequency (IDF) vector

    filenames = sorted(glob.glob(join(wdir, corpusdir, "*.txt")))

    # bow: sparse representation
    bow = vectorizer.fit_transform(filenames)
    bow = bow.toarray()

    #print(bow.size)
    #print(bow.shape)

    vocab = vectorizer.get_feature_names()

    if (vocab_file == True):
        vocab_fr = pd.DataFrame(data=vocab)
        vocab_fr.to_csv(join(wdir, "vocab.txt"),
                        encoding="UTF-8",
                        header=False,
                        index=False)
        print("created vocabulary file...")

    #print(vocab[:100])
    #exit()
    #print(vocab[:100])

    # save to file
    idnos = [re.split(r"\.", re.split(r"/", f)[-1])[0] for f in filenames]

    bow_frame = pd.DataFrame(columns=vocab, index=idnos, data=bow)
    bow_frame.to_csv(join(wdir, outfile), sep=",", encoding="utf-8")

    print("Done! Number of documents and vocabulary: ", bow.shape)
    print("Number of tokens: ", bow.sum())
Example no. 35
    ]
    # remove stop words
    contents = " ".join([txt for txt in contents if txt not in stopwords])
    print(contents)
    return contents


def main():
    path = '258'
    txt_processing(path)


if __name__ == '__main__':
    main()

tfidf = TfidfVectorizer()
x_train, x_test, y_train, y_test = train_test_split(contents,
                                                    class_list,
                                                    test_size=0.2)

# # save the arrays
# np.save('conttnes.npy',contents)
# np.save('class_list.npy',class_list)

X_train = tfidf.fit_transform(x_train)
X_test = tfidf.transform(x_test)

# Naive Bayes model
mulp = MultinomialNB()
mulp_NB = mulp.fit(X_train, y_train)
Example no. 36
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm

column = "word_seg"
train = pd.read_csv('../../data/raw_data/train_set.csv')
test = pd.read_csv('../../data/raw_data/test_set.csv')
test_id = test["id"].copy()
vec = TfidfVectorizer(ngram_range=(1, 2),
                      min_df=3,
                      max_df=0.9,
                      use_idf=1,
                      smooth_idf=1,
                      sublinear_tf=1)
trn_term_doc = vec.fit_transform(train[column])
test_term_doc = vec.transform(test[column])
fid0 = open('baseline.csv', 'w')

y = (train["class"] - 1).astype(int)
lin_clf = svm.LinearSVC()
lin_clf.fit(trn_term_doc, y)
preds = lin_clf.predict(test_term_doc)
i = 0
fid0.write("id,class" + "\n")
for item in preds:
    fid0.write(str(i) + "," + str(item + 1) + "\n")
    i = i + 1
fid0.close()
Example no. 37
            ### append the text to word_data
            word_data.append(parsed_email_stripped_common_words)

            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            if name == 'sara':
                from_data.append(0)
            elif name == 'chris':
                from_data.append(1)

            email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("your_word_data.pkl", "w") )
pickle.dump( from_data, open("your_email_authors.pkl", "w") )

print word_data[152]



### in Part 4, do TfIdf vectorization here

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = "english")
vectorizer.fit_transform(word_data)
vocab_list = vectorizer.get_feature_names()
print "num different words:", len(vocab_list)
print vocab_list[34597]
Example no. 38
Democrats pounced on President Trump’s criticism of Robart, with Democratic senators flatly saying the President’s comments will factor into the confirmation hearings for Supreme Court nominee Neil Gorsuch.

“Attack on federal judge from POTUS is beneath the dignity of that office. That attitude can lead America to calamity,” Washington Gov. Jay Inslee tweeted Saturday.

Attack on federal judge from POTUS is beneath the dignity of that office. That attitude can lead America to calamity.

— Governor Jay Inslee (@GovInslee) February 4, 2017

“The President’s attack on Judge James Robart, a Bush appointee who passed with 99 votes, shows a disdain for an independent judiciary that doesn’t always bend to his wishes and a continued lack of respect for the Constitution, making it more important that the Supreme Court serve as an independent check on the administration,” Senate Minority Leader Chuck Schumer said in a statement.

“With each action testing the Constitution, and each personal attack on a judge, President Trump raises the bar even higher for Judge Gorsuch’s nomination to serve on the Supreme Court. His ability to be an independent check will be front and center throughout the confirmation process.”

Vermont. Sen. Patrick Leahy, the ranking member of the Judiciary Committee, said President Trump’s “hostility toward the rule of law is not just embarrassing, it is dangerous.”

“We need a nominee for the Supreme Court willing to demonstrate he or she will not cower to an overreaching executive. This makes it even more important that Judge Gorsuch, and every other judge this president may nominate, demonstrates the ability to be an independent check and balance on an administration that shamefully and harmfully seems to reject the very concept.”

Robart’s order on Friday was a significant setback to President Trump’s ban and set up the nation for a second straight weekend of confusion about the policy’s legality.

The White House said Friday the Department of Justice will challenge the decision. In a statement, White House press secretary Sean Spicer initially called Robart’s order “outrageous” before quickly issuing another statement that dropped that word.

Robart has presided in the US District Court for the Western District of Washington state since 2004. He assumed senior status in 2016.

"""

documents = [news1, news2]

tfidf = TfidfVectorizer().fit_transform(documents)

pairwise_sim = tfidf * tfidf.T

print(pairwise_sim.A)
Example no. 39
documents = full_text.split()

# check the number of documents
print(len(documents))
print('\n')

# preprocess documents (remove special characters and lowercase everything)
for i in range(len(documents)):
	documents[i] = " ".join(documents[i].split())
	documents[i] = documents[i].replace(r"\[.*\]","")
	documents[i] = re.sub(r'([^\s\w]|_)+', '', documents[i])
	documents[i] = documents[i].lower()

# use tfidf to create word vector
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# try different k values
k = 7
model = KMeans(n_clusters=k, init='k-means++', max_iter=1000, n_init=1)
model.fit(X)
 
# print top terms per cluster (code snippet taken from python documentation online)
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :50]:
        print(' %s' % terms[ind]),
df['clean_documents'] = df['clean_documents'].fillna('').apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
df['clean_documents'] = df['clean_documents'].fillna('').apply(lambda x: x.lower())

# tokenization
tokenized_doc = df['clean_documents'].fillna('').apply(lambda x: x.split())

# de-tokenization
detokenized_doc = []
for i in range(len(df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

df['clean_documents'] = detokenized_doc

# TF-IDF vector
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
X = vectorizer.fit_transform(df['clean_documents'])

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=100, random_state=122)
lsa = svd_model.fit_transform(X)

# Documents - Topic vector
pd.options.display.float_format = '{:,.16f}'.format
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2"])
topic_encoded_df["documents"] = df['clean_documents']

topic_encoded_df.describe()

# display(topic_encoded_df[["documents", "topic_1", "topic_2"]])
Example no. 41
#
# https://www.cnblogs.com/pinard
#
# Permission given to modify the code as long as you keep this declaration at the top
#
# Text mining preprocessing with TF-IDF: https://www.cnblogs.com/pinard/p/6693230.html

# In[2]:

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "I come to China to travel", "This is a car polupar in China",
    "I love tea and Apple ", "The work is to write some papers in science"
]

vectorizer = CountVectorizer()

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
print(tfidf)

# In[4]:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf2 = TfidfVectorizer()
re = tfidf2.fit_transform(corpus)
print(re)
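As a hedged follow-up (not part of the original post): the two-step CountVectorizer + TfidfTransformer pipeline and the one-step TfidfVectorizer are equivalent for the same corpus and default parameters, which a small self-contained check makes explicit:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

corpus = [
    "I come to China to travel", "This is a car polupar in China",
    "I love tea and Apple ", "The work is to write some papers in science"
]
two_step = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(corpus))
one_step = TfidfVectorizer().fit_transform(corpus)
assert np.allclose(two_step.toarray(), one_step.toarray())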
Example no. 42
    print("%d categories" % len(categories))
    print()

    # split a training set and a test set
    y_train = data_train.target
    y_test = data_test.target

    print(
        "Extracting features from the training dataset using a sparse vectorizer"
    )
    t0 = time()
    vectorizer = TfidfVectorizer(encoding=ENCODING,
                                 use_idf=True,
                                 norm='l2',
                                 binary=False,
                                 sublinear_tf=True,
                                 min_df=0.001,
                                 max_df=1.0,
                                 ngram_range=(1, 2),
                                 analyzer='word',
                                 stop_words=None)

    # the output of the fit_transform (x_train) is a sparse csc matrix.
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" %
          (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print(
        "Extracting features from the test dataset using the same vectorizer")
Example no. 43
def tf_idf_classify(
    input_df,
    feature_column,
    label_column,
    test_size=.2,
    return_fscore=True,
    show_info=True,
):
    """

    :param input_df:
    :param feature_column:
    :param label_column:
    :param process_text:
    :param test_size:
    :param random_state:
    :param classifier: RF,DT,LSVC, LR,NB,KNN
    :param kwargs:
    :return:
    """
    df = input_df.copy()

    # nltk.download('stopwords')
    # nltk.download('wordnet')

    # stopword removal and lemmatization
    stopwords = nltk.corpus.stopwords.words('english')
    lemmatizer = WordNetLemmatizer()

    msk = np.random.rand(len(df)) < 1 - test_size
    train_df = df[msk]
    val_df = df[~msk]

    train_X = []
    test_X = []

    train_y = train_df[label_column].tolist()
    test_y = val_df[label_column].tolist()

    labels = list(df[label_column])
    labels = [str(l) for l in labels]

    # text pre processing
    for text in tqdm(train_df[feature_column]):
        review = re.sub('[^a-zA-Z]', ' ', text)
        review = review.lower()
        review = review.split()
        review = [
            lemmatizer.lemmatize(word) for word in review
            if not word in set(stopwords)
        ]
        review = ' '.join(review)
        train_X.append(review)

    # text pre processing
    for text in tqdm(val_df[feature_column]):
        review = re.sub('[^a-zA-Z]', ' ', text)
        review = review.lower()
        review = review.split()
        review = [
            lemmatizer.lemmatize(word) for word in review
            if not word in set(stopwords)
        ]
        review = ' '.join(review)
        test_X.append(review)

    # tf idf
    tf_idf = TfidfVectorizer()
    # fit the vectorizer and transform the training data
    X_train_tf = tf_idf.fit_transform(train_X)
    # re-transform of the training data (redundant; the result is unchanged)
    X_train_tf = tf_idf.transform(train_X)

    # transforming test data into tf-idf matrix
    X_test_tf = tf_idf.transform(test_X)

    # naive bayes classifier
    naive_bayes_classifier = MultinomialNB()
    naive_bayes_classifier.fit(X_train_tf, train_y)

    # predicted y
    y_pred = naive_bayes_classifier.predict(X_test_tf)

    f, cf = single_label_f_score(y_gold=test_y, y_pred=y_pred)

    if show_info:
        print('f-score:', f)
        print('label wise f-score', cf)
        conf_mat = confusion_matrix(test_y, y_pred)
        fig, ax = plt.subplots(figsize=(4, 4))
        labels = list(set(labels))
        sns.heatmap(conf_mat,
                    annot=True,
                    cmap="Blues",
                    fmt='d',
                    xticklabels=labels,
                    yticklabels=labels)
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.title("TFIDF CONFUSION MATRIX", size=16)
    if return_fscore:
        return f, cf
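
A hypothetical call sketch for tf_idf_classify; the DataFrame, its column names and the labels below are made-up placeholders, not data from the original project:

import pandas as pd

# nltk.download('stopwords'); nltk.download('wordnet')  # needed once for the function above

toy_reviews = pd.DataFrame({
    "text": ["great product, works perfectly",
             "terrible quality, broke after one day",
             "does exactly what was described",
             "awful, would not buy again"] * 5,
    "sentiment": ["pos", "neg", "pos", "neg"] * 5,
})

f, per_label_f = tf_idf_classify(toy_reviews,
                                 feature_column="text",
                                 label_column="sentiment",
                                 test_size=0.2,
                                 show_info=False)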
Esempio n. 44
0
class Reader:
    dir = os.getcwd()  # Gets the current working directory

    words_of_tweets = [
    ]  # Saves all the tweets, cleared of stop-words, stemmed and tokenized

    called_once = False  # Indicates if the GloVe model has been trained (read) or not

    onehot_encoder = CountVectorizer()

    scaler = MinMaxScaler(feature_range=(0, 1))

    tester = MinMaxScaler(feature_range=(0, 1))

    def dummy_fun(self, doc):
        return doc

    vectorizer = TfidfVectorizer(lowercase=False,
                                 analyzer='word',
                                 tokenizer=dummy_fun,
                                 preprocessor=dummy_fun)

    # min_df : float in range [0.0, 1.0] or int, default=1
    # When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
    # This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents,
    # integer absolute counts. This parameter is ignored if vocabulary is not None.
    vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)

    # sg: CBOW if 0, skip-gram if 1
    # ‘min_count’ is for neglecting infrequent words.
    # negative (int) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
    # window: number of context words on each side (if the window size is 3, the 3 words to the left and the 3 words to the right are considered)
    model = Word2Vec()

    # dm: DBOW if 0, distributed-memory if 1
    # window: number of context words on each side (if the window size is 3, the 3 words to the left and the 3 words to the right are considered)
    modeldoc = Doc2Vec()

    # GloVe model
    glove_model = {}

    # Feature Selection

    # Univariate_Selection
    test = SelectKBest(score_func=chi2, k=100)

    # Feature Extraction with RFE (Recursive Feature Elimination)
    rfe = RFE(model, 100)

    # Feature Extraction with PCA
    pca = PCA(n_components=100)

    # Feature Extraction with TruncatedSVD
    svd = TruncatedSVD(n_components=100)

    # Feature Importance with a tree-based classifier (a Random Forest is used here)
    sfm = RandomForestClassifier()
    models = SelectFromModel(sfm)

    train_A = None
    train_A_emoji = None
    train_A_emoji_hash = None
    train_B = None
    train_B_emoji = None
    train_B_emoji_hash = None

    input_A = None
    input_A_emoji = None
    input_B = None
    input_B_emoji = None

    ##############################################################################################################################################################

    # Pre-processing and convert the input using one hot encoding, TF-IDF and other encoders

    ##############################################################################################################################################################

    def tokenize(self, text):
        # Tokenize tweets
        words = word_tokenize(text)

        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        words = [w.translate(table) for w in words]

        # remove all tokens that are not alphabetic
        words = [word for word in words if word.isalpha()]

        # Delete Stop-Words
        whitelist = ["n't", "not"]  # Keep the words "n't" and "not"
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words or w in whitelist]
        stopwords_wordcloud = set(STOPWORDS)
        words = [
            w for w in words if w not in stopwords_wordcloud or w in whitelist
        ]

        return words

    # Print the counts of the top 85 most used words and print a graph with the words of the data set
    def wordcloud(self):
        stopwords_wordcloud = set(STOPWORDS)

        # Print the counts of the top 85 most used words in tweets

        vectorizer = CountVectorizer(analyzer='word',
                                     tokenizer=self.tokenize,
                                     lowercase=True,
                                     stop_words=stopwords_wordcloud,
                                     max_features=85)

        corpus_words = vectorizer.fit_transform(self.train_A['tweet'])
        corpus_words = corpus_words.toarray()
        vocab = vectorizer.get_feature_names()

        # Sum up the counts of each vocabulary word
        dist = np.sum(corpus_words, axis=0)

        # For each, print the vocabulary word and the number of times it
        # appears in the data set
        for tag, count in zip(vocab, dist):
            print(count, ' ', tag)

        # Print a scheme with most used words that are not stopwords
        wordcloud = WordCloud(background_color="black",
                              stopwords=stopwords_wordcloud,
                              random_state=500,
                              relative_scaling=1.0,
                              colormap='summer').generate(" ".join(
                                  [i for i in self.train_A['tweet']]))
        plt.figure(facecolor='k')
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.title("Most used words in tweets")
        plt.show()

    ##############################################################################################################################################################

    # Pre-processing of the tweets
    def pre_processing(self):
        # Feature Extraction
        data = Feature_Extraction.TwitterData_ExtraFeatures()
        data.build_features(self.train_A)
        self.extra_features = data.processed_data

        # Clearing training dataset and Integer Encoding

        self.train_A['tweet'] = self.train_A['tweet'].str.replace(
            'http\S+|www.\S+', '', case=False)  # Delete URLs
        self.train_A['tweet'] = self.train_A['tweet'].str.replace(
            r'@\S+', '', case=False)  # Delete Usernames
        self.train_A['tweet'] = self.train_A['tweet'].str.replace(
            r'#', ' ', case=False
        )  # Replace hashtags with a space so that words joined only by hashtags are split into separate tokens

        #        print('Average number of words per sentence: ', np.mean([len(s.split(" ")) for s in self.train_A.tweet]))

        for i in range(0, len(self.train_A)):
            # Tokenize tweets
            words = word_tokenize(self.train_A.iloc[i][2])

            # remove punctuation from each word
            table = str.maketrans('', '', string.punctuation)
            words = [w.translate(table) for w in words]

            # remove all tokens that are not alphabetic
            words = [word for word in words if word.isalpha()]

            # stemming of words
            porter = PorterStemmer()
            words = [porter.stem(word) for word in words]

            # Delete Stop-Words
            whitelist = ["n't", "not", 'nor', "nt"
                         ]  # Keep the words "n't" and "not", 'nor' and "nt"
            stop_words = set(stopwords.words('english'))
            words = [w for w in words if w not in stop_words or w in whitelist]

            # Keep the tokenized tweets
            self.words_of_tweets.append(words)

        # self.wordcloud() # Print number of 85 most used words and a scheme with most used words that are not stopwords

    ###############################################################################################################################################
    ###############################################################################################################################################

    # Select the proper encoding and Feature Selection
    # x_enc: training data set or test data set
    # train_test: whether x_enc is training set or test set
    # y: the irony labels of either the training set or the test set
    # dataset_index: the indexes of train set or test set
    # extra_features: Added features from feature extraction
    # feature_selection: number that indicates what feature selection algorithm will be used
    # encoding: number that indicates what encoding algorithm will be used
    # print_file: the file name that the print will be written
    def get_enc(self, x_enc, train_test, y, dataset_index, extra_features,
                feature_selection, encoding, print_file):
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Encodings
        encoded_tweets = []

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # TF-IDF
        if encoding == 1:
            encoded_tweets = self.tf_idf(x_enc, train_test).toarray(
            )  # Used to convert sparse matrix (produced from TF-IDF) to dense matrix (needed for concatenate)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # One hot encoding
        if encoding == 2:
            encoded_tweets = self.one_hot_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Bi-grams
        if encoding == 3:
            encoded_tweets = self.bigrams_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Word2Vec
        if encoding == 4:
            encoded_tweets = self.Word2Vec_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Doc2Vec
        if encoding == 5:
            encoded_tweets = self.Doc2Vec_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # GloVe
        if encoding == 6:
            encoded_tweets = self.GloVe_enc(x_enc, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Feature Selection

        # Format the features from Feature Extraction
        extra_features = zip(
            *extra_features
        )  # * is used to unzip the list; the result is transposed so rows correspond to tweets and columns to features
        extra_features = list(extra_features)
        extra_features = np.array(extra_features)
        extra_features = extra_features[dataset_index]
        print("features chosen shape: ", extra_features.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("features chosen shape: " +
                         str(extra_features.shape) + '\n')

        # Normalize each of the columns of the added features from Feature Extraction

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("features before normalization: " +
                         str(extra_features) + '\n')

        if train_test == 1:  # Train set
            # train the normalization
            self.scaler = MinMaxScaler(feature_range=(0, 1))
            self.scaler = self.scaler.fit(extra_features)
            # normalize the train dataset
            extra_features = self.scaler.transform(extra_features)

        if train_test == 0:  # Test set
            # normalize the test dataset
            extra_features = self.scaler.transform(extra_features)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("features after normalization: " +
                         str(extra_features) + '\n')

        # Adding features to encoded_tweets
        print("encoded_tweets before tweets shape: ", encoded_tweets.shape)
        print("before tweets extra_features shape: ", extra_features.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("encoded_tweets before tweets shape: " +
                         str(encoded_tweets.shape) + '\n' +
                         "before tweets extra_features shape: " +
                         str(extra_features.shape) + '\n' +
                         "before encoded_tweets: " + str(encoded_tweets) +
                         '\n')

        encoded_tweets = numpy.concatenate((encoded_tweets, extra_features),
                                           axis=1)
        encoded_tweets = np.array(encoded_tweets)
        print("final encoded_tweets shape: ", encoded_tweets.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write("final encoded_tweets shape: " +
                         str(encoded_tweets.shape) + '\n' +
                         "final encoded_tweets: " + str(encoded_tweets) + '\n')

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Univariate Selection

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 7:
            encoded_tweets = self.Univariate_Selection(encoded_tweets, y,
                                                       train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Recursive Feature Elimination

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 8:
            encoded_tweets = self.Recursive_Feature_Elimination(
                encoded_tweets, y, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Principal Component Analysis

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 9:
            encoded_tweets = self.Principal_Component_Analysis(
                encoded_tweets, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Truncated SVD (alternative of PCA for TF-IDF)

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 10:
            encoded_tweets = self.TruncatedSVD(encoded_tweets, train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        # Feature Importance

        #  One-hot-encoding, TF-IDF, Bigrams
        if feature_selection == 11:
            encoded_tweets = self.Feature_Importance(encoded_tweets, y,
                                                     train_test)

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        print("Final encoded_tweets, after feature selection, shape: ",
              encoded_tweets.shape)

        with open(print_file,
                  "a") as myfile:  # Write above print into output file
            myfile.write(
                "Final encoded_tweets, after feature selection, shape: " +
                str(encoded_tweets.shape) + '\n')

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    # Create a dictionary for one hot encoding and encode with one hot encoding
    def one_hot_enc(self, x_enc, train_test):
        encoded_tweets = []
        x_enc = list(x_enc)

        if train_test == 1:  # Train set
            self.onehot_encoder = CountVectorizer(analyzer='word',
                                                  tokenizer=self.dummy_fun,
                                                  lowercase=False,
                                                  binary=True)

            xenc = []
            for x in x_enc:
                xenc.append(x)

            encoded_tweets = self.onehot_encoder.fit_transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()
            print(np.array(vocab).shape)

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        if train_test == 0:  # Test set
            xenc = []
            for x in x_enc:
                xenc.append(x)
            encoded_tweets = self.onehot_encoder.transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    # TF-IDF
    def tf_idf(self, x_enc, train_test):
        encoded_tweets = []
        if (train_test == 1):  # train
            self.vectorizer = TfidfVectorizer(lowercase=False,
                                              analyzer='word',
                                              tokenizer=self.dummy_fun,
                                              preprocessor=self.dummy_fun)
            encoded_tweets = self.vectorizer.fit_transform(x_enc)
        if (train_test == 0):  # test
            encoded_tweets = self.vectorizer.transform(x_enc)

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    def bigrams_enc(self, x_enc, train_test):
        bigrams = []  # Bi-grams of all tweets

        # Use the pre-processing done above
        for y in range(0, len(x_enc)):
            bigrams.append(list(ngrams(x_enc[y], 2)))

        encoded_tweets = []

        if train_test == 1:  # Train set
            self.onehot_encoder = CountVectorizer(analyzer='word',
                                                  tokenizer=self.dummy_fun,
                                                  lowercase=False,
                                                  binary=True)

            xenc = []
            for x in bigrams:
                xenc.append(x)

            encoded_tweets = self.onehot_encoder.fit_transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        if train_test == 0:  # Test set
            xenc = []
            for x in bigrams:
                xenc.append(x)
            encoded_tweets = self.onehot_encoder.transform(xenc)
            encoded_tweets = encoded_tweets.toarray()
            vocab = self.onehot_encoder.get_feature_names()

            for i in range(0, len(encoded_tweets[0])):
                if encoded_tweets[0][i] == 1:
                    print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i])

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    def Word2Vec_enc(self, x_enc, train_test):
        encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN')

        vector_size = 100

        if train_test == 1:  # Train set
            # sg: CBOW if 0, skip-gram if 1
            # ‘min_count’ is for neglecting infrequent words.
            # negative (int) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
            # window: number of context words on each side (if the window size is 3, the 3 words to the left and the 3 words to the right are considered)
            self.model = Word2Vec(size=vector_size, min_count=0, sg=1)
            self.model.build_vocab([x.words for x in encoded_tweets])
            self.model.train([x.words for x in encoded_tweets],
                             total_examples=len(encoded_tweets),
                             epochs=10)

            self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)
            self.vectorizer1.fit_transform([x.words for x in encoded_tweets])

        if train_test == 0:  # Test set
            self.vectorizer1.transform([x.words for x in encoded_tweets])

        tfidf = dict(
            zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_))
        train_vecs_w2v = np.concatenate([
            self.buildWordVector(self.model, tweet, vector_size, tfidf)
            for tweet in map(lambda x: x.words, encoded_tweets)
        ])
        encoded_tweets = scale(train_vecs_w2v)
        print(encoded_tweets)

        return encoded_tweets

    # Used for computing the mean of word2vec and implementing the transform function
    def buildWordVector(self, model, tweet, size, tfidf):
        vec = np.zeros(size).reshape((1, size))
        count = 0.
        for word in tweet:
            try:
                vec += model[word].reshape((1, size)) * tfidf[word]
                count += 1.
            except KeyError:  # handling the case where the token is not
                # in the corpus. useful for testing.
                continue
        if count != 0:
            vec /= count
        return vec

    def labelizeTweets(self, tweets, label_type):
        LabeledSentence = gensim.models.doc2vec.LabeledSentence

        labelized = []
        for i, v in enumerate(tweets):
            label = '%s_%s' % (label_type, i)
            labelized.append(LabeledSentence(v, [label]))
        return labelized

    ###############################################################################################################################################
    ###############################################################################################################################################

    def Doc2Vec_enc(self, x_enc, train_test):
        encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN')

        vector_size = 100

        if train_test == 1:  # Train set
            # dm: DBOW if 0, distributed-memory if 1
            # window: number of context words on each side (if the window size is 3, the 3 words to the left and the 3 words to the right are considered)
            self.modeldoc = Doc2Vec(vector_size=vector_size, min_count=0, dm=0)

            self.modeldoc.build_vocab([x for x in encoded_tweets])
            self.modeldoc.train(utils.shuffle([x for x in encoded_tweets]),
                                total_examples=len(encoded_tweets),
                                epochs=10)

            # Get the vectors created for each tweet
            encoded_tweets = np.zeros((len(x_enc), vector_size))
            for i in range(0, len(x_enc)):
                prefix_train_pos = 'TRAIN_' + str(i)
                encoded_tweets[i] = self.modeldoc.docvecs[prefix_train_pos]

        if train_test == 0:  # Test set
            encoded_tweets = np.zeros((len(x_enc), vector_size))
            for i in range(0, len(x_enc)):
                encoded_tweets[i] = self.modeldoc.infer_vector(x_enc[i])

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    def GloVe_enc(self, x_enc, train_test):
        encoded_tweets = self.labelizeTweets(
            x_enc, 'TRAIN'
        )  # Different encoding of tweets (One Hot Encoding, TF-IDF, One hot encoding of ngrams)

        if train_test == 1:  # Train set
            if not self.called_once:  # Used to ensure that training-reading the GloVe model is done just once
                self.called_once = True
                gloveFile = self.dir + '\\GloVe_train\\glove.twitter.27B\\glove.twitter.27B.200d.txt'
                print("Loading Glove Model")
                f = open(gloveFile, 'r', encoding="utf8")
                self.glove_model = {}
                for line in f:
                    splitLine = line.split()
                    word = splitLine[0]
                    embedding = np.array([float(val) for val in splitLine[1:]])
                    self.glove_model[word] = embedding

            self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7)
            self.vectorizer1.fit_transform([x.words for x in encoded_tweets])

        if train_test == 0:  # Test set
            self.vectorizer1.transform([x.words for x in encoded_tweets])

        tfidf = dict(
            zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_))
        vector_size = 200  # Dimensions of vectors are stated at the name of the GloVe txt files
        train_vecs_w2v = np.concatenate([
            self.buildWordVector(self.glove_model, tweet, vector_size, tfidf)
            for tweet in map(lambda x: x.words, encoded_tweets)
        ])
        encoded_tweets = scale(train_vecs_w2v)

        return encoded_tweets

    ###############################################################################################################################################
    ###############################################################################################################################################

    # Feature Selection

    ###############################################################################################################################################
    ###############################################################################################################################################

    def Univariate_Selection(self, x, y, train_test):
        # Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            self.test = SelectKBest(score_func=chi2, k=100)
            features = self.test.fit_transform(x, y)
            # summarize scores
            numpy.set_printoptions(
                precision=3)  # Format print to show only 3 decimals of floats

        if train_test == 0:  # Test set
            features = self.test.transform(x)
            # summarize scores
            numpy.set_printoptions(
                precision=3)  # Format print to show only 3 decimals of floats

        return features

    def Recursive_Feature_Elimination(self, x, y, train_test):
        # Feature Extraction with RFE
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            model = RandomForestClassifier(n_estimators=250,
                                           max_features=7,
                                           max_depth=30,
                                           min_samples_split=2,
                                           random_state=0,
                                           n_jobs=-1)
            self.rfe = RFE(model, 100)
            features = self.rfe.fit_transform(x, y)

        if train_test == 0:  # Test set
            features = self.rfe.transform(x)

        return features

    def Principal_Component_Analysis(self, x, train_test):
        # Feature Extraction with PCA
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            self.pca = PCA(n_components=100)
            features = self.pca.fit_transform(x)

        if train_test == 0:  # Test set
            features = self.pca.transform(x)

        return features

    def TruncatedSVD(self, x, train_test):
        # Feature Extraction with TruncatedSVD
        features = []

        if train_test == 1:  # Train set
            # feature extraction
            self.svd = TruncatedSVD(n_components=100)
            features = self.svd.fit_transform(x)

        if train_test == 0:  # Test set
            features = self.svd.transform(x)

        return features

    def Feature_Importance(self, x, y, train_test):
        # Feature Importance with a tree-based classifier (a Random Forest is used here)
        features = []

        if train_test == 1:  # Train set
            # feature extraction

            # Create a random forest classifier with the following Parameters
            self.sfm = RandomForestClassifier(n_estimators=250,
                                              max_features=7,
                                              max_depth=30)

            self.sfm.fit(x, y)

            # Select features which have higher contribution in the final prediction
            self.models = SelectFromModel(self.sfm, threshold="9*mean")
            self.models.fit(x, y)
            features = self.models.transform(x)

        if train_test == 0:  # Test set
            features = self.models.transform(x)

        return features

    ###############################################################################################################################################
    ###############################################################################################################################################

    ##############################################################################################################################################################

    # Read the training files for task (with emojis)

    # train_A

    ##############################################################################################################################################################

    def readTrain(self):
        # Read the training file for task A with emojis

        train_file_A = self.dir + '\\dataset\\train\\SemEval2018-T3-train-taskA_emoji.txt'

        data_fields = ['id', 'label',
                       'tweet']  # Define the names of the columns
        self.train_A = pd.read_csv(
            train_file_A, sep='\t', header=None, names=data_fields, quoting=3
        )  # quoting=3 ignores double quotes; header=None means the first line of the file does not contain the column names

        # ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        # Pre-processing
        self.pre_processing()

# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

##############################################################################################################################################################

# Check if the dataset is imbalanced

##############################################################################################################################################################

    def checkImbalance(self):
        # Checking if file A with emojis is imbalanced

        counter0 = 0
        counter1 = 0
        counter_all = 0
        for i in range(0, len(self.train_A)):
            counter_all += 1
            if (self.train_A.iloc[i][1] == 1):
                counter1 += 1
            else:
                counter0 += 1
        print(
            'File A with emojis -> Percentage of tweets classified as 0: ' +
            str((counter0 / counter_all) * 100))
        print(
            'File A with emojis -> Percentage of tweets classified as 1: ' +
            str((counter1 / counter_all) * 100) +
            '\n ----------------------------------------')
    #                [0]      [1]      [2]       [3]           [4]              [5]
    print "**** ", "['ID',   'Year', 'Title', 'Authors', 'Journal name (O)', 'Abstract']"
    for i in range(5):
        print "     ", node_info[i]

IDs = [element[0] for element in node_info]  # this holds a vertical list of only the IDs

# compute TFIDF vector of each paper
corpus = [element[5] for element in node_info]  # this holds a vertical list of the Abstracts

# vectorizer initializes the TfidfVectorizer(); more parameters can be passed, see the webpage linked at the top
# stop_words="english": remove common English words
# min_df=0: do not drop any term for appearing in too few documents
# ngram_range=(1,3): generate 2- and 3-word phrases along with the single words from the corpus
# analyzer='word': build the n-grams from words rather than characters
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words="english")

# each row is a node in the order of node_info
# fit_transform(): Learn vocabulary and idf, return term-document matrix.
features_TFIDF_Abstract = vectorizer.fit_transform(corpus)
# print type(features_TFIDF) | will print <class 'scipy.sparse.csr.csr_matrix'>
# https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.sparse.csr_matrix.html

# compute TFIDF vector of each title
corpusTitle = [element[2] for element in node_info]
# each row is a node in the order of node_info
features_TFIDF_Title = vectorizer.fit_transform(corpusTitle)

# compute TFIDF vector of each author
corpusAuthor = [element[3] for element in node_info]
# each row is a node in the order of node_info
def ngrams(text, n):
    # normalize() already returns a token list, so slice it directly
    return zip(*[normalize(text)[i:] for i in range(n)])

nltk.download('punkt') # if necessary...

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(MLStripper.strip_tags(text).lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]

print(cosine_sim('a little bird', 'a little bird'))
print(cosine_sim('a little bird', 'a little bird chirps'))
print(cosine_sim('a little bird', 'a big dog barks'))


# print(strip_tags("""<p>Deep clone an {@code Object} using serialization.</p>
#
#  <p>This is many times slower than writing clone methods by hand
#  on all objects in your object graph. However, for complex object
#  graphs, or for those that don't support deep cloning this can
Esempio n. 47
0
def update_figure(n_clicks, episode_slider, word_input, speaker_input):

    fullmerge = {}
    data_subset = data[(data['episode'] >= episode_slider[0]) & (data['episode'] <= episode_slider[1])]
    #Merge all episodes into one long string
    peeps = list(filter(None,speaker_input.upper().replace(" ","").split(",")))
    for peep in peeps:
        fullmerge.update({peep:' '.join(data_subset[data_subset.Speaker == peep]['cleaned'])})
    finaldata = pd.DataFrame.from_dict(fullmerge, orient = 'index').reset_index().rename(columns = {'index':'speaker',0:'text'})


    totalwords = pd.DataFrame()
    for peep in peeps:
        totalwords = totalwords.append(pd.Series([peep, ' ', finaldata[finaldata['speaker'] == peep].text.str.count(' ').iloc[0]]), ignore_index = True)
    totalwords = totalwords.rename(index=str, columns={0: "speaker", 1: "word",2: "total"})




    words = list(filter(None,word_input.lower().split(",")))
    #Calculate frequency of words in finaldata
    df = pd.DataFrame()
    for peep in peeps:
        for word in words:
            df = df.append(pd.Series([peep, word, finaldata[finaldata['speaker'] == peep].text.str.count(word).iloc[0]]), ignore_index = True)
    df = df.rename(index=str, columns={0: "speaker", 1: "word",2: "amount"})


    #Calculate rate per 1000 words
    df = pd.merge(df, totalwords[['speaker','total']], on='speaker')
    df['Number of times said per 1000 words'] = (df['amount']/df['total'])*1000

    #Sort data by rate and then speaker
    df = df.sort_values(by=['Number of times said per 1000 words','speaker'], ascending = [True,False])
    

    #Graph
    if len(words) != 1:
        if len(peeps) != 1:
            #Apply the tfidf function, and find the most "distinguishing" word among the given words and speakers
            tfidf = TfidfVectorizer(stop_words='english', vocabulary = words)
            tfs = tfidf.fit_transform(finaldata['text'])
            matrix = pd.DataFrame(tfs.todense(), index = peeps, columns = tfidf.get_feature_names()).transpose()
            matrix['word'] = matrix.index
            matrix = pd.melt(matrix, id_vars = 'word')
            matrix = matrix.rename(index=str, columns={'value': "tfidf",'variable': "speaker"})
            distWord = matrix.loc[matrix['tfidf'].idxmax()]['word']
            distSpeaker = matrix.loc[matrix['tfidf'].idxmax()]['speaker']
            tfidfSent = ("Most distinguishing: '" + distWord + "' by " + distSpeaker + ".*")

            fig = ff.create_facet_grid(
                df,
                x='Number of times said per 1000 words',
                y='word',
                facet_col='speaker',
                color_name='speaker',
                trace_type='bar',
                orientation = 'h',
                scales = 'free',
                width = 1200
            )
            for i in range(len(peeps)+1):
                if i == 0:
                    fig.layout.xaxis.update({'range': [df['Number of times said per 1000 words'].min(), (df['Number of times said per 1000 words'].max()+(.15 * df['Number of times said per 1000 words'].max()))]})
                else:
                    exec('fig.layout.xaxis' + str(i)+".update({'range': [df['Number of times said per 1000 words'].min(), (df['Number of times said per 1000 words'].max()+(.15 * df['Number of times said per 1000 words'].max()))]})")
            fig.layout.xaxis.title = tfidfSent
            fig.layout.update(plot_bgcolor='rgba(230,230,230,90)')

        elif len(peeps) == 1: 
            fig = ff.create_facet_grid(
            df,
            x='word',
            y='Number of times said per 1000 words',
            color_name='word',
            trace_type='bar',
            scales = 'free',
            width = 1200
            )
            fig.layout.update(plot_bgcolor='rgba(230,230,230,90)')
    elif len(words) == 1:
        fig = ff.create_facet_grid(
            df,
            x='speaker',
            y='Number of times said per 1000 words',
            color_name='speaker',
            trace_type='bar',
            scales = 'free',
            width = 1200
        )
        fig.layout.update(plot_bgcolor='rgba(230,230,230,90)')
    return {
    'data': fig

    }
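
The "most distinguishing word" step above can be read in isolation; below is a hedged sketch with made-up speakers and words (get_feature_names_out() is used for current scikit-learn; older versions expose get_feature_names() as in the original callback):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

speakers = ["ANNA", "BEN"]                                   # placeholders (assumption)
texts = ["coffee coffee tea biscuits", "tea tea tea coffee"]
words = ["coffee", "tea"]

tfidf = TfidfVectorizer(stop_words='english', vocabulary=words)
tfs = tfidf.fit_transform(texts)

matrix = pd.DataFrame(tfs.todense(), index=speakers,
                      columns=tfidf.get_feature_names_out()).transpose()
matrix['word'] = matrix.index
matrix = pd.melt(matrix, id_vars='word')
matrix = matrix.rename(columns={'value': 'tfidf', 'variable': 'speaker'})

best = matrix.loc[matrix['tfidf'].idxmax()]
print("Most distinguishing: '%s' by %s" % (best['word'], best['speaker']))  # expected: 'tea' by BEN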
Esempio n. 48
0
##label vector
labels=dat.label
labels.head()

####Create training and test data sets
x_train,x_test,y_train,y_test=train_test_split(dat['text'].apply(lambda x: np.str_(x)), labels, test_size=0.2, random_state=7)

##Count Vectorizer
count_vectorizer = CountVectorizer(stop_words='english')

##Fit and train count vectorizer
count_train = count_vectorizer.fit_transform(x_train)
count_test = count_vectorizer.transform(x_test)

##TFIDF Vectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

####Fit to tfidf data and transform test and training sets to normalized tfidf vector
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

####Hashing Vectorizer
hash_vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)

###Fit hashing vectorizer to data and transform both test and training sets
hash_train = hash_vectorizer.fit_transform(x_train)
hash_test = hash_vectorizer.transform(x_test)


###Feature names
#print(tfidf_vectorizer.get_feature_names()[-10:])
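
A short hedged sketch of how the three representations might be compared; the choice of PassiveAggressiveClassifier is an assumption, not something the original script does:

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score

for name, (tr, te) in {"count": (count_train, count_test),
                       "tfidf": (tfidf_train, tfidf_test),
                       "hashing": (hash_train, hash_test)}.items():
    clf = PassiveAggressiveClassifier(max_iter=50)
    clf.fit(tr, y_train)
    print(name, round(accuracy_score(y_test, clf.predict(te)), 3))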
Esempio n. 49
0
query = "permanently get rid jock itch"
# print(questions)

simi = -1
matched_ques = []
for strr1 in questions:
    xx1 = lemmatized_string(strr1)
    xx2 = lemmatized_string(query)
    simi1 = jaccard_similarity(xx1, xx2)
    if (simi1 > 0.20):
        matched_ques.append((simi1, strr1))
    # if(simi1>simi):
    # 	simi=simi1
    # 	matched_ques=strr1

tfidf_vectorizer = TfidfVectorizer()
query1 = (" ").join(x for x in lemmatized_string(query))

documents = []
documents.append(query1)

for i in questions:
    i1 = (" ").join(x for x in lemmatized_string(i))
    documents.append(i1)
documents = tuple(documents)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)

xxx = []
xxx.append((query, 1.0))
for x in np.where(cos_sim[0] >= 0.4)[0]:
print(string)

#%%

#Count Vector
count_vect = CountVectorizer()
count = count_vect.fit_transform(string)
tf_feature_names = count_vect.get_feature_names()

print(count)

#%%
string = map(' '.join, sw)

tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(string)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

#%%
no_topics = 29

# Try the NMF algorithm
nmf = NMF(n_components=no_topics,
          random_state=1,
          alpha=.1,
          l1_ratio=.5,
          init='nndsvd').fit(tfidf)
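
# Hedged sketch (the helper below is an assumption, not part of the original
# notebook): print the highest-weighted terms of each NMF topic to inspect
# what the factorization learned.
def show_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top_terms = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic %d: %s" % (topic_idx, " ".join(top_terms)))

show_topics(nmf, tfidf_feature_names)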

# Try the LDA algorithm
lda = LatentDirichletAllocation(n_topics=no_topics,
Esempio n. 51
0
print(df)

# %% [markdown]
# Create the feature extraction object

# %% codecell
vocab_size = 1000
embedding_dim = 64
# Set up some sklearn objects that are going to be in the pipeline
# SMOTE for class balancing via oversampling the minority class
smt = SMOTE(random_state=12)
# TF-IDF Vectorizer: https://www.quora.com/How-does-TfidfVectorizer-work-in-laymans-terms?share=1
# Define the importance of words in the corpus depending on frequency AND "uniqueness"
tfidf = TfidfVectorizer(sublinear_tf=True,
                        max_features=vocab_size,
                        min_df=5, norm='l2',
                        encoding='latin-1',
                        ngram_range=(1, 1),
                        stop_words='english')

# %% [markdown]
# Separate the data in train validate and test

# %% codecell
train, validate, test = np.split(df.sample(frac=1), [int(.6 * len(df)), int(.8 * len(df))])

features = tfidf.fit_transform(train.text).toarray()
# Binary classification result
labels = train.is_asshole
X_train, y_train = smt.fit_resample(features, labels)
# TF-IDF on test set
X_validate = tfidf.transform(validate.text).toarray()
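
A plausible next step, sketched under the assumption that a simple linear model is acceptable (the original snippet stops before any classifier is chosen): fit on the SMOTE-balanced features and score on the untouched validation split.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print(classification_report(validate.is_asshole, clf.predict(X_validate)))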
Esempio n. 52
0
        processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
        processed_feature = re.sub(r'^[a-zA-Z]\s+', ' ', processed_feature)
        processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
        processed_feature = re.sub(r'^b\s+', '', processed_feature)
        processed_feature = processed_feature.lower()
        processed_features.append(processed_feature)
    return processed_features


processed_features = process(features)

# print(processed_features[:10])

# Tfidf vectorizer
vectorizer = TfidfVectorizer(max_features=2500,
                             min_df=2,
                             max_df=0.8,
                             stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(processed_features).toarray()

# split into testing and training sets
X_train, X_test, y_train, y_test = train_test_split(processed_features,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=0)

# Naive Bayes' classifier
model = GaussianNB()
model.fit(X_train, y_train)

# Test predictions
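# Hedged sketch of the likely continuation (an assumption; the original is cut off
# here): predict on the held-out split and report accuracy and a confusion matrix.
from sklearn.metrics import accuracy_score, confusion_matrix

predictions = model.predict(X_test)
print("accuracy:", accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))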
Esempio n. 53
0
                (usage[2]*resource.getpagesize())/1000000.0 )
u_start = using('start')

data = raw_input("folder containing the folders. Don't add / in the end.\n")
close_folder = raw_input("folder to dump output files. Don't add / in the end.\n")
categories = [f for f in listdir(data) if not isfile(join(data, f))]
#print categories
#categories = ["business" ,"sport", 'entertainment', 'tech', 'politics']

dataset = sklearn.datasets.load_files(	data + '/',	description=None, categories=categories ,
										load_content=True, shuffle=True, 
										encoding = 'utf-8',decode_error='ignore',
										random_state=179863)
true_k = len(categories) 	#no of groups
u_dataRead = using()
vectorizer = TfidfVectorizer(max_df=.5, max_features=310 ,min_df=10 ,
										stop_words='english',use_idf=True)				
dat = vectorizer.fit_transform(dataset.data)
dat = dat.toarray()
u_Vect = using()

factor = int(len(dataset.data) / 5)

X = dat[:factor]
Y = dat[factor:]

km = KMeans(n_clusters=true_k)
km.fit(X)

cur = Y
clusters = defaultdict(list)
vishal = defaultdict(list)
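
The tail of this example is cut off; a plausible continuation (an assumption, not from the original source) assigns the held-out documents to the fitted clusters:

for doc_id, cluster_id in enumerate(km.predict(cur)):
    clusters[cluster_id].append(doc_id)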
Esempio n. 54
0
    return result.strip()


### set X_text and y ###

X_text = []
y = []

for intent, intent_data in BOT_CONFIG['intents'].items():
    for example in intent_data['examples']:
        X_text.append(example)
        y.append(intent)

### Vectorization ###

vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))
X = vectorizer.fit_transform(X_text)
vectorizer.get_feature_names()

### Classification ###

clf = LinearSVC(random_state=0)
clf.fit(X, y)


def get_failure_phrase():
    failure_phrases = BOT_CONFIG['failure_phrases']
    return random.choice(failure_phrases)


def classify_intent(replica):
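    # Hedged sketch of a plausible body (an assumption; the original body is cut off
    # here): vectorize the incoming phrase with the fitted char-trigram TF-IDF and
    # let the LinearSVC predict the intent label.
    intent = clf.predict(vectorizer.transform([replica]))[0]
    return intent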
Esempio n. 55
0
def tfidf_dimReduced(corpus, log_id="<<id>>", number_features=10000):
    """
        Vectorizes the corpus using the TF-IDF metric and then reduces the dimension of the features to number_features.
        Vectorizer results are normalized. Since LSA/SVD results are not normalized, we have to redo the normalization.
    
    Parameters
    ----------------
    corpus (array): array of text documents.
    log_id [optional] (string): id to display in the logs. Usually represents the fold.
    number_features [optional](int): numbers of features to keep after SVD.
    
    Returns
    ----------------
    lsa (object): the learned model.
    X (object): the matrix of learned features representing the corpus.
    """

    wnl = nltk.stem.PorterStemmer()

    logging.info(log_id + "Running Stemming...")
    t0 = time()

    corporea = []
    for doc in corpus:
        new_doc = []
        # removing autism and asd words.
        doc = doc.lower().replace("autism", "").replace("asd", "")

        tokens = nltk.word_tokenize(doc)
        for token in tokens:
            new_doc.append(wnl.stem(token))
        corporea.append(' '.join(new_doc))

    logging.info(log_id + "Stemming done in %fs" % (time() - t0))

    logging.info(log_id + "Running tfidf...")
    t0 = time()

    vectorizer = TfidfVectorizer(
        max_df=0.5,  #max_features=20000,
        min_df=1,
        stop_words='english',
        use_idf=True)

    logging.info(log_id + "tf-idf done in %fs" % (time() - t0))

    logging.info(log_id + "Running SVD Dim Reduction...")
    t0 = time()
    svd = TruncatedSVD(number_features)

    normalizer = Normalizer(copy=False)

    lsa = make_pipeline(vectorizer, svd, normalizer)

    X = lsa.fit_transform(corporea)

    explained_variance = svd.explained_variance_ratio_.sum()

    logging.info(
        log_id +
        "SVD explained variance: {}%".format(int(explained_variance * 100)))
    logging.info(log_id + "SVD done in %fs" % (time() - t0))

    return lsa, X
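
A hypothetical usage sketch for tfidf_dimReduced; the toy corpus and the tiny number_features are assumptions chosen only so the call runs quickly:

import logging
logging.basicConfig(level=logging.INFO)
# nltk.download('punkt')  # nltk.word_tokenize needs this once

toy_corpus = [
    "early screening improves outcomes for autism spectrum disorder",
    "eye tracking studies of asd in young children",
    "transfer learning for short text classification",
]
lsa_model, X_reduced = tfidf_dimReduced(toy_corpus, log_id="[demo] ", number_features=2)
print(X_reduced.shape)  # (3, 2): one 2-dimensional LSA vector per document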
Esempio n. 56
0
class LSI(GenericModel):
    def __init__(self, **kwargs):
        self._svd_matrix = None
        self._query_vector = None

        self.vectorizer = None
        self.svd_model = None

        super().__init__()

        self.similarity_measure = None

        self.set_basic_params(**kwargs)
        self.set_vectorizer(**kwargs)
        self.set_svd_model(**kwargs)

    def set_name(self, name):
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        super().set_model_gen_name(gen_name)

    def set_basic_params(self, **kwargs):
        self.set_name('LSI' if LSI_Model_Hyperp.NAME.value not in
                      kwargs.keys() else kwargs[LSI_Model_Hyperp.NAME.value])
        self.set_similarity_measure(SimilarityMeasure.COSINE)
        self.set_model_gen_name('lsi')

    def set_similarity_measure(self, sim_measure):
        self.similarity_measure = sim_measure

    def set_vectorizer(self, **kwargs):
        self.vectorizer = TfidfVectorizer(
            stop_words='english', use_idf=True, smooth_idf=True
        ) if LSI_Model_Hyperp.VECTORIZER.value not in kwargs.keys(
        ) else kwargs[LSI_Model_Hyperp.VECTORIZER.value]

        vec_params = {
            key.split('__')[2]: kwargs[key]
            for key, val in kwargs.items() if '__vectorizer__' in key
        }
        self.vectorizer.set_params(**vec_params)

    def set_svd_model(self, **kwargs):
        self.svd_model = TruncatedSVD(
            n_components=100,
            algorithm='randomized',
            n_iter=10,
            random_state=42
        ) if LSI_Model_Hyperp.SVD_MODEL.value not in kwargs.keys() else kwargs[
            LSI_Model_Hyperp.SVD_MODEL.value]

        svd_model_params = {
            key.split('__')[2]: kwargs[key]
            for key, val in kwargs.items() if '__svd_model__' in key
        }
        self.svd_model.set_params(**svd_model_params)

    def recover_links(self, corpus, query, test_cases_names,
                      bug_reports_names):
        starttime = time.time()

        if self.similarity_measure == SimilarityMeasure.COSINE:
            self._recover_links_cosine(corpus, query, test_cases_names,
                                       bug_reports_names)

        elif self.similarity_measure == SimilarityMeasure.JACCARD_INDEX:
            self._recover_links_jaccard(corpus, query, test_cases_names,
                                        bug_reports_names)

        elif self.similarity_measure == SimilarityMeasure.EDIT_DISTANCE:
            self._recover_links_edit(corpus, query, test_cases_names,
                                     bug_reports_names)

        self._record_docs_feats(corpus, query, test_cases_names,
                                bug_reports_names)

        endtime = time.time()

        print(
            f' ..Total processing time: {round(endtime-starttime, 2)} seconds',
        )

    def _record_docs_feats(self, corpus, query, test_cases_names,
                           bug_reports_names):
        self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus)
        self.mrw_brs = self._recover_mrw_list(bug_reports_names, query)

        self.dl_tcs = self._recover_dl_list(test_cases_names, corpus)
        self.dl_brs = self._recover_dl_list(bug_reports_names, query)

        index = list(test_cases_names) + list(bug_reports_names)
        self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl'])

        for tc_name, mrw in self.mrw_tcs:
            self.docs_feats_df.at[tc_name, 'mrw'] = mrw

        for tc_name, dl in self.dl_tcs:
            self.docs_feats_df.at[tc_name, 'dl'] = dl

        for br_name, mrw in self.mrw_brs:
            self.docs_feats_df.at[br_name, 'mrw'] = mrw

        for br_name, dl in self.dl_brs:
            self.docs_feats_df.at[br_name, 'dl'] = dl

    def _recover_dl_list(self, artf_names, artf_descs):
        tokenizer = WordNetBased_LemmaTokenizer()
        dl_list = []
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            dl_list.append((artf_name, len(tokenizer.__call__(artf_desc))))
        return dl_list

    def _recover_mrw_list(self, artf_names, artf_descs):
        N_REL_WORDS = 6
        mrw_list = []  # list of tuples (artf_name, mrw_list={})

        for artf_name, artf_desc in zip(artf_names, artf_descs):
            X = self.vectorizer.transform([artf_desc])
            df1 = pd.DataFrame(X.T.toarray())
            df1['token'] = self.vectorizer.get_feature_names()
            df1.sort_values(by=0, ascending=False, inplace=True)
            mrw = list(df1.iloc[0:N_REL_WORDS, 1].values)
            mrw_list.append((artf_name, mrw))

        return mrw_list

    def _recover_links_cosine(self, corpus, query, test_cases_names,
                              bug_reports_names):
        svd_transformer = Pipeline([('vec', self.vectorizer),
                                    ('svd', self.svd_model)])

        self._svd_matrix = svd_transformer.fit_transform(corpus)
        self._query_vector = svd_transformer.transform(query)
        self._sim_matrix = pairwise.cosine_similarity(X=self._svd_matrix,
                                                      Y=self._query_vector)

        #self._sim_matrix =  super().normalize_sim_matrix(self._sim_matrix)
        self._sim_matrix = pd.DataFrame(data=self._sim_matrix,
                                        index=test_cases_names,
                                        columns=bug_reports_names)

    def _recover_links_jaccard(self, corpus, query, test_cases_names,
                               bug_reports_names):
        tokenizer = self.vectorizer.tokenizer

        corpus_tokens = [tokenizer(doc) for doc in corpus]
        query_tokens = [tokenizer(doc) for doc in query]

        self._sim_matrix = pd.DataFrame(index=test_cases_names,
                                        columns=bug_reports_names,
                                        data=np.zeros(
                                            shape=(len(test_cases_names),
                                                   len(bug_reports_names)),
                                            dtype='float64'))

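        # nltk.jaccard_distance returns 1 - Jaccard similarity, i.e. a distance
        # in [0, 1]: lower values indicate more overlapping token sets.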
        for br_id, doc_query_tset in zip(bug_reports_names, query_tokens):
            for tc_id, doc_corpus_tset in zip(test_cases_names, corpus_tokens):
                self._sim_matrix.at[tc_id, br_id] = nltk.jaccard_distance(
                    set(doc_corpus_tset), set(doc_query_tset))

    def _recover_links_edit(self, corpus, query, test_cases_names,
                            bug_reports_names):
        self._sim_matrix = pd.DataFrame(index=test_cases_names,
                                        columns=bug_reports_names,
                                        data=np.zeros(
                                            shape=(len(test_cases_names),
                                                   len(bug_reports_names)),
                                            dtype='float64'))

        for br_id, doc_query in zip(bug_reports_names, query):
            for tc_id, doc_corpus in zip(test_cases_names, corpus):
                self._sim_matrix.at[tc_id, br_id] = nltk.edit_distance(
                    doc_corpus, doc_query)

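        # Normalizer rescales each row (test case) to unit L2 norm, so the edit
        # distances are only comparable within a row after this step.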
        normalizer = Normalizer(copy=False).fit(self._sim_matrix.values)
        self._sim_matrix = pd.DataFrame(data=normalizer.transform(
            self._sim_matrix.values),
                                        index=test_cases_names,
                                        columns=bug_reports_names)

    def model_setup(self):
        return {
            "Setup": [{
                "Name": self.get_name()
            }, {
                "Similarity Measure": self.get_similarity_measure()
            }, {
                "SVD Model": self.svd_model.get_params()
            }, {
                "Vectorizer": self.vectorizer.get_params()
            }, {
                "Vectorizer Type": type(self.vectorizer)
            }]
        }

    def get_query_vector(self):
        return self._query_vector

    def get_svd_matrix(self):
        return self._svd_matrix

    def get_vectorizer_type(self):
        return type(self.vectorizer)

    def get_tokenizer_type(self):
        return type(self.vectorizer.tokenizer)

    def get_name(self):
        return super().get_name()

    def get_model_gen_name(self):
        return super().get_model_gen_name()

    def get_similarity_measure(self):
        return self.similarity_measure

    def get_sim_matrix(self):
        return super().get_sim_matrix()

    def save_sim_matrix(self):
        super().save_sim_matrix()
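
# --- Editor's sketch (not part of the original example) ---
# A minimal, self-contained illustration of the cosine-similarity path used by
# _recover_links_cosine: TF-IDF -> TruncatedSVD (LSI) -> cosine similarity
# between corpus and query. The toy documents below are invented for
# illustration only.
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise
from sklearn.pipeline import Pipeline

toy_corpus = ['login button does not respond',
              'report is exported as pdf',
              'user password reset email']       # e.g. test case descriptions
toy_query = ['clicking login does nothing']      # e.g. a bug report

lsi = Pipeline([('vec', TfidfVectorizer(stop_words='english')),
                ('svd', TruncatedSVD(n_components=2, random_state=42))])

corpus_lsi = lsi.fit_transform(toy_corpus)
query_lsi = lsi.transform(toy_query)
print(pairwise.cosine_similarity(corpus_lsi, query_lsi))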
Esempio n. 58
0
# In[5]:


# sentence pair

#for c in range(len(corpus)):
#    corpus[c] = pre_process(corpus[c])
#    corpus[c] = lemmatize_sentence(corpus[c])
#    print(corpus[c])


# In[6]:


# creating vocabulary using uni-gram and bi-gram
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf_vectorizer.fit(corpus)

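# Editor's sketch (hedged): the fitted uni/bi-gram vectorizer can score a
# sentence pair via cosine similarity. The two sentences below are invented for
# illustration; `tfidf_vectorizer` is the vectorizer fitted on `corpus` above.
from sklearn.metrics.pairwise import cosine_similarity

pair = ["a man is playing a guitar", "someone plays the guitar"]
pair_vectors = tfidf_vectorizer.transform(pair)
print(cosine_similarity(pair_vectors[0], pair_vectors[1])[0, 0])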

# In[38]:


# Importing the two csv files as dataframes (the original and the modified one)
df1 = pandas.read_csv('LongestDf.csv')
df2 = pandas.read_csv('Frames_caption.csv')


# In[39]:

class TextProcessor(object):
    def __init__(self, repo):
        self.repo = repo

    def loadAllDocumets(self):
        documents = self.repo.search('', False)
        documents = [(self.preProcess(d.description), 'Advertiser-friendly' if d.isAdvertizerFriendly == '\x01' else 'Not suitable for ads') for d in documents]
        self.documentsDf = pd.DataFrame(documents, columns = ['content', 'label'])

    def preProcess(self, text):
        #normalize case
        result = text.lower()

        #remove html tags
        result = re.sub('<.*?>', '', result)

        #remove links
        result = re.sub(r'(www|http)\S+', '', result)

        #remove words with numbers in them
        result = re.sub(r'\w*\d\w*', '', result)

        #remove punctuation and special characters from words
        #(but leave the apostrophe, as it makes stop-word removal easier)
        whitelist = set("abcdefghijklmnopqrstuvwxyz '")
        result = ''.join(filter(whitelist.__contains__, result))

        words = result.split()

        #remove stop-words
        stop_words = stopwords.words('english')
        words = [w for w in words if not w in stop_words]

        #stem words 
        porter = PorterStemmer()
        words = [porter.stem(w) for w in words]

        return ' '.join(words)

    def analyze(self, textToAnalyze):
        self.loadAllDocumets()

        self.encoder = LabelEncoder()
        self.documentsDf['label'] = self.encoder.fit_transform(self.documentsDf['label'])

        self.buildTfIdf()
        
        preProcessed = self.preProcess(textToAnalyze)
        prediction = self.predict(preProcessed).tolist()[0]
        scores = self.get_scores(self.tfIdf, self.vectors)

        index = prediction.index(max(prediction))
        label = self.encoder.inverse_transform([index])[0]

        return ProcessingResult (preProcessed, scores, label, max(prediction))

    def buildTfIdf(self):
        self.tfIdf = TfidfVectorizer(max_features=1000, min_df=3, max_df=0.7)
        self.vectors = self.tfIdf.fit_transform(self.documentsDf['content'])

    def predict(self, testString):
        naive = MultinomialNB()
        naive.fit(self.vectors, self.documentsDf['label'])

        testDocumentDf = pd.DataFrame([(testString)], columns = ['content'])
        testData = self.tfIdf.transform(testDocumentDf['content'])

        prediction = naive.predict_proba(testData)
        return prediction

    def get_scores(self, vectorizer, vectors):
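        # Sum each term's tf-idf weight over all documents to get a rough
        # corpus-level ranking of terms, then report the 200 highest-scoring ones.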
        scores = zip(vectorizer.get_feature_names(),
                     np.asarray(vectors.sum(axis=0)).ravel())
        sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
        formatted_scores = ["{}: {}".format(item[0], item[1]) for item in sorted_scores[:200]]
        return formatted_scores

    def getMetrics(self):
        self.loadAllDocumets()

        self.encoder = LabelEncoder()
        self.documentsDf['label'] = self.encoder.fit_transform(self.documentsDf['label'])

        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(self.documentsDf['content'],self.documentsDf['label'],test_size=0.3)

        Tfidf_vect = TfidfVectorizer(max_features=5000)
        Tfidf_vect.fit(self.documentsDf['content'])
        Train_X_Tfidf = Tfidf_vect.transform(Train_X)
        Test_X_Tfidf = Tfidf_vect.transform(Test_X)

        # fit the training dataset on the NB classifier
        Naive = MultinomialNB()
        Naive.fit(Train_X_Tfidf,Train_Y)

        # predict the labels on validation dataset
        predictions_NB = Naive.predict(Test_X_Tfidf)

        # Score the predictions (sklearn metrics expect the true labels first)
        accuracy = accuracy_score(Test_Y, predictions_NB)
        recall = recall_score(Test_Y, predictions_NB)
        roc_auc = roc_auc_score(Test_Y, predictions_NB)
        precision = average_precision_score(Test_Y, predictions_NB)
        f1 = f1_score(Test_Y, predictions_NB)

        return Metrics(accuracy, recall, roc_auc, precision, f1)

    def buildRocCurve(self):
        self.loadAllDocumets()

        self.encoder = LabelEncoder()
        self.documentsDf['label'] = self.encoder.fit_transform(self.documentsDf['label'])

        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(self.documentsDf['content'],self.documentsDf['label'],test_size=0.3)

        Tfidf_vect = TfidfVectorizer(max_features=5000)
        Tfidf_vect.fit(self.documentsDf['content'])
        Train_X_Tfidf = Tfidf_vect.transform(Train_X)
        Test_X_Tfidf = Tfidf_vect.transform(Test_X)

        # fit the training dataset on the NB classifier
        Naive = MultinomialNB()
        Naive.fit(Train_X_Tfidf,Train_Y)

        # predict the labels on validation dataset
        predictions_NB = Naive.predict(Test_X_Tfidf)

        # Compute fpr, tpr, thresholds and roc auc (true labels go first)
        fpr, tpr, thresholds = roc_curve(Test_Y, predictions_NB)
        roc_auc = roc_auc_score(Test_Y, predictions_NB)

        # Plot ROC curve
        plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        #plt.show()

        labels = [0, 1]
        cm = confusion_matrix(Test_Y, predictions_NB, labels=labels)
        print(cm)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(cm)

        for i in range(len(cm)):
            for j in range(len(cm[0])):
                c = cm[j,i]
                ax.text(i, j, str(c), va='center', ha='center', bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))
      
        plt.title('Confusion matrix of the classifier')
        fig.colorbar(cax)
        ax.set_xticklabels([''] + ['Not suitable', 'Ad-friendly'])
        ax.set_yticklabels([''] + ['Not suitable', 'Ad-friendly'])
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()


    def buildCompressionChart(self):
        documents = self.repo.search('', False)
        processedDocuments = [self.preProcess(d.description) for d in documents]
        
        compressionResults = [None] * len(documents)

        for i in range(len(documents)):
            words1 = len(re.findall(r'\w+', documents[i].description))
            words2 = len(re.findall(r'\w+', processedDocuments[i]))
            compressionResults[i] = CompressionResult(words1, len(documents[i].description), words2, len(processedDocuments[i]))

        plt.hist([r.wordsCompressionRate for r in compressionResults], bins=25, histtype='bar', ec='black') 
        plt.title('Compression Rate (Words)')
        plt.xlabel('Compression Rate, %')
        plt.ylabel('Number of Documents')

        fig = plt.figure()
        plt.hist([r.charsCompressionRate for r in compressionResults], bins=25, histtype='bar', ec='black') 
        plt.title('Compression Rate (Characters)')
        plt.xlabel('Compression Rate, %')
        plt.ylabel('Number of Documents')
        plt.show()
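
# --- Editor's sketch (not part of the original class) ---
# A minimal, self-contained version of the TF-IDF + Multinomial Naive Bayes
# pipeline that TextProcessor assembles internally (buildTfIdf + predict).
# The tiny labelled dataset below is invented purely for illustration.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

toy_texts = ['great family friendly content',
             'explicit violent footage',
             'wholesome tutorial for kids',
             'graphic disturbing scenes']
toy_labels = [1, 0, 1, 0]  # 1 = advertiser-friendly, 0 = not suitable for ads

toy_clf = make_pipeline(TfidfVectorizer(), MultinomialNB())
toy_clf.fit(toy_texts, toy_labels)
print(toy_clf.predict_proba(['violent graphic video'])[0])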
Esempio n. 60
0
# Library
from preprocess_data import read_predata
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# count = CountVectorizer(analyzer='word', stop_words='english')
score = TfidfVectorizer(analyzer='word', stop_words='english')
# movies_matrix = count.fit_transform(read_predata()['list_bag'])
movies_matrix = score.fit_transform(read_predata()['list_bag'])

# cosine_sim = cosine_similarity(movies_matrix)
cosine_sim = linear_kernel(movies_matrix, movies_matrix)
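
# Editor's note (hedged sanity check): TfidfVectorizer L2-normalises each row
# by default, so the linear kernel of the tf-idf matrix with itself equals its
# cosine similarity; numpy is imported here only for this check.
import numpy as np
assert np.allclose(cosine_sim, cosine_similarity(movies_matrix))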


def get_title_from_index(index):
    return read_predata()[read_predata()['Unnamed: 0'] ==
                          index]["movie"].values[0]


def get_index_from_title(title):
    return read_predata()[read_predata()["movie"] ==
                          title]["Unnamed: 0"].values[0]


def recommendations(movie_user_likes):
    movie_index = get_index_from_title(movie_user_likes)
    similar_movies = list(enumerate(cosine_sim[movie_index]))

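    # note: the queried movie itself is in this list (self-similarity of 1.0
    # for a non-empty tf-idf row), so it will rank first after sorting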
    sorted_similar_movies = sorted(similar_movies,
                                   key=lambda x: x[1],
                                   reverse=True)[0:6]