def train_and_predict_m8 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM7, stemmer_type = 'porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 5, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 5), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting Ridge Classifer...")
    clf = RidgeClassifier(class_weight = 'auto', alpha = 1, normalize = True)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'alpha' : [0.1, 0.3, 1, 3, 10], 'normalize' : [True, False]}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
def train_and_predict_m1 (train, test, labels) :
    print ("Training M1 (randomState = %d ...", randomState)
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM1, stemmer_type = 'porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    vectorizer = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    vectorizer.fit(trainData)
    X =  vectorizer.transform(trainData) 
    X_test = vectorizer.transform(testData)
    
    ## Use Stemmer post TF-IDF to check if things change
    # print (X)
    print ("X.shape: ", X.shape)
    print ("X_test.shape: ", X_test.shape)

    ## Create the pipeline
    # 07/02 - RandomizedPCA/PCA does not work on sparse input (so it cannot be applied to the output of the vectorizer);
    # TruncatedSVD is used instead (see the sketch below). JimingYe says LDA did not give much benefit.
    clf = Pipeline([('svd', TruncatedSVD(random_state = randomState, n_components = 330)),
                    ('scl', StandardScaler()),
                    ('svm', SVC(random_state = randomState, cache_size = 500, C = 12))])

    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'svd__n_components' : [200, 250, 300], 'svm__C': [10, 12]}
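The comment above notes that PCA cannot be applied to the sparse output of the vectorizer, which is why the pipeline reduces dimensionality with TruncatedSVD. A minimal, hedged sketch of that idea in isolation (toy corpus and component count chosen purely for illustration, not the competition data):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Toy corpus, illustration only
docs = ["sparse tfidf matrices stay sparse",
        "truncated svd works directly on sparse input",
        "pca would require a dense array first"]

X_sparse = TfidfVectorizer().fit_transform(docs)    # scipy sparse matrix
svd = TruncatedSVD(n_components=2, random_state=0)  # accepts sparse input
X_reduced = svd.fit_transform(X_sparse)             # dense (3, 2) array
print(X_reduced.shape)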
Example n. 3
def tfidf_ize(train, test, node_info):
    vectorizer = TfidfVectorizer(ngram_range=(1,1))
    vectorizer.fit(node_info.abstract.values)
    
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp22'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1) # Intersection kernel
        table.loc[:, 'temp23'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp24'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
                        + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    
    vectorizer = TfidfVectorizer(ngram_range=(2,2))
    vectorizer.fit(node_info.abstract.values)
    
    for table in [train, test]:
        table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna(''))
        table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna(''))
        table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna(''))
        table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna(''))
        
        #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1)
        table.loc[:, 'temp27'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1) # Intersection kernel
        table.loc[:, 'temp28'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1)
        table.loc[:, 'temp29'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \
                        + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1)
    
    return train, test
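The "Intersection kernel" comments above sum the element-wise minimum of two TF-IDF rows. A minimal sketch of that operation, assuming a single shared vectorizer (toy strings are illustrative only):

from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer().fit(["graph neural networks", "graph kernels for text"])
a = vec.transform(["graph neural networks"])
b = vec.transform(["graph kernels for text"])

# Histogram-intersection-style similarity: sum of element-wise minima
print(a.minimum(b).sum())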
Example n. 4
    def fit(self, comments, y=None):
        # get the google bad word list
        # with open("google_badlist.txt") as f:
        with open("my_badlist.txt") as f:
            badwords = [l.strip() for l in f.readlines()]
        self.badwords_ = badwords

        print("vecorizing")
        if self.word:
            if self.tokenizer_func is not None:

                def build_tokenizer(func):
                    regexp = re.compile(r"\b\w\w+\b")
                    tokenizer = lambda doc: [func(word) for word in regexp.findall(doc)]
                    return tokenizer

                tokenizer = build_tokenizer(self.tokenizer_func)
            else:
                tokenizer = None
            countvect = TfidfVectorizer(ngram_range=self.word_range, binary=False, tokenizer=tokenizer, min_df=2)
            countvect.fit(comments)
            self.countvect = countvect

        if self.char:
            countvect_char = TfidfVectorizer(ngram_range=self.char_range, analyzer="char", binary=False)
            countvect_char.fit(comments)
            self.countvect_char = countvect_char
        return self
def train_and_predict_m3 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM3, stemmer_type = 'porter')

    """
    # Beautiful soup cleanup and stemming
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True)
    testData = modified_cleanup(test, stemmer, is_train = False)
    """
				
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 6), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    clf = SGDClassifier(random_state = randomState, n_jobs = 1, penalty = 'l2', loss = 'huber', n_iter = 50, class_weight = 'auto', learning_rate = 'optimal', epsilon = 1)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'n_iter' : [30, 50, 80, 100, 200],  'loss': ['huber'], 'epsilon' : [0.3, 1], 'alpha' : [0.0001, 0.0003, 0.001] }
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Example n. 6
def compute_tf_idf_vectorizer(data_path="/Users/HyNguyen/Documents/Research/Data/stories", save_path="exsum/tf_idf_vectorizer_200_05.pickle", min_df = 200, max_df = 0.5):
    """
    Detail:
    Params:
        data_path: data directory
        save_path: idfs save to, suffix: 200_05: min_df= 200, max_df = 0.5(len(documents))
        min_df: lower bound
        max_df: upper bound
    """
    dataset = loadData(data_path)
    documents = []
    for counter, sample in enumerate(dataset):
        filename, contents, highlights = sample
        content_str = ""
        for content in contents:
            if content[-1] != ".":
                content += "."
            content_str += " " + content
        documents.append(content_str)

    tf_idf_vectorizer = TfidfVectorizer(max_df=max_df,min_df=min_df,stop_words=stopwords.words('english'))
    tf_idf_vectorizer.fit(documents)

    with open(save_path, mode="wb") as f:
        pickle.dump(tf_idf_vectorizer,f)

    print ("Tf-idf Vectorizer: length of vocabulary: ", len(tf_idf_vectorizer.vocabulary))
Example n. 7
def test_tfidfvectorizer_invalid_idf_attr():
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(JUNK_FOOD_DOCS)
    copy = TfidfVectorizer(vocabulary=vect.vocabulary_, use_idf=True)
    expected_idf_len = len(vect.idf_)
    invalid_idf = [1.0] * (expected_idf_len + 1)
    assert_raises(ValueError, setattr, copy, 'idf_', invalid_idf)
class MedicalKeywordTfIdf(BaseEstimator, TransformerMixin):
    MEDICAL_KEYWORDS = ["Medical_Keyword_" + str(i) for i in range(1, 49)]

    def __init__(self):
        self._vec = TfidfVectorizer(max_df=0.95, min_df=2)

    def get_feature_names(self):

        return [x + "_TFIDF" for x in self._vec.get_feature_names()]

    def get_data_array(self, df):

        return df[self.MEDICAL_KEYWORDS] \
            .apply(lambda x: " ".join(x[x == 1].index), axis=1).values

    def fit(self, df, y=None):
        data_arr = self.get_data_array(df)
        self._vec.fit(data_arr)

        return self

    def transform(self, df):
        data_arr = self.get_data_array(df)

        return self._vec.transform(data_arr).toarray()
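A hedged usage sketch for the transformer above. The DataFrame is a random toy stand-in; the real data only needs the binary Medical_Keyword_1..48 indicator columns.

import numpy as np
import pandas as pd

# Toy frame with the expected indicator columns (illustration only)
cols = ["Medical_Keyword_" + str(i) for i in range(1, 49)]
df_demo = pd.DataFrame(np.random.RandomState(0).randint(0, 2, size=(10, 48)),
                       columns=cols)

mk = MedicalKeywordTfIdf().fit(df_demo)
print(mk.transform(df_demo).shape)   # (10, number of surviving keyword terms)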
Example n. 9
def ridge_003():
    print('*** CLEANING ***')
    tfidf_wrd = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='word', ngram_range=(1, 3),
                                lowercase=True, stop_words='english', min_df=3, max_df=0.5)
    tfidf_wrd.fit(train_set['tweet'])
    X_train_wrd = tfidf_wrd.transform(train_set['tweet'])
    X_test_wrd = tfidf_wrd.transform(test_set['tweet'])

    tfidf_char = TfidfVectorizer(max_features=10000, strip_accents='unicode', analyzer='char', ngram_range=(4, 10),
                                lowercase=True, stop_words='english', min_df=3, max_df=0.5)
    tfidf_char.fit(train_set['tweet'])
    X_train_char = tfidf_char.transform(train_set['tweet'])
    X_test_char = tfidf_char.transform(test_set['tweet'])

    y_train = np.array(train_set.ix[:, 4:])

    print('*** TRAINING ***')
    mdl_wrd = model.ridge(X_train_wrd, y_train)
    mdl_char = model.ridge(X_train_char, y_train)

    print('*** PREDICTING ***')
    test_prediction_wrd = mdl_wrd.predict(X_test_wrd)
    test_prediction_char = mdl_char.predict(X_test_char)

    test_prediction = (test_prediction_wrd + test_prediction_char) / 2

    print('*** OUTPUTTING ***')
    output('results/ridge_003.csv', test_prediction)
Example n. 10
 def processEssay(self, testidx, trainidx):
     #process essay
     self.rawdata['essay'] = self.rawdata['essay'].apply(clean)
     self.trdata = self.rawdata['essay'].ix[trainidx]
     self.testdata = self.rawdata['essay'].ix[testidx]
     trainessay = np.array(self.trdata.fillna('Missing'))
     testessay = np.array(self.testdata.fillna('Missing'))
     tfidfEs = TfidfVectorizer(min_df=4,  max_features=500)
     tfidfEs.fit(trainessay)
     #=======================================================================
     # #process need statement
     # self.rawdata['need_statement'] = self.rawdata['need_statement'].apply(clean)
     # self.trdata = self.rawdata['need_statement'].ix[trainidx]
     # self.testdata = self.rawdata['need_statement'].ix[testidx]
     # trainneedst = np.array(self.trdata.fillna('Missing'))
     # testneedst= np.array(self.testdata.fillna('Missing'))
     # tfidfNs = TfidfVectorizer(min_df=3,  max_features=20)
     # tfidfNs.fit(trainneedst)
     #  
     # #process short desc
     # self.rawdata['short_description'] = self.rawdata['short_description'].apply(clean)
     # self.trdata = self.rawdata['short_description'].ix[trainidx]
     # self.testdata = self.rawdata['short_description'].ix[testidx]
     # trainshortd = np.array(self.trdata.fillna('Missing'))
     # testshortd= np.array(self.testdata.fillna('Missing'))
     # tfidfSd = TfidfVectorizer(min_df=3,  max_features=20)
     # tfidfSd.fit(trainshortd)
     # 
     # self.exdata_train = sp.hstack((tfidfEs.transform(trainessay),tfidfNs.transform(trainneedst),tfidfSd.transform(trainshortd) ))
     # self.exdata_test =  sp.hstack((tfidfEs.transform(testessay),tfidfNs.transform(testneedst),tfidfSd.transform(testshortd) ))
     #=======================================================================
     self.exdata_train = tfidfEs.transform(trainessay) #only use the essay
     self.exdata_test =  tfidfEs.transform(testessay)
Example n. 11
def _calculate_tfidf_vectorizer(base_corpus_name=BASE_CORPUS_NAME):
    index_to_token = load_index_to_item(get_index_to_token_path(base_corpus_name))
    token_to_index = {v: k for k, v in index_to_token.items()}
    train_lines = _load_train_lines()
    tfidf_vectorizer = TfidfVectorizer(tokenizer=get_tokens_sequence, vocabulary=token_to_index)
    tfidf_vectorizer.fit(train_lines)
    return tfidf_vectorizer
Example n. 12
def num_feat_select(n,k):
	tfidf = TfidfVectorizer(max_features=n, strip_accents='unicode', 
		 tokenizer = MyTokenizer(), analyzer='word')


	tfidf.fit(train['tweet'])
	trainf = tfidf.transform(train['tweet'])
	testf = tfidf.transform(test['tweet'])
	trainlab = np.array(train.ix[:,4:])
	knn = neighbors.KNeighborsRegressor(n_neighbors=k)
	knn.fit(trainf,trainlab)
	print 'here'
	tim = time.time();

	n = 10
	pred = []
	for i in range(0,n):
		pred.extend(knn.predict(testf[(i*1000):((i+1)*(1000))]))
		print(i)
	print "time: " + str(time.time() - tim) 

	#RMSE:
	testlab = np.array(test.ix[:,4:])
	err = format(np.sqrt(np.sum(np.array(np.array(pred-testlab)**2)/ (testf.shape[0]*24.0))))
	print err        
Example n. 13
def main():

    print "loading data.."
    traindata = list(np.array(p.read_table('/Users/lyj/Downloads/train.tsv'))[:,2])
    testdata = list(np.array(p.read_table('/Users/lyj/Downloads/test.tsv'))[:,2])
    y = np.array(p.read_table('/Users/lyj/Downloads/train.tsv'))[:,-1]

    tfv = TfidfVectorizer(min_df=3,  max_features=None, strip_accents='unicode',
                          analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1)

    rd = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                               C=1, fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None)

    X_all = traindata + testdata
    lentrain = len(traindata)

    print "fitting pipeline"
    tfv.fit(X_all)
    print "transforming data"
    X_all = tfv.transform(X_all)

    X = X_all[:lentrain]
    X_test = X_all[lentrain:]

    print "20 Fold CV Score: ", np.mean(cross_validation.cross_val_score(rd, X, y, cv=20, scoring='roc_auc'))

    print "training on full data"
    rd.fit(X,y)
    pred = rd.predict_proba(X_test)[:,1]
    testfile = p.read_csv('/Users/lyj/Downloads/data/test.tsv', sep="\t", na_values=['?'], index_col=1)
    pred_df = p.DataFrame(pred, index=testfile.index, columns=['label'])
    pred_df.to_csv('benchmark.csv')
    print "submission file created.."
class TfidfBuilder:

    def __init__(self, filtered_out_words=[]):
        self.lemmatizer = WordNetLemmatizer()
        self.tfidf = TfidfVectorizer(tokenizer=self.get_tokens)
        self.filtered_out_words = filtered_out_words

    def filter(self, word):
        result = True
        if word in self.filtered_out_words:
            result = False
        return result

    def get_tokens(self, text):
        all_tokens = nltk.word_tokenize(text)
        filtered_tokens = [word for word in all_tokens if self.filter(word)]
        lemmatized_tokens = [self.lemmatizer.lemmatize(word) for word in filtered_tokens]
        return lemmatized_tokens

    def to_tfidf(self, documents):
        self.tfidf.fit(documents)
        return self.tfidf

    def to_tfidf_vector(self, document):
        return self.tfidf.transform([document]).toarray()
Example n. 15
    def scoring(self):
        ''' Scoring articles based on their frequency of usage on Wikipedia '''

        vectorizer = TfidfVectorizer()
        vectorizer.fit(self.articles)
        idf_score = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

        ## Opening pickle of Wikipedia word frequencies
        with open('wiki_freq.pickle', 'rb') as f:
            wiki_freq = pickle.load(f)

        for i, doc in enumerate(self.docs):
            total_score = 0
            doc_word_score = []
            for word in doc:
                word_score = 0
                try:
                    word_score = wiki_freq[word]*idf_score[word]
                    total_score += word_score
                    doc_word_score.append((word,word_score))
                except KeyError:
                    pass

            doc_word_score.sort(key=lambda x: x[1], reverse=True)
            self.doc_scores.append(total_score/(float(len(doc)+1)))
            self.article_info[i]['topwords'] = doc_word_score[0:25]
            self.article_info[i]['score'] = self.doc_scores[i]
        
        self.article_info.sort(key=lambda x: x['score'], reverse=True)
        self.article_rank = {rank: key for rank, key in enumerate(self.article_info, 1)}
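The scoring above pairs each vocabulary term with its learned IDF weight via idf_. A minimal sketch of that pairing on a toy corpus (get_feature_names_out is the newer-sklearn spelling of the get_feature_names call used above):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["rare words get high idf",
        "common words appear in many docs",
        "words words words"]
vec = TfidfVectorizer().fit(docs)

# term -> inverse document frequency learned from the toy corpus
idf_score = dict(zip(vec.get_feature_names_out(), vec.idf_))
print(sorted(idf_score.items(), key=lambda kv: kv[1], reverse=True)[:5])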
Example n. 16
    def _train(self, train_data, resources):
        sample_length = len(train_data)
        dict_status_path = os.path.join(root_dic,
                                        'dict_vectorizer_{}.status'.
                                        format(sample_length))
        if os.path.isfile(dict_status_path):
            dictVectorizer = joblib.load(dict_status_path)
        else:
            dictVectorizer = DictVectorizer()
            dictVectorizer.fit(train_data[self.features].
                               fillna(0).
                               to_dict('records'))
            joblib.dump(dictVectorizer, dict_status_path)

        tfidf_status_path = os.path.join(root_dic,
                                         'tfidf_vectorizer_{}.status'.
                                         format(sample_length))
        if os.path.isfile(tfidf_status_path):
            tfidf = joblib.load(tfidf_status_path)
        else:
            tfidf = TfidfVectorizer(min_df=40, max_features=300)
            tfidf.fit(train_data.essay)
            joblib.dump(tfidf, tfidf_status_path)

        resources['dictVectorizer'] = dictVectorizer
        resources['tfidf'] = tfidf
        print 'Head Processing Completed'
        return train_data, resources
def vectorize(data, new_doc, local=False):
    """
    Converts data and new doc to vectors that can be used in KNN
    """

    vectorizer = TfidfVectorizer(use_idf=True)
    glossaries = dict(map(lambda x: (x, data.tag_glossary(x)), data.tags()))
    vectorizer.fit(glossaries.values())

    # Get all glossaries for all tags
    glossary_bows = vectorizer.transform(glossaries.values())
    glossary_bows = dict(zip(glossaries.keys(), glossary_bows))

    zero_vector = sparse.csc_matrix((1, len(vectorizer.get_feature_names())))
    descriptors = []

    doc_tags = map(lambda x: (x[0], x[1]['tags']), data.data['items'].items())

    for key, tags in doc_tags:
        bows = map(lambda x: glossary_bows[x], tags)
        descriptor = (sum(bows) + zero_vector) #/ float(len(tags) + 1)
        descriptors += [(key, descriptor)]

    # Get all tags for the new document
    new_doc_descriptor = sum(map(lambda x: glossary_bows[x], new_doc['tags'])) + zero_vector

    if(local):
        return(descriptors, new_doc_descriptor, vectorizer)

    return(descriptors, new_doc_descriptor)
    def create_vectorizer(self, names):
        # create the transform
        vectorizer = TfidfVectorizer(stop_words='english')
        # tokenize and build vocab
        vectorizer.fit(names)

        return vectorizer
def trainTFIDF2(bow21features, bow2kfold, test):
    idx = (test[0][:, 0]).astype(int)
    tfv = TfidfVectorizer(min_df=5, max_df=500, max_features=None, strip_accents='ascii', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True,
                          stop_words='english')
    pipeline = Pipeline(
        [('svd', TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)),
         ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
         ('svm',
          SVC(C=10.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001,
              cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None))])
    tfidf2CrossValidationTest = None
    if toTestModel:
        tfidf2CrossValidationTest = tfidfCrossValidation(tfv, pipeline, bow2kfold)
    trainData, lblsTrain, testData, lblstest = bow21features
    tfv.fit(trainData)
    X_train = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    if isinstance(lblsTrain, list):
        lblsTrain = lblsTrain[0]
    lblsTrain = (lblsTrain.astype(int))
    pipeline.fit(X_train, lblsTrain)
    predictions = pipeline.predict(X_test)
    finalResults = pd.DataFrame({"id": idx, "prediction": predictions})
    return tfidf2CrossValidationTest, finalResults
Example n. 20
def main(files):
    word2vec_file_path = files[0]
    output_file_path = files[1]
    input_file_paths = files[2:]
    print 'processing files:', input_file_paths, 'using word vector file', word2vec_file_path
    print 'outputting to', output_file_path
    vocab = []
    line_counter = 0

    vectorizer = TfidfVectorizer(input='filename')
    vectorizer.fit(input_file_paths)
    for word in vectorizer.vocabulary_:
        word = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", word)
        if not (word in vocab):
            vocab.append(word.encode('ascii', 'replace'))

    print "len vocab =", len(vocab)
    with open(output_file_path, 'w') as output_file:
        with open(word2vec_file_path) as word2vec:
            while True:
                line = word2vec.readline()
                if not line:
                    break
                else:
                    tokens = tokenize(line)
                word, vector = tokens[0], tokens[1:]
                word = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", word)
                if word in vocab:
                    output_file.write(word + ' ')
                    for token in vector:
                        output_file.write(token + ' ')
                    output_file.write('\n')
                    del vocab[vocab.index(word)]
                    line_counter += 1
    print 'len file =', line_counter
Example n. 21
 def extract_tfidf_nmf_feats(self, df_data, n_components):
     """
     Extract tfidf features using nmf.     
     """        
     df_feat = pd.DataFrame(index=range(df_data.shape[0]))
     tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english')
     tsvd = TruncatedSVD(n_components=n_components, random_state = 2016)
     nmf = NMF(solver='cd', n_components=n_components, init='nndsvda',
                 random_state=0, tol=1e-3)
     df_data['q'].to_csv('q', index=False)
     df_data['t'].to_csv('t', index=False)
     df_data['d'].to_csv('d', index=False)
     print('fitting in tfidf')
     tfidf.set_params(input='filename')        
     tfidf.fit(['q','t','d'])
     tfidf.set_params(input='content')  
     for col in ['d', 't', 'q', 'b']:
         print('process column', col)
         txt = df_data[col]
         tfidf_mat = tfidf.transform(txt)
         nd_feat = nmf.fit_transform(tfidf_mat)
         tmp = pd.DataFrame(nd_feat, columns=[col+'_tfidf_nmf_comp'+str(i) \
                                     for i in range(n_components)])
         df_feat = pd.merge(df_feat, tmp, left_index=True, right_index=True)
     saveit(df_feat, 'df_tfidf_nmf_feats')
Example n. 22
def main():
    twenty = fetch_20newsgroups()
    tfidf = TfidfVectorizer().fit_transform(twenty.data)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print related_docs_indices
    print cosine_similarities[related_docs_indices]
    # vectorizer = CountVectorizer(min_df=1)
    # corpus = [
    # 'This is the first document.',
    # 'This is the second second document.',
    # 'And the third one.',
    # 'Is this the first document?',
    # ]

    # tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    # tfs = tfidf.fit_transform(token_dict.values())

    train_set = ("The sky is blue.", "The sun is bright.")
    test_set = ("The sun in the sky is bright.",
                "We can see the shining sun, the bright sun.")
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit_transform(train_set)
    print "Vocabulary:", count_vectorizer.vocabulary
    # Vocabulary: {'blue': 0, 'sun': 1, 'bright': 2, 'sky': 3}
    freq_term_matrix = count_vectorizer.transform(test_set)
    print freq_term_matrix.todense()
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    print "IDF:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print tf_idf_matrix.todense()
Example n. 23
def get_vectorizer():
    if os.path.isfile(j_vec):
        vec = joblib.load(j_vec)
        return vec
    touch(j_vec)
    # load french stop words list
    STOPWORDS = []
    with open("stop-words_french_1_fr.txt", "r") as f:
        STOPWORDS += f.read().split("\n")
    with open("stop-words_french_2_fr.txt", "r") as f:
        STOPWORDS += f.read().split("\n")
    STOPWORDS = set(STOPWORDS)
    vec = TfidfVectorizer(
        min_df=1,
        max_features=123456,
        stop_words=STOPWORDS,
        strip_accents="unicode",
        smooth_idf=True,
        norm="l2",
        sublinear_tf=False,
        use_idf=True,
        ngram_range=(1, 3),
    )
    df_test = pd.read_csv(f_test, sep=";")
    vec.fit(iterText(df_test))
    joblib.dump(vec, j_vec)
    return vec
def train_and_predict_m6 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM6, stemmer_type = 'snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting K-Nearest Neighbors...")
    clf = KNeighborsClassifier(p = 2, n_neighbors = 5)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # Note: minkowski with p > 2 does not work for sparse matrices
    param_grid = {'n_neighbors' : [3, 4, 5, 6, 7], 'weights' : ['uniform', 'distance'], 'leaf_size' : [1, 3, 5, 10] }
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
def vectorize(data, new_doc, local = False):
    """
    Vectorize the data as described in file docstring.
    """
    # Generator for all glossaries
    glossaries = lambda: (data.tag_glossary(t) for t in data.tags())

    # Create the bag of words descriptors for each glossary
    vectorizer = TfidfVectorizer(use_idf=True)
    vectorizer.fit(glossaries())
    tag_bows = dict(zip(data.tags(), vectorizer.transform(glossaries())))

    # Count the number of occurences for each tag
    tag_counter = Counter()
    for i in data.items(): tag_counter.update(data.item(i)['tags'])
        
    # Generator for lists of tags for each item
    item_tags = (data.item(i)['tags'] for i in data.items())

    # The number of dimensions in the bow vector
    v_dim = len(vectorizer.get_feature_names())
    # lambda function to create descriptors
    create_desc = lambda x: create_descriptor(x, tag_bows, tag_counter, 
                                              v_dim, len(data.data['items']))

    # Create descriptors for all known documents and new document
    item_descriptors = [create_desc(tags) for tags in  item_tags]
    new_doc_descriptor = create_desc(new_doc['tags'])
    
    # For analysis or use in other vectorizers, also return the vectorizer itself
    if(local):
        return (zip(data.items(), item_descriptors), new_doc_descriptor, vectorizer)

    # Asssociate document ids with descriptors and return.
    return(zip(data.items(), item_descriptors), new_doc_descriptor)
def train_and_predict_m7 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM7, stemmer_type = 'snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 5, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 5), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting Passive-Aggressive Classifer...")
    clf = PassiveAggressiveClassifier(random_state = randomState, loss = 'squared_hinge', n_iter = 100, C = 0.01)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'C' : [0.003, 0.01, 0.03, 0.1], 'loss': ['hinge', 'squared_hinge'], 'n_iter': [5, 10, 30, 100, 300]}
    #param_grid = {'C' : [0.003, 0.01, 0.03, 0.1, 0.3, 1], 'loss': ['hinge'], 'n_iter': [5, 10, 30, 100, 300, 1000]}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
def train_and_predict_m4 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM4, stemmer_type = 'porter')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 6), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    clf = LogisticRegression(random_state = randomState, penalty = 'l2', C = 12, class_weight = 'auto')
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    #param_grid = {'C' : [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 30], 'penalty' : ['l2']}
    param_grid = {'C' : [1, 3, 5, 6, 7, 8, 10, 11, 12], 'penalty' : ['l2']}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
def train_and_predict_m5 (train, test, labels) :
    # Beautiful soup cleanup and stemming (just to mix it up)
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True, pretag = 'full')
    testData = modified_cleanup(test, stemmer, is_train = False, pretag = 'full')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 3), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print ("Fitting Multinominal Naive Bayes...")
    clf = MultinomialNB(alpha = 0.03)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # param_grid = {'alpha' : [0.01, 0.03, 0.1, 0.3, 1]}
    param_grid = {'alpha' : [0.01, 0.03]}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
def calc_tfidf_cosine(file_name):
	print "calculating cosine similarity"
	data = pd.read_csv(file_name)
	prod_titles = list(data.apply(lambda x:'%s' % (x['product_title']),axis=1))
	queries = list(data.apply(lambda x:'%s' % (x['query']),axis=1))

	# after you know the rest of this is working
	# to be supplied in stop_words as a **LIST**
	# 	stoplist = set('for a of the and to in with an on oz lbs. lbs ft ft. in. ml inch cu. cu ft. ft up cm oz. mm ounce'.split())

	# this improved score so using custom stoplist
	stoplist = list('for a of the and to in with an on oz lbs. lbs ft ft. in. ml inch cu. cu ft. ft up cm oz. mm ounce'.split())

	tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 5), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = stoplist)
    
    # Fit TFIDF
	tfv.fit(prod_titles)
	prod_title_tfidf =  tfv.transform(prod_titles) 
	
	# transpose for matrix multiplication and division
	pt_tfidf_T = np.transpose(prod_title_tfidf)
	print pt_tfidf_T.shape

	tfv.fit(queries)
	query_tfidf = tfv.transform(queries)
	q_tfidf_T = np.transpose(query_tfidf)
	print q_tfidf_T.shape

	cosine_tfidf = cosine_similarity(pt_tfidf_T[0:1], q_tfidf_T[0:1])
	print cosine_tfidf

	return cosine_tfidf
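The function above fits separate vocabularies for titles and queries and then compares only the first rows of the transposed matrices, which yields a single scalar rather than one similarity per (query, title) pair. A common alternative (not the author's code, just a hedged sketch) is to fit one vectorizer on both columns and take the row-wise cosine similarity between each query and its product title:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy query/title pairs, illustration only
queries = ["angle bracket", "wood screw"]
titles = ["steel angle bracket 2 pack", "exterior wood deck screws"]

tfv = TfidfVectorizer(ngram_range=(1, 2))
tfv.fit(queries + titles)                 # shared vocabulary for both sides

Q = tfv.transform(queries)
T = tfv.transform(titles)

# Diagonal = cosine similarity of each query with its own title
print(cosine_similarity(Q, T).diagonal())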
def tfIDFeats(ids,data):


    # the infamous tfidf vectorizer (Do you remember this one?)
    tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 5), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
    # Fit TFIDF
    tfv.fit(data)
    X =  tfv.transform(data) 
        # Initialize SVD

    svd = TruncatedSVD(n_components=350)
    
    # Initialize the standard scaler 
    scl = StandardScaler( with_mean=False)
    
    
    
    if X.shape[1]>350:
        X = svd.fit_transform(X)
    X = scl.fit_transform(X,ids)
    if plotData:
        X = PCA(n_components=2).fit_transform(X)
    return (X,ids)
x_text = X.Text
x_sm = X[feature_cols_sm]

# #Converting data frame to sparce matrix
x_sm = scipy.sparse.csr_matrix(x_sm.values)
y = dataset.IsCyberbullying  # Target

# # # 1.2) Feature Extraction (Textual Features)

# # The terms' weights were calculated using the Term Frequency - Inverse Document Frequency (TF-IDF)
tfidf_vect = TfidfVectorizer(analyzer='word',
                             token_pattern=r'\w{1,}',
                             max_features=50000)
tfidf_vect.fit(x_text)
x_text_tfidf = tfidf_vect.transform(x_text)

# 1.3) Feature Selection (Textual Features)

# Feature selection  using a chi-square score was applied  for each applied machine learning algorithm to select relevant textual features.

# COMMENT OUT following code block for experimenting different feature sizes for each classifier
clf = svm.SVC()
for x in range(5, 23, 15):
    test = SelectKBest(score_func=chi2, k=x)
    fit = test.fit(x_sm, y)
    x_s = fit.transform(x_sm)
    scores = cross_val_score(clf, x_s, y, cv=10)
    # print(scores)
test = SelectKBest(score_func=chi2, k=15)
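The chi-square selection above returns a reduced matrix but not the names of the surviving features. A hedged sketch of recovering them with get_support(), on a toy corpus (names and sizes are illustrative only):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

docs = ["you are stupid", "have a nice day", "stupid idiot", "nice work today"]
labels = [1, 0, 1, 0]

vec = TfidfVectorizer()
X = vec.fit_transform(docs)

selector = SelectKBest(score_func=chi2, k=3).fit(X, labels)
kept = vec.get_feature_names_out()[selector.get_support()]
print(kept)   # the 3 terms with the highest chi-square scores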
Example n. 32
#f = open("../datasets/classifier2_datasetA_pickle.pkl", "rb")
f = open("../datasets/classifier1_datasetA_Combined_pickle.pkl", "rb")
filter_tweets = pickle.load(f)
y = pickle.load(f)

print(len(filter_tweets))
print(len(y))
def baseform(input):
    ans=[]
    for i in word_tokenize(input):
        ans.append(nltk.wordnet.WordNetLemmatizer().lemmatize(i))
        #Alternative ans.append(PorterStemmer().stem(i))
    return ans
train_X, test_X, train_y, test_y = train_test_split(filter_tweets, y, test_size = 0.25, random_state = 42)
vectorizer = TfidfVectorizer(ngram_range = (1,2),tokenizer=baseform,max_features=2000)#Remove max features parameter for 2nd classifier
vectorizer.fit(train_X, train_y)
print(len(train_X))
print(len(test_X))
train_X = vectorizer.transform(train_X)
test_X = vectorizer.transform(test_X)
print(train_X.shape)
print(test_X.shape)
train_X = train_X.toarray()
test_X = test_X.toarray()

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
clf_RandomForest = RandomForestClassifier(n_estimators=120)
clf_RandomForest.fit(train_X, train_y)
Example n. 33
print('Done!\n')

## Split data for train and test the model
print(
    "\n\n#############################################################################################################"
)
print("\nTraining Classifier...")
X_train, X_test, y_train, y_test = train_test_split(whole_text,
                                                    class_label,
                                                    train_size=0.75,
                                                    test_size=0.25)

# Vectorization
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5))
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)
joblib.dump(vectorizer, filename_vec)

## Deletion of repeated columns.
variance_filtter = VarianceThreshold()
variance_filtter.fit(X_train, y_train)
joblib.dump(variance_filtter, filename_variance)
X_train = variance_filtter.transform(X_train)
X_test = variance_filtter.transform(X_test)

percentile_filtter = SelectPercentile(f_classif, percentile=50)
percentile_filtter.fit(X_train, y_train)
X_train = percentile_filtter.transform(X_train)
X_test = percentile_filtter.transform(X_test)
def tfidf_transform(weights):
    tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(), norm=None)
    tfidf.fit(weights.Tweet)
    features = pd.Series(tfidf.get_feature_names())
    transformed = tfidf.transform(weights.Tweet)
    return features, transformed
Example n. 35
f1 = open(fpTestWLabel, 'r')
arrLabels = f1.read().strip().split('\n')
f1.close()
f1 = open(fpTestWLocation, 'r')
arrLocations = f1.read().strip().split('\n')
f1.close()

for i in range(0, len(arrItems)):
    item = arrItems[i]
    X_TestW.append(item)
    y_TestW.append(arrLabels[i])
    l_TestW.append(arrLocations[i])
    lstAllText.append(item)

vectorizer = TfidfVectorizer(ngram_range=(1, 4), max_features=1000)
model = vectorizer.fit(lstAllText)
vec_total_all = model.transform(lstAllText).toarray()
vec_train_all = model.transform(X_Train).toarray()
vec_testP_all = model.transform(X_TestP).toarray()
vec_testW_all = model.transform(X_TestW).toarray()
#pca = PCA(n_components=100)
print('prepare to fit transform')
# modelPCA = pca.fit(vec_total_all)
# vec_train=modelPCA.transform(vec_train_all)
# vec_testP=modelPCA.transform(vec_testP_all)
# vec_testW=modelPCA.transform(vec_testW_all)
vec_train = vec_train_all
vec_testP = vec_testP_all
vec_testW = vec_testW_all

print('end fit transform')
Example n. 36
train = pd.read_csv('C:/Users/Jie.Hu/Desktop/Data Science/Practice/Kaggle_nlp_1/train.csv').fillna("unknown")
test = pd.read_csv('C:/Users/Jie.Hu/Desktop/Data Science/Practice/Kaggle_nlp_1/test.csv').fillna("unknown")

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    #token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 3))
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 4))
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

train_features = hstack([train_word_features, train_char_features])
test_features = hstack([test_word_features, test_char_features])
Example n. 37
# comments_test = transform_com.transform(test['comment_text'])
# gc.collect()

word_vect = word_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                              strip_accents='unicode',
                                              analyzer='word',
                                              token_pattern=r'\w{1,}',
                                              ngram_range=(1, 1),
                                              max_features=20000)
char_vect = TfidfVectorizer(sublinear_tf=True,
                            strip_accents='unicode',
                            analyzer='char',
                            ngram_range=(1, 4),
                            max_features=20000)
word_vect.fit(pd.concat([train['comment_text'], test['comment_text']], axis=0))
char_vect.fit(pd.concat([train['comment_text'], test['comment_text']], axis=0))
comments_train_word = word_vect.transform(train_mes)
comments_train_char = char_vect.transform(train_mes)
comments_valid_word = word_vect.transform(valid_mes)
comments_valid_char = char_vect.transform(valid_mes)
comments_test_word = word_vect.transform(test['comment_text'])
comments_test_char = char_vect.transform(test['comment_text'])
comments_train = hstack((comments_train_word, comments_train_char))
comments_valid = hstack((comments_valid_word, comments_valid_char))
comments_test = hstack((comments_test_word, comments_test_char))

import xgboost as xgb
'''
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2, num_rounds=400):
    param = {}
    param['objective'] = 'binary:logistic'
Example n. 38
    return txt


#print(raw_data.head())

X_train, X_test, Y_train, Y_test = train_test_split(
    raw_data[['text', 'text_length', 'Punct_pc']],
    raw_data['label'],
    test_size=0.2,
    random_state=123)
'''print(pd.crosstab(Y_train,columns = 'label',normalize=True))
print(pd.crosstab(Y_test,columns = 'label',normalize=True))
print(X_train.head())'''

Tfidf_Vect = TfidfVectorizer(analyzer=clean_data)
Tfidf_vect_fit = Tfidf_Vect.fit(X_train['text'])

X_train_Tfidf_vect = Tfidf_vect_fit.transform(X_train['text'])
X_test_Tfidf_vect = Tfidf_vect_fit.transform(X_test['text'])

X_train_vect = pd.concat([
    X_train[['text_length', 'Punct_pc']].reset_index(drop=True),
    pd.DataFrame(X_train_Tfidf_vect.toarray())
],
                         axis=1)

X_test_vect = pd.concat([
    X_test[['text_length', 'Punct_pc']].reset_index(drop=True),
    pd.DataFrame(X_test_Tfidf_vect.toarray())
],
                        axis=1)
#print (xtrain.shape)
#print (xvalid.shape)

# Using TF-IDF as features

tfv = TfidfVectorizer(min_df=1,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')



## Fit and transform reviews text to sparse TF-IDF features matrix
			
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)


#Using Bag of Words as features

ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

Example n. 40
print("Se cargaron datos de clasificación en: " + str(tiempoFinal))

################################################################################
########## Vectorizacion                                              ##########
################################################################################
print("----------------------------------")
print("\n Creando la representacion numerica (vectorizando tweets)")
tiempo0 = time()
# max_features : int or None, default=None
#If not None, build a vocabulary that only consider the top max_features
# ordered by term frequency across the corpus.
# This parameter is ignored if vocabulary is not None.
vectorizador = TfidfVectorizer(max_features=54)
#vectorizador = TfidfVectorizer(max_features=74)
#TfidfVectorizer: Convert a collection of raw documents to a matrix of TF-IDF features.
vectorizador.fit(bolsaDePalabras)
#fit(raw_documents[, y])	Learn vocabulary and idf from training set.
x_matrizSetEntrenamientoVect = vectorizador.fit_transform(
    x_setEntrenamientoTweets)

#Learn vocabulary and idf, return term-document matrix.
#This is equivalent to fit followed by transform, but more efficiently implemented.
#Parameters:	raw_documents : iterable
#an iterable which yields either str, unicode or file objects
#Returns:
#X : sparse matrix, [n_samples, n_features]
#Tf-idf-weighted document-term matrix.

print(x_matrizSetEntrenamientoVect)
#transform(raw_documents, copy=True)[source].
#Uses the vocabulary and document frequencies (df) learned by fit (or fit_transform).
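The comments above restate the scikit-learn docs: fit_transform is equivalent to fit followed by transform on the same documents, just done in one pass. A minimal sketch of that equivalence on a toy corpus:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["uno dos tres", "dos tres cuatro", "tres cuatro cinco"]

a = TfidfVectorizer().fit(docs).transform(docs)   # fit, then transform
b = TfidfVectorizer().fit_transform(docs)         # single pass

print(np.allclose(a.toarray(), b.toarray()))      # True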
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, "kfold"] = f

    # creating test and train dataframes
    for fold_ in range(5):
        train_df = df[df.kfold != fold_].reset_index(drop=True)
        test_df = df[df.kfold == fold_].reset_index(drop=True)

        # tfidf vectorizer initialization
        tfidf = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)

        # print(cv)

        # fit the tfidf vectorizer to the real/fake tweets (tweet text)
        tfidf.fit(train_df.text)

        # transform into sparse term-document matrix
        xtrain = tfidf.transform(train_df.text)
        xtest = tfidf.transform(test_df.text)
        xvalidate = tfidf.transform(validation_df.text)

        # print(xtrain)
        # print("_"*50)
        # print(xtest)

        # Logistic Regression Model
        logistic_model = linear_model.LogisticRegression(solver="liblinear")

        # fit logistic model
        logistic_model.fit(xtrain, train_df.target)
per_word_vocab = dict()
for substitutes_dump in substs_list:
    loader = Substs_loader(data_name,
                           lemmatizing_method='all',
                           drop_duplicates=False,
                           count_lemmas_weights=True)
    df = loader.get_substitutes(substitutes_dump, topk)
    substs_texts = df['substs']

    for word in df.word.unique():
        mask = (df.word == word)
        vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",
                              min_df=min_df,
                              max_df=max_df)
        vec.fit(substs_texts[mask])
        words = set([w for w in vec.vocabulary_])
        if word not in per_word_vocab:
            per_word_vocab[word] = words
        else:
            per_word_vocab[word].update(words)

for word in per_word_vocab:
    per_word_vocab[word] = {w: i for i, w in enumerate(per_word_vocab[word])}
data = dict()

for substitutes_dump in substs_list:
    loader = Substs_loader(data_name,
                           lemmatizing_method='all',
                           drop_duplicates=False,
                           count_lemmas_weights=True)
Example n. 43
    def feature_engineering(self,
                            sentence_df,
                            input_column,
                            output_column,
                            method='tfidf',
                            model_param={}):
        """
        建立文字特徵
        :param sentence_df:
        :param input_column:
        :param output_column:
        :param method:
        :param model_param:
        :return:
        """

        method = method.lower()
        cut_words = sentence_df[input_column]

        print(cut_words)

        if method == 'tfidf':
            """
            TF-IDF(詞頻/逆文檔頻率)是最流行的IR(信息檢索)技術之一
            用於分析單詞在文檔中的重要性。研究表明,83%的基於文本的推薦系統使用TF-IDF
            TF-IDF衡量文檔中文字的重要性
            例如,「the」在任何文檔中都是常用的,因此TF-IDF並不認為「the」對文檔的特性很重要
            相反,IT相關主題使用「python」,TF-IDF認為「python」是識別主題和類別的重要特徵詞
            """

            vectorizer = TfidfVectorizer(norm=None, stop_words=None)

            # When predicting, new words must be added and the vocabulary recomputed
            if self.transformer:
                # vectorizer.vocabulary = self.transformer.vocabulary_
                x_train_feature = self.transformer.transform(cut_words)
            else:
                self.transformer = vectorizer.fit(cut_words)
                x_train_feature = self.transformer.transform(cut_words)

            # x_train_feature = self.transformer.transform(cut_words)
            output_value = list(x_train_feature.toarray())

        elif method == 'count':
            """
            Bag of Words是文檔數據的表示模型
            它簡單地計算單詞在文檔中出現的次數
            Bag-of-Words通常通過權衡特殊單詞和相關術語來用於聚類,分類和主題建模
            """

            vectorizer = CountVectorizer(stop_words=None,
                                         token_pattern=r"(?u)\b\w+\b")

            # When predicting, new words must be added and the vocabulary recomputed
            if self.transformer:
                vectorizer.vocabulary = self.transformer.vocabulary_

            self.transformer = vectorizer.fit(cut_words)
            x_train_feature = self.transformer.transform(cut_words)
            output_value = list(x_train_feature.toarray())

        elif method == 'word2vec':
            """
            Word2vec擅長對相似的單詞進行分組,並根據上下文對單詞的含義進行高度準確的猜測
            它內部有兩種不同的算法:CBoW(Continuous Bag-of-Words)和skip gram模型
            Skip Gram用於預測目標詞的上下文
            CBoW是從上下文預測目標詞
            """
            output_value = []
        else:
            output_value = []

        # convert np array to dataframe series
        sentence_df[output_column] = output_value

        # # No need to save the feature transformer at prediction time
        # if is_training:
        #     # Save the feature transformer to fs; it must be used at ML prediction time
        #     self.save_vectorizer(feature_transformer_path, feature_transformer_name)

        return sentence_df
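A minimal sketch contrasting the 'tfidf' and 'count' branches above on a toy corpus: CountVectorizer returns raw term counts, while TfidfVectorizer down-weights terms that occur in every document and L2-normalizes each row.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

docs = ["the cat sat", "the dog sat", "the bird flew"]

counts = CountVectorizer().fit_transform(docs)   # raw term counts
tfidf = TfidfVectorizer().fit_transform(docs)    # idf-weighted, l2-normalized

print(counts.toarray())
print(tfidf.toarray().round(2))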
classification = model.predict(X_test)
classification = pd.DataFrame(classification)
classification = classification.set_index(a.index)
a.index
classification.index
list(a)
data = pd.concat([a[['raw_Comments']], classification], axis = 1)
data.head
data.shape

#data.to_excel("Software_Calls_Classification.xlsx")

os.chdir(r"C:\Users\SG185314\Desktop\Education\py\Software")


trftran = tfidfconverter.fit(df["cleaned_Comments"])
y = trftran.fit_transform(df["cleaned_Comments"])

with open('text_classifier', 'wb') as picklefile:  
    pickle.dump(clf, picklefile)

with open('text_classifier', 'rb') as training_model:  
    model = pickle.load(training_model)

with open('tfidf_transform', 'wb') as picklefile:  
    pickle.dump(trftran, picklefile)

with open('tfidf_transform', 'rb') as training_model:  
    transformation = pickle.load(training_model)

Example n. 45
from sklearn.pipeline import Pipeline
from sklearn import svm
import numpy as np

if __name__ == '__main__':
    model_file_name = 'model'
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        '--training_data_file',
                        type=str,
                        default='train.json')

    args = parser.parse_args()
    training_data_file = args.training_data_file
    with open(training_data_file, 'rb') as f:
        train_data = json.loads(f.read())

    x_train = [t['data'] for t in train_data]
    y_train = [t['label'] for t in train_data]

    tv = TfidfVectorizer(max_features=50000, ngram_range=(1, 5))
    tv.fit(x_train)

    classifier = svm.LinearSVC()
    classifier.fit(tv.transform(x_train), y_train)
    joblib.dump(classifier, model_file_name)

    # Pipeline = ([('vect', TfidfVectorizer()), ('lsvc', svm.LinearSVC())])

    # parameter = {'vect__max_feature': 15000, 'vect__ngram_range': [(1, 5)], 'lsvc': }
    except:
        print(" error:", sys.exc_info(), 'sentence:', sentence)
        return ""


goods_nms = []
goods_nms = datas['goods_nm'].values

with Pool(processes=cpu_count()) as pool:
    goods_nms = pool.map(word_analysis, goods_nms)

# for i, row in datas.iterrows():
#     goods_nms.append(word_analysis(row.goods_nm))
#     if  i%10000 == 0:
#         print(datetime.datetime.now(),': word_analysis:', i)
#

datas['goods_nm_ana'] = goods_nms  # morphological analysis complete
datas = datas.drop(columns=['goods_nm'])

ctv = TfidfVectorizer()
ctv.fit(datas['goods_nm_ana'])

with open('cate_suggest_ctv.dump', 'wb') as f:
    pickle.dump(ctv, f, pickle.HIGHEST_PROTOCOL)

with open('cate_suggest_datas.dump', 'wb') as f:
    pickle.dump(datas, f, pickle.HIGHEST_PROTOCOL)

print(datetime.datetime.now(), ':_fin_')
        pearsonr(df_train['severe_toxic'].values, feature_values)[0],
        pearsonr(df_train['obscene'].values, feature_values)[0],
        pearsonr(df_train['threat'].values, feature_values)[0],
        pearsonr(df_train['insult'].values, feature_values)[0],
        pearsonr(df_train['identity_hate'].values, feature_values)[0]
    ))
exit()


#stemmer = PorterStemmer()
#my_tokenizer = lambda sentence: [stemmer.stem(t) for t in wordpunct_tokenize(sentence.lower())]
#my_tokenizer = lambda sentence: [stemmer.stem(t) for t in sentence.lower().split()]
my_tokenizer = lambda s: preprocess_string(s)

char_trigrams = TfidfVectorizer(min_df=10, max_df=0.75, strip_accents='ascii', analyzer='char', ngram_range=(3, 3), sublinear_tf=True)
char_trigrams.fit(list(X_train.values) + list(X_test.values))

# TODO min_df=3 ?!? (Helps score but it seems wrong)
#word_vect = TfidfVectorizer(min_df=50, max_df=0.75, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', sublinear_tf=True, stop_words='english', ngram_range=(2, 2))
word_ngrams = TfidfVectorizer(min_df=3, max_df=0.75, strip_accents='unicode', analyzer='word', tokenizer=my_tokenizer, sublinear_tf=True, stop_words='english', ngram_range=(1, 2))
word_ngrams.fit(list(X_train.values) + list(X_test.values))

#word_unigrams = TfidfVectorizer(min_df=50, max_df=0.75, strip_accents='unicode', analyzer='word', tokenizer=my_tokenizer, sublinear_tf=True, stop_words='english')
#word_unigrams = TfidfVectorizer(min_df=50, max_df=0.75, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', sublinear_tf=True, stop_words='english')
#word_unigrams.fit(list(X_train.values) + list(X_test.values))

# TODO Can we do something with PCA here ?
#pca = PCA()

def calc_feature_sparse(data, feature_function):
    return csr_matrix(np.reshape(data.map(feature_function).values, (data.shape[0], 1)))
Example n. 48
    Corpus.loc[index, 'text_final'] = str(Final_words)

#print(Corpus['text_final'].head())

# Step - 2: Split the model into Train and Test Data set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'], Corpus['label'], test_size=0.3)

# Step - 3: Label encode the target variable  - This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

# Step - 4: Vectorize the words by using TF-IDF Vectorizer - This is done to find how important a word in a document is in comparison to the corpus
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

# Step - 5: Now we can run different algorithms to classify out data check for accuracy

# Classifier - Algorithm - Naive Bayes
# fit the training dataset on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
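The listing is cut off before the accuracy computation the last comment refers to. A hedged continuation, reusing the names defined above (hypothetical, not part of the original source):

from sklearn.metrics import accuracy_score

# Hypothetical continuation of the truncated snippet above
print("Naive Bayes accuracy:", accuracy_score(Test_Y, predictions_NB))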
Example n. 49
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

train = pd.read_csv('data/raw/train.csv.zip').fillna(' ')
test = pd.read_csv('data/raw/test.csv.zip').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

train_text_flagged = train[train.iloc[:, 2:].sum(axis=1) > 0]['comment_text']

flagged_word_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                          strip_accents='unicode',
                                          analyzer='word',
                                          token_pattern=r'\w{1,}',
                                          stop_words='english',
                                          ngram_range=(1, 2),
                                          max_features=20000)
flagged_word_vectorizer.fit(train_text_flagged)
print('Finished fitting flagged_word_vectorizer')
train_flagged_features = flagged_word_vectorizer.transform(train_text)
test_flagged_features = flagged_word_vectorizer.transform(test_text)
print('Finished transforming flagged_word_vectorizer')

pickle.dump(train_flagged_features,
            open('src/data/train_flagged_features.sav', 'wb'))
pickle.dump(test_flagged_features,
            open('src/data/test_flagged_features.sav', 'wb'))
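
# Hedged sketch of the consuming side (an assumption about the rest of the
# pipeline, not part of this script): load the pickled sparse matrices back
# and stack them next to a baseline word TF-IDF block.
from scipy.sparse import hstack

train_flagged_features = pickle.load(open('src/data/train_flagged_features.sav', 'rb'))
test_flagged_features = pickle.load(open('src/data/test_flagged_features.sav', 'rb'))

base_word_vectorizer = TfidfVectorizer(sublinear_tf=True, analyzer='word',
                                       stop_words='english', max_features=20000)
base_word_vectorizer.fit(all_text)
train_features = hstack([base_word_vectorizer.transform(train_text),
                         train_flagged_features]).tocsr()
test_features = hstack([base_word_vectorizer.transform(test_text),
                        test_flagged_features]).tocsr()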
Esempio n. 50
# Split data into train and validation and create TF-IDF vectorizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

train_x, valid_x, train_y, valid_y = \
    train_test_split(df['clean_text'], df['product'], \
    test_size=0.2, random_state=42)
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)  # reuse the mapping fitted on the training labels

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    # Regex r'\w{1,}': tokens are runs of one or more word characters (letters, digits, or underscore)
tfidf_vect.fit(df['clean_text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# Logistic regression

from sklearn.linear_model import LogisticRegression
# Fit with the defaults spelled out explicitly: l2 penalty, C=1.0, liblinear solver.
model = LogisticRegression(C=1.0, penalty='l2', solver='liblinear',
                           max_iter=100).fit(xtrain_tfidf, train_y)

# Compute model accuracy and confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
preds = model.predict(xvalid_tfidf)
print('Accuracy:', accuracy_score(valid_y, preds))
print(confusion_matrix(valid_y, preds))
print(classification_report(valid_y, preds))
Esempio n. 51
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    trainDF['tweets'],
    trainDF['labels'],
    train_size=0.8,
    test_size=0.2,
    stratify=trainDF['labels'])

#encode test_y and train_y
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word',
                             token_pattern=r'\w{1,}',
                             max_features=280)
tfidf_vect.fit(trainDF['tweets'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xtest_tfidf = tfidf_vect.transform(test_x)

# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['tweets'])
# transform the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xtest_count = count_vect.transform(test_x)

# retrieve the encoded index of the 'personal' class
codeSubj = 0
for i, item in enumerate(encoder.classes_):
    if (item == 'personal'):
        codeSubj = i
import time
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

t_start = time.time()
"""=====================================================================================================================
1 Data preprocessing
"""
df_train = pd.read_csv('../data/train_set.csv')
df_train.drop(columns='article', inplace=True)
df_test = pd.read_csv('../data/test_set.csv')
df_test.drop(columns='article', inplace=True)
df_all = pd.concat(objs=[df_train, df_test], axis=0, sort=True)
y_train = (df_train['class'] - 1).values
"""=====================================================================================================================
2 特征工程
"""
vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             min_df=3,
                             max_df=0.9,
                             max_features=1000)
vectorizer.fit(df_all['word_seg'])
x_train = vectorizer.transform(df_train['word_seg'])
x_test = vectorizer.transform(df_test['word_seg'])
"""=====================================================================================================================
3 保存至本地
"""
data = (x_train, y_train, x_test)
fp = open('./data_tfidf_1000.pkl', 'wb')
pickle.dump(data, fp)
fp.close()

t_end = time.time()
print("已将原始数据数字化为tfidf特征,共耗时:{}min".format((t_end - t_start) / 60))
Esempio n. 53
import os

from sklearn.feature_extraction.text import TfidfVectorizer


class CustomTextVectorizer():
    def __init__(self, **args):
        self.data = args['data']
        self.labels = args.get('labels')
        self.stop_words = args.get('stop_words')
        self.fn = args.get('fn')
        self.vecs = None

        if self.fn is None:
            self.fn = 'training_vecs.csv'

        # best so far
        # stop_words = self.stop_words,
        # ngram_range = (2, 3),
        # max_df = .2,
        # max_features = 16000

        # after normalization & new stop words:
        # stop_words = self.stop_words,
        # ngram_range = (2, 3),
        # max_df = .7,
        # max_features = 128000
        # ~.5 precision & recall on test data

        # works with culling
        # analyzer = 'char',
        # stop_words = self.stop_words,
        # ngram_range = (3, 4),
        # max_df = .7,
        # max_features = 6000
        # 48k features best on train set.
        if args.get('vectorizer') is not None:
            self.vectorizer = args.get('vectorizer')
        else:
            self.vectorizer = TfidfVectorizer(stop_words=self.stop_words,
                                              ngram_range=(2, 3),
                                              max_df=.9,
                                              max_features=16000)

        self.vectorizer.fit(self.data['text'])
        print(len(self.vectorizer.vocabulary_.items()))

    def vectorize(self, data):
        self.vecs = self.vectorizer.transform(data)
        return self.vectorizer

    def write(self):
        # build the output path relative to this file's directory
        out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.fn)
        with open(out_path, 'w') as outf:
            outf.write('id,' + ','.join(
                ['f' + str(i)
                 for i in range(len(self.vecs.toarray()[0]))]) + ',label\n')
            for index, itm in enumerate(self.vecs.toarray()):
                current_row = str(self.data['id'][index]) + ',' + ','.join(
                    list(str(f) for f in itm))
                if self.labels is not None:
                    current_row += ',' + str(int(self.labels[index]))
                outf.write(current_row + '\n')
            outf.close()

    def dump(self):
        return self.vecs
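
# Hedged usage sketch (the toy data and file name are illustrative, not from
# the original project): fit the vectorizer, transform the same text, and
# write the dense feature CSV.
import pandas as pd

toy = pd.DataFrame({'id': [1, 2],
                    'text': ['first toy document here', 'second toy document here'],
                    'label': [0, 1]})
ctv = CustomTextVectorizer(data=toy, labels=toy['label'].values,
                           stop_words='english', fn='toy_vecs.csv')
ctv.vectorize(toy['text'])
ctv.write()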
Esempio n. 54
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# 3k3.json is the source file
f = open("3k3.json", "r")
content = f.readlines()
f.close()

#get text from tweets
l = []
for i in range(len(content)):
    k = content[i].split(",")
    l.append(k[3][7:])

vectorizer = TfidfVectorizer()
vectorizer.fit(l)
vector = vectorizer.transform([l[0]])

# pair the idf weights into 2-D points for clustering (drop the last one if the count is odd)
if len(vectorizer.idf_) % 2 == 1:
    a = vectorizer.idf_[:-1]
else:
    a = vectorizer.idf_
c = a.reshape((int(len(a) / 2), 2))

kmeans = KMeans(n_clusters=3)
kmeans.fit(c)
r = kmeans.cluster_centers_
plt.scatter(c[:, 0], c[:, 1], s=50, c='b')
plt.scatter(r[0][0], r[0][1], s=200, c='g', marker='s')
plt.scatter(r[1][0], r[1][1], s=200, c='r', marker='s')
plt.scatter(r[2][0], r[2][1], s=200, c='y', marker='s')
plt.show()
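
# Hedged alternative sketch (not the approach used above): cluster the
# per-tweet TF-IDF vectors themselves; KMeans accepts the sparse matrix directly.
doc_vectors = vectorizer.transform(l)
doc_kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
doc_labels = doc_kmeans.fit_predict(doc_vectors)
print(doc_labels[:20])
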
import json
import time
from collections import defaultdict

import numpy as np
import scipy as sp
import scipy.spatial.distance  # makes sp.spatial.distance available
from sklearn.feature_extraction.text import TfidfVectorizer

# ngrams, ngrams_finetooth and PRINT_VERBOSE are referenced below but defined
# elsewhere in the original module (an assumed sketch of the analyzers follows the function).


def namestring_match(external_names,
                     internal_names,
                     THRESH_1,
                     THRESH_2,
                     doBestMatch=False,
                     loadPriorRoughmatch=False,
                     save_path='matching__temp.json'):
    # get feature vectors for vocab_subset + sequela

    if not (loadPriorRoughmatch):

        start_time = time.time()

        vectorizer = TfidfVectorizer(
            min_df=1, analyzer=ngrams
        )  # note that authors with many publications will have bigger idf components

        vectorizer.fit(internal_names)

        rough_edgelist = []  # (external name, internal name)

        curPosition = 0
        blockSize = 50000
        N_vocab = len(internal_names)

        #  estimated  30 min for entire list
        while curPosition < N_vocab:

            print("cur block: {} : {}".format(curPosition,
                                              curPosition + blockSize))

            # get next block of embedding words to match
            if (curPosition + blockSize) < N_vocab:
                # todo loop over vocab by subsetting in blocks
                vocab_subset = internal_names[curPosition:(curPosition + blockSize)]
            else:
                # avoid idx overflow
                vocab_subset = internal_names[curPosition:]

            combined_words = external_names + vocab_subset

            # create feature vector for this block
            tf_idf_matrix = vectorizer.transform(combined_words).toarray()  # rows are L2 normalized

            external_features = tf_idf_matrix[:len(external_names), :]  # first portion of concatenated list
            internal_features = tf_idf_matrix[len(external_names):, :]  # second portion of concatenated list
            similarity_matrix = np.matmul(external_features, internal_features.T)  # match external strings to internal strings
            #similarity_scores = similarity_matrix.flatten()

            # first-pass cutoff
            #cutoff = np.percentile(similarity_scores, P_THRESH_1)  # consider iterative filtering
            cutoff = THRESH_1
            print("cutoff: {}".format(cutoff))
            for i_row, ext_name in enumerate(
                    external_names):  # for each disease in the sequelae list
                internal_idxs = np.where(
                    similarity_matrix[i_row, :] > cutoff)[0]
                for i in internal_idxs:
                    int_name = vocab_subset[i]
                    rough_edgelist.append((ext_name, int_name))
                    if PRINT_VERBOSE:  # just print a few of these to get a flavor
                        print(ext_name, ':', int_name)

            loopTime = time.time()
            print("elapsed: {}".format(loopTime - start_time))

            curPosition += blockSize

        save_obj = {
            'rough_edgelist': rough_edgelist,
            'external_names': external_names,
            'internal_names': internal_names
        }
        # todo save and export rough matches since this step takes ~30 min
        with open(save_path, 'w') as f:
            json.dump(save_obj, f)

        N_roughmatches = len(rough_edgelist)  # expected ~ 1 500 000
        print("N rough matches: {}".format(
            N_roughmatches))  # if this is big another pass will be necessary

    else:
        with open(save_path, 'r') as f:
            data = json.load(f)
            rough_edgelist = data['rough_edgelist']
            external_names = data['external_names']
            internal_names = data['internal_names']
    #################
    # 2nd filtering step - 2-grams

    print(rough_edgelist[:200])

    # filter as a second pass

    vectorizer_2 = TfidfVectorizer(min_df=1, analyzer=ngrams_finetooth)
    #
    secondpass_words = external_names + [e[1] for e in rough_edgelist]
    #  build data structure for fast indexing:
    d_externalIdxs, d_internalIdxs = {}, {}
    L_ext = len(external_names)
    for i, w in enumerate(secondpass_words[:L_ext]):
        d_externalIdxs[w] = i
    for i, w in enumerate(secondpass_words[L_ext:]):
        d_internalIdxs[w] = L_ext + i

    tf_idf_matrix2 = vectorizer_2.fit_transform(
        secondpass_words).toarray()  # rows are L2 normalized

    scores = []
    for i_edge, edge in enumerate(rough_edgelist):
        if not (i_edge % 100000):
            print("edge number {}".format(i_edge))
        ext_name = edge[0]
        int_name = edge[1]
        s_idx = d_externalIdxs[ext_name]
        t_idx = d_internalIdxs[int_name]
        v_s = tf_idf_matrix2[s_idx, :]
        v_t = tf_idf_matrix2[t_idx, :]
        similarity_score = 1 - sp.spatial.distance.cosine(v_s, v_t)
        scores.append(similarity_score)

    scores = np.nan_to_num(
        scores)  # catch the ' ' and '3' that slipped through

    # second-pass cutoff
    #cutoff = np.percentile(scores, P_THRESH_2)  # consider iterative filtering
    cutoff = THRESH_2
    print("cutoff: {}".format(cutoff))

    edgelist = []
    # d_matches := external (key) -> internal (value)
    d_matches = defaultdict(list)  # convenience, alternative representation
    for idx, edge in enumerate(rough_edgelist):
        if scores[idx] > cutoff:
            new_edge = (edge[0], edge[1], scores[idx])
            edgelist.append(new_edge)
            d_matches[edge[0]].append((edge[1], scores[idx]))
            print(new_edge)

    if doBestMatch:  # match external name to single best internal name

        edgelist_bestmatch = []
        for key in d_matches:
            scores = [t[1] for t in d_matches[key]]
            tokens = [t[0] for t in d_matches[key]]
            max_score_idx = np.argmax(scores)
            best_token = tokens[max_score_idx]
            edgelist_bestmatch.append((key, best_token, scores[max_score_idx]))

        # todo: should probably leave the punctuation in place when matching in prior steps -
        #    currently it is stripped out with a regexp before building the n-grams
        edgelist = edgelist_bestmatch

    save_obj = {
        'rough_edgelist': rough_edgelist,
        'external_names': external_names,
        'internal_names': internal_names,
        'edgelist': edgelist
    }
    with open(save_path, 'w') as f:
        json.dump(save_obj, f)

    return edgelist
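
# The vectorizers above rely on custom analyzers (ngrams, ngrams_finetooth)
# and a PRINT_VERBOSE flag defined elsewhere in the original module. The
# definitions below are only an assumed sketch of their shape (character
# n-grams over a lightly normalized string), not the original code.
import re

PRINT_VERBOSE = False  # assumed module-level flag


def ngrams(string, n=3):
    string = re.sub(r'[^a-z0-9 ]', '', string.lower())
    return [string[i:i + n] for i in range(max(len(string) - n + 1, 0))]


def ngrams_finetooth(string, n=2):
    string = re.sub(r'[^a-z0-9 ]', '', string.lower())
    return [string[i:i + n] for i in range(max(len(string) - n + 1, 0))]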
Esempio n. 56
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['tweet'], trainDF['class'], test_size=0.2)

# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['tweet'])

# transform the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word',
                             token_pattern=r'\w{1,}',
                             max_features=20000)
tfidf_vect.fit(trainDF['tweet'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

# ngram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word',
                                   token_pattern=r'\w{1,}',
                                   ngram_range=(2, 3),
                                   max_features=20000)
tfidf_vect_ngram.fit(trainDF['tweet'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',
                                         token_pattern=r'\w{1,}',  # ignored when analyzer='char'
                                         ngram_range=(2, 3),
                                         max_features=20000)
tfidf_vect_ngram_chars.fit(trainDF['tweet'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)
Esempio n. 57
import pandas
from sklearn import linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def get_accuracy2():
    labels, texts = [], []
    #reading good and Bad dataset file
    with open("Dataset//BadWords.txt") as fp:
        data = fp.readlines()
        for abc in data:
            labels.append("0")
            texts.append(abc)
    with open("Dataset//Goodwords.txt") as fp:
        data = fp.readlines()
        for abc in data:
            labels.append("1")
            texts.append(abc)
    trainDF = pandas.DataFrame()
    trainDF['text'] = texts
    trainDF['label'] = labels

    # split the dataset into training and validation datasets
    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
        trainDF['text'], trainDF['label'])

    # label encode the target variable
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(valid_y)  # reuse the mapping fitted on the training labels

    # create a count vectorizer object
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    count_vect.fit(trainDF['text'])

    # transform the training and validation data using count vectorizer object
    xtrain_count = count_vect.transform(train_x)
    xvalid_count = count_vect.transform(valid_x)

    # word level tf-idf
    tfidf_vect = TfidfVectorizer(analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 max_features=5000)
    tfidf_vect.fit(trainDF['text'])
    xtrain_tfidf = tfidf_vect.transform(train_x)
    xvalid_tfidf = tfidf_vect.transform(valid_x)

    # ngram level tf-idf
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word',
                                       token_pattern=r'\w{1,}',
                                       ngram_range=(2, 3),
                                       max_features=5000)
    tfidf_vect_ngram.fit(trainDF['text'])
    xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
    xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

    # characters level tf-idf
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',
                                             token_pattern=r'\w{1,}',
                                             ngram_range=(2, 3),
                                             max_features=5000)
    tfidf_vect_ngram_chars.fit(trainDF['text'])
    xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
    xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)

    # Logistic Regression on Count Vectors
    accuracy = train_model(valid_y, linear_model.LogisticRegression(),
                           xtrain_count, train_y, xvalid_count)

    stri = ""
    stri = stri + "LC, Count Vectors: " + str(accuracy) + " + "

    # Logistic Regression on Word Level TF IDF Vectors
    accuracy = train_model(valid_y, linear_model.LogisticRegression(),
                           xtrain_tfidf, train_y, xvalid_tfidf)
    stri = stri + "LC, WordLevel TF-IDF: " + str(accuracy) + " + "

    # Logistic Regression on Ngram Level TF IDF Vectors
    accuracy = train_model(valid_y, linear_model.LogisticRegression(),
                           xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
    stri = stri + "LC, N-Gram Vectors: " + str(accuracy) + " + "

    # Logistic Regression on Character Level TF IDF Vectors
    accuracy = train_model(valid_y, linear_model.LogisticRegression(),
                           xtrain_tfidf_ngram_chars, train_y,
                           xvalid_tfidf_ngram_chars)
    stri = stri + "LC, CharLevel Vectors: " + str(accuracy) + " + "
    return stri
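
# get_accuracy2 calls a train_model helper that is not shown in this snippet.
# Hedged sketch consistent with the call sites above (assumed to return a
# plain accuracy score), not the original implementation:
from sklearn.metrics import accuracy_score


def train_model(valid_y, classifier, feature_vector_train, label, feature_vector_valid):
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_valid)
    return accuracy_score(valid_y, predictions)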
Esempio n. 58
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X_train)
# Transform documents to document-term matrix.
X_train_count = count_vect.transform(X_train)
X_test_count = count_vect.transform(X_test)

# preprocessing
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(y_train)
Y = multilabel_binarizer.transform(y_train)

tfidf_vect = TfidfVectorizer(analyzer='word', max_features=90000)
tfidf_vect.fit(X_train) # learn vocabulary and idf from training set
X_data_tfidf = tfidf_vect.transform(X_train)
# assume there is no separate test set available beforehand
X_test_tfidf = tfidf_vect.transform(X_test)

y_train

"""**Training a classifier**"""

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

def train_model(classifier, X_train, y_train, X_test):
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test)
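
# Hedged usage sketch (an assumption, not shown in the original snippet):
# wrap a linear model in OneVsRestClassifier so it can handle the binarized
# multi-label targets produced above, then reuse train_model.
from sklearn.multiclass import OneVsRestClassifier

ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000))
Y_pred = train_model(ovr, X_data_tfidf, Y, X_test_tfidf)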
Esempio n. 59
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from operator import itemgetter
corpus = ['This is the first document kaa',
      'This is the second second document.',
      'And the third one.',
      'Is this the first?']

vectorizer = TfidfVectorizer(max_df=0.5,max_features=40)
# print vectorizer
vectorizer.fit(corpus)
vocab = vectorizer.vocabulary_
X_vocab = sorted(vocab.items(),key=itemgetter(1))
# print vocab
# print X_vocab
baseline_vectorizer = CountVectorizer(vocabulary=vocab)
X_base = baseline_vectorizer.fit_transform(corpus).toarray()
# print baseline_vectorizer
print(X_base)
train = []
for i in range(X_base.shape[0]):
    user_rating = X_base[i].nonzero()[0]
    print(user_rating)
    train.append((i, user_rating))

print(train)

# for j in range(4):
# 	item_rating = X_base.tocsc().T[j].nonzero()[1]
# 	print(item_rating)#[2] [0 1] [0 3] [0]
# 	train.append((item_rating[0], j))
#
Esempio n. 60
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

x_train = df['sentence']
y_train = df['label']
# x_train, x_test, y_train, y_test = train_test_split(df['sentence'], df['label'], test_size=0.01, random_state=42)

# Feature extraction
tfidf_vectorizer1 = TfidfVectorizer(analyzer='word',
                                    stop_words=None,
                                    ngram_range=(1, 3),
                                    min_df=1,
                                    max_features=100000)
tfidf_vectorizer2 = TfidfVectorizer(analyzer='char',
                                    stop_words=None,
                                    ngram_range=(1, 4),
                                    max_features=50000)

tfidf_vectorizer1.fit(df['sentence'].values)
tfidf_vectorizer2.fit(df['sentence'].values)
vec1 = tfidf_vectorizer1.transform(x_train)
vec2 = tfidf_vectorizer2.transform(x_train)
x_train = hstack([vec1, vec2])

# Construct the model and train it
svm = LinearSVC(verbose=True)
print("Start training...")
svm.fit(x_train, y_train)
# y_pred_test = svm.predict(x_test)

# Prepare prediction data
print(df_test['sentence'].shape)
vec1 = tfidf_vectorizer1.transform(df_test['sentence'])
vec2 = tfidf_vectorizer2.transform(df_test['sentence'])