def n_fold_cross_valid_label(texts, clf, K):
    # like n_fold_cross_valid, but print the words of the misclassified samples
    list_ftr = ftr_sgforum_svc(texts)
    X = np.array(list_ftr[0])
    y = np.array(list_ftr[1])
    word = np.array(list_ftr[2])

    cv = KFold(len(X), K, shuffle=True, random_state=0)
    print cv.n_folds
    for traincv, testcv in cv:

        X_train, X_test = X[traincv], X[testcv]
        y_train, y_test = y[traincv], y[testcv]
        word_train, word_test = word[traincv], word[testcv]

        MIN_DF = 2
        vec = CountVectorizer(token_pattern='[A-Za-z0-9]+', lowercase=True, min_df=MIN_DF)  # '+' so empty strings are not tokenized
        vec = vec.fit(X)
        X_train_trans = vec.transform(X_train)
        X_test_trans = vec.transform(X_test)

        clf.fit(X_train_trans, y_train)  # training model
        y_test_pred = clf.predict(X_test_trans)

        for i in range(0, len(y_test_pred)):
            if y_test[i] != y_test_pred[i]:
                print word_test[i]
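Note that the vectorizer above is fitted on the full array X, so test-fold vocabulary leaks into training. A minimal leakage-free sketch of the same loop (a hypothetical helper, assuming plain lists of texts and labels and any sklearn classifier) fits the vocabulary on the training fold only:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

def print_misclassified(texts, labels, clf=None, K=5, min_df=2):
    # Hypothetical helper: K-fold loop that fits the vocabulary on the training fold only.
    X, y = np.array(texts), np.array(labels)
    clf = clf if clf is not None else LogisticRegression()
    for train_idx, test_idx in KFold(n_splits=K, shuffle=True, random_state=0).split(X):
        vec = CountVectorizer(lowercase=True, min_df=min_df).fit(X[train_idx])
        clf.fit(vec.transform(X[train_idx]), y[train_idx])
        pred = clf.predict(vec.transform(X[test_idx]))
        for text, gold, guess in zip(X[test_idx], y[test_idx], pred):
            if gold != guess:
                print(text, gold, guess)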
Example #2
def createCorpus(data, i, binaryX="False", stopWords=None, lemmatize="False", tfidf="False", useidf="True"):  # vectorize the bag-of-words (count or tf-idf) and return train/test matrices and labels
    X_train =[]
    X_test=[]
    Y_train=[]
    Y_test=[]

    for key in data:
        if key in i:

            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port =  WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k,"v") for k in text.split()])
                X_test.append(text)
                Y_test.append(data[key][filename][1])
        else:
            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port =  WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k,"v") for k in text.split()])
                X_train.append(text)
                Y_train.append(data[key][filename][1])
    if tfidf == "False":
        # binaryX arrives as a string, so convert it explicitly (any non-empty string is truthy)
        vectorizer = CountVectorizer(min_df=1, binary=(binaryX == "True"), stop_words=stopWords)
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)
        return X_train_ans, Y_train, X_test_ans,Y_test
    elif tfidf == "True":
        vectorizer = TfidfVectorizer(min_df=1, use_idf=(useidf == "True"))
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)

        return X_train_ans, Y_train, X_test_ans,Y_test
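A hypothetical call of createCorpus, assuming data maps each document-set key to {filename: (text, label)} pairs as the loops above imply, and that i lists the key(s) held out for testing:

data = {
    'set1': {'d1.txt': ('the cat sat on the mat', 'pos'),
             'd2.txt': ('dogs are great pets', 'pos')},
    'set2': {'d3.txt': ('this film was terrible', 'neg'),
             'd4.txt': ('a complete waste of time', 'neg')},
}
X_train, Y_train, X_test, Y_test = createCorpus(data, ['set2'], tfidf="True")
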
def trainModel(test_data):
    predictions = dict()
    outcome_list=('DE', 'LT', 'HO', 'DS', 'CA', 'RI', 'OT')
    for o in outcome_list:
        info,outcome=loadData('Outcomes' + '/' + o +'.txt')
        #split data into training dataset      
        train, test, labels_train, labels_test = train_test_split(info, outcome, test_size=0.33)
        counter = CountVectorizer()
        counter.fit(train)
        
        #count the number of times each term appears in a document and transform each doc into a count vector
        counts_train = counter.transform(train)#transform the training data
        counts_test = counter.transform(test_data)#transform the new data

        #build a classifier on the training data        
        LR = LogisticRegression()     
        LR.fit(counts_train,labels_train)        
        #use the classifier to predict on new data
        predicted=LR.predict(counts_test)
        
        #determine prediction results
        if 1 in predicted:
            flag = 'yes'
        else:
            flag = 'no'
        predictions[o] = flag #store result of each outcome
    return predictions
Example #4
class BagOfWordView(View):
    """
    View that processes words (stemming, lowercasing) and counts each word's frequency
    """
    def __init__(self, *args, **kwargs):
        self.count_vec1 = None
        self.count_vec2 = None

        self.tfidf_vec1 = None
        self.tfidf_vec2 = None

        super(BagOfWordView, self).__init__(*args, **kwargs)

    def fit(self, v1, v2, use_idf=False):
        """
        v1, v2: each an iterable of str/unicode documents, as required by CountVectorizer.fit
        """
        ## TODO: add `use_tf` option
        self.count_vec1 = CountVectorizer().fit(v1)
        self.count_vec2 = CountVectorizer().fit(v2)

        self.tfidf_vec1 = TfidfTransformer(use_idf=use_idf).fit(
            self.count_vec1.transform(v1))

        self.tfidf_vec2 = TfidfTransformer(use_idf=use_idf).fit(
            self.count_vec2.transform(v2))

        return self

    def transform(self, v1, v2):
        return self.tfidf_vec1.transform(self.count_vec1.transform(v1)), \
            self.tfidf_vec2.transform(self.count_vec2.transform(v2))
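A hypothetical usage sketch for BagOfWordView, assuming the parent View class needs no required constructor arguments; v1 and v2 are two parallel lists of documents (for example, titles and bodies):

titles = ["cheap flights to paris", "best pizza in town", "paris travel tips"]
bodies = ["book your flight to paris today", "try our new pizza menu", "what to see in paris"]

view = BagOfWordView().fit(titles, bodies, use_idf=True)
X_titles, X_bodies = view.transform(titles, bodies)   # two sparse tf-idf matrices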
Example #5
def naive_bayes(x_value, y_value):
    X = x_value
    y = y_value

    #train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)

    vect = CountVectorizer()
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)

    X_test_dtm = vect.transform(X_test)

    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    
    print 'Accuracy: '
    print metrics.accuracy_score(y_test, y_pred_class)
    
    print 'Null Accuracy: '
    print y_test.value_counts().head(1) / len(y_test)
    
    print 'Confusion Matrix: '
    print metrics.confusion_matrix(y_test, y_pred_class)
Example #6
class Data:
    def __init__(self, min_df):
        self.sv = CountVectorizer()
        self.rv = CountVectorizer()
        self.dv = CountVectorizer(max_df=1.0, min_df=min_df)

    def load(self, pattern, train=False):
        docs = list()
        receivers = list()
        senders = list()
        for path in glob.glob("data/" + pattern):
            with open(path + "/parties.txt") as f:
                senders.append(f.readline())
                receivers.append(f.readline())
            with open(path + "/words.csv") as f:
                f.readline()
                docs.append(" ".join([line.rstrip().split(",")[5] for line in f]))

        if train:
            self.dv.fit(docs)
            self.sv.fit(senders)
            self.rv.fit(receivers)

        D = self.dv.transform(docs)
        S = self.sv.transform(senders)
        R = self.rv.transform(receivers)

        return D, S, R
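A hypothetical usage of the Data loader, assuming the same data/<path>/parties.txt and data/<path>/words.csv layout read above, with training and test runs kept under different sub-directories:

data = Data(min_df=3)
D_train, S_train, R_train = data.load("train/*", train=True)   # fits the three vocabularies
D_test, S_test, R_test = data.load("test/*")                   # reuses the fitted vocabularies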
def preprocesar(labeled, unlabeled, dims, stop_words=None):
    """preprocesar."""

    instances = []
    labels = []
    for v_l in labeled.values():
        instances += v_l['X']
        labels += v_l['y']

    if unlabeled is not None:
        for v_ul in unlabeled.values():
            instances += v_ul['X']

    x_cv = CountVectorizer(max_features=dims, ngram_range=(1, 2), binary=True, stop_words=stop_words)
    x_cv.fit(instances)

    y_cv = CountVectorizer()
    y_cv.fit(labels)

    print "\nEtiquetas:"

    for etiqueta, valor in y_cv.vocabulary_.items():
        print "\tEtiqueta: %s - Valor: %d" % (etiqueta, valor)
    print ""

    for d_l in labeled:
        labeled[d_l]['X'] = x_cv.transform(labeled[d_l]['X'])
        labeled[d_l]['y'] = y_cv.transform(labeled[d_l]['y'])

    if unlabeled is not None:
        for d_ul in unlabeled:
            unlabeled[d_ul]['X'] = x_cv.transform(unlabeled[d_ul]['X'])

    return labeled, unlabeled
Example #8
def featTransform(sents_train, sents_test):
    cv = CountVectorizer()
    cv.fit(sents_train)
    print(cv.get_params())
    features_train = cv.transform(sents_train)
    features_test = cv.transform(sents_test)
    return features_train, features_test, cv
def cal_product_title_tfidf():

    #PART I compute the tf-idf for product title
    print "\nBegins,compute the tf-idf for product title ..."


    print "\nStemming product_title..."
    AllSet['product_title'] = AllSet['product_title'].map(lambda x : stem_process(x))
    product_title = AllSet['product_title']

    print "\nGet the (product title vocabulary)-(search term) frequency matrix..."
    search_vect_tittle = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency
    search_vect_tittle.fit(product_title)#learn the vocabulary
    search_tittle_fq_matrix = search_vect_tittle.transform(search_term) #get the (product title vocabulary)-(search term) frequency matrix

    print "\nGet the (product title vocabulary)-(product_title) frequency matrix"
    title_vect = CountVectorizer(stop_words='english')
    title_vect.fit_transform(product_title)#learn the vocabulary
    title_fq_matrix = title_vect.transform(product_title) #get the (product title vocabulary)-(product_title) frequency matrix

    print "\nGet the idf matrix"
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(title_fq_matrix) # get idf for each vocabulary
    tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix) #get the idf matrix

    print "\nCompute the result of tf-idf for product title ..."
    tf_idf_title_result = [] #compute the result of tf-idf for product title
    for index in range(tf_idf_title_matrix.shape[0]):
        tf_idf_title_result.append((np.multiply(tf_idf_title_matrix[index], search_tittle_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'],"product_title_tfidf": tf_idf_title_result}).to_csv('product_title_tfidf.csv', index=False)

    return 0
def prep_train_evaluate(docs_train, docs_test, labs_train, labs_test, **kwargs):
  '''func to prep text, extract features, train model, predict, evaluate'''

  # instantiate vectorizer + classifier 
  vectorizer = CountVectorizer(token_pattern=r'\b[a-zA-Z0-9_<>]{1,}\b', 
                               **kwargs)
  classifier = LogisticRegression(solver='liblinear')

  # construct feature matrices for train and test sets 
  vectorizer.fit(docs_train)
  X_train = vectorizer.transform(docs_train)
  X_test = vectorizer.transform(docs_test)

  # fit/train classifier using train features and labels 
  classifier.fit(X_train, labs_train)

  # generate test set model predictions from test matrix 
  preds_test = classifier.predict(X_test)

  # measure performance using simple accuracy (proportion correct) 
  accuracy = accuracy_score(labs_test, preds_test)

  # print lil message showing param settings + performance 
  print(f'  >> test set accuracy: {accuracy:.3f}\n({kwargs})\n')

  # return classifier, vectorizer, predictions, and score for inspection 
  return {'clf': classifier, 'vect': vectorizer, 
          'preds': preds_test, 'acc': accuracy}
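Because the vectorizer settings arrive through **kwargs, prep_train_evaluate can drive a small parameter sweep; a hypothetical example, assuming docs_train/docs_test/labs_train/labs_test have already been split:

results = {}
for ngrams in [(1, 1), (1, 2)]:
    for min_df in [1, 2, 5]:
        out = prep_train_evaluate(docs_train, docs_test, labs_train, labs_test,
                                  ngram_range=ngrams, min_df=min_df)
        results[(ngrams, min_df)] = out['acc']

best = max(results, key=results.get)
print(f'best settings: ngram_range={best[0]}, min_df={best[1]}, accuracy={results[best]:.3f}')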
def cal_product_description_tfidf():
    #PART II compute the tf-idf for product description
    global AllSet  # AllSet is reassigned below, so it must be declared global (cal_product_title_tfidf only reads it)
    print "\nBegin computing the tf-idf for product description ..."
    product_description_data = pd.read_csv('product_descriptions.csv')

    print "\nMerge the product description into the dataset..."
    AllSet = pd.merge(AllSet, product_description_data, how='left', on='product_uid')

    print "\nStemming the product description ..."
    AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x))
    product_description=AllSet['product_description']

    print "\nGet the (product description vocabulary)-(search term) frequency matrix..."
    search_vect_descrip = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency
    search_vect_descrip.fit(product_description)#learn the vocabulary
    search_descrip_fq_matrix = search_vect_descrip.transform(search_term) #get the (product description vocabulary)-(search term) frequency matrix

    print "\nGet the (product description vocabulary)-(product_description) frequency matrix..."
    description_vect = CountVectorizer(stop_words ='english')
    description_vect.fit_transform(product_description)#learn the vocabulary
    description_fq_matrix=description_vect.transform(product_description) #get the (product description vocabulary)-(product_description) frequency matrix

    print "\nGet the idf matrix..."
    tfidf_transformer = TfidfTransformer(norm="l2",smooth_idf=True)
    tfidf_transformer.fit(description_fq_matrix) # get idf for each vocabulary
    tf_idf_descrip_matrix  = tfidf_transformer.transform(description_fq_matrix) #get the idf matrix


    print "\nCompute the result of tf-idf for product description ..."
    tf_idf_descrip_result=[]#compute the result of tf-idf for product description
    for index in range(tf_idf_descrip_matrix.shape[0]):
        tf_idf_descrip_result.append((np.multiply(tf_idf_descrip_matrix[index], search_descrip_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id":AllSet['id'],"product_description_tfidf": tf_idf_descrip_result}).to_csv('product_description_tfidf.csv', index=False)
def getTfidfData(dataTrain, dataTest, dataHold):
    print dataTrain.target_names
    
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print X_tfidf.shape
    
    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print Y_tfidf.shape
    
    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)
    
    print 'feature selection using chi square test', len(dataTrain.target)
    feature_names = count_vect.get_feature_names()
    
    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
        
    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print 'important features'
        print feature_names[:10]
    return X_tfidf, Y_tfidf, H_tfidf
def clf_event_running_wordVec(path, event, name_clf, X, X_vec, Y, clf, K, command, call):
    if command == "StratifiedKFold":
        cv = StratifiedKFold(Y, K)
    else:
        print "Need a correct command"
        quit()

    X_vec_norm = preprocessing.normalize(X_vec, norm="l2")
    for traincv, testcv in cv:
        X_train, X_test = X[traincv], X[testcv]
        X_vec_train, X_vec_test = X_vec_norm[traincv], X_vec_norm[testcv]
        y_train, y_test = Y[traincv], Y[testcv]

        MIN_DF = 2
        vec = CountVectorizer(lowercase=True, min_df=MIN_DF)
        vec = vec.fit(X_train)

        X_train_trans, X_test_trans = vec.transform(X_train), vec.transform(X_test)
        X_train_trans_all, X_test_trans_all = hstack([X_train_trans, X_vec_train]), hstack([X_test_trans, X_vec_test])

        # print X_vec_train.shape, X_vec_test.shape
        # print X_train_trans.shape, X_test_trans.shape
        # print X_train_trans_all.shape, X_test_trans_all.shape

        clf.fit(X_train_trans_all, y_train)  # training model
        y_test_pred = clf.predict(X_test_trans_all)

        matrix = confusion_matrix(y_test, y_test_pred)  # confusion_matrix expects (y_true, y_pred)
        for value in matrix:
            line = ""
            for each in value:
                line = line + str(each) + "\t"
            print line.strip()
        print "----------------"
def event_pred_model(event, X, Y, command):  # note that X is a list and Y is a array
    texts = load_demo_text(command)
    total_X = X + texts
    X_convert, X_pred, total_X_convert = np.array(X), np.array(texts), np.array(total_X)

    MIN_DF = 2
    vec = CountVectorizer(lowercase=True, min_df=MIN_DF)
    vec = vec.fit(total_X_convert)

    X_convert_trans, X_pred_trans = vec.transform(X_convert), vec.transform(X_pred)

    clf.fit(X_convert_trans, Y)  # training model
    y_pred = clf.predict(X_pred_trans)
    y_prob = clf.decision_function(X_pred_trans)

    max_prob, min_prob = max(y_prob), min(y_prob)
    list_write = list()
    for i in range(0, len(y_pred)):
        prob = (y_prob[i] - min_prob) / (max_prob - min_prob)
        print y_pred[i], prob, texts[i]

        # list_write.append(str(y_pred[i]) + '\t' + texts[i])
        list_write.append(str(y_pred[i]))

    if command == 'twitter':
        path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/twitter/events_pred'
        write_file(path_write, event, list_write)
def main():
    '''
    Open text file storing examples of greetings to train the SVM classifier
    Use feature extraction to create bag of words represented as vectors
    '''
    
    vectorizer = CountVectorizer()
    
    #read data and store in variables
    train_data = [line.rstrip('\n') for line in open('text/dataset1.txt')]
 
    with open ("text/train_set.txt", "r") as myfile:
        test_data = myfile.read()
   
    target_names = ['pos', 'neg']
    Y_train = buildLabels() 


    counts = vectorizer.fit_transform(train_data)
    print len(counts.data)
    print train_data
 
    vectorizer = TfidfVectorizer()
    train_vectors = vectorizer.fit_transform(train_data)  # fit the tf-idf vocabulary on the training data
    test_vectors = vectorizer.transform(test_data)

    '''
    initialize and use the SVM classifier
    '''
    classifier_rbf = svm.SVC()
    classifier_rbf.fit(train_vectors, Y_train)
    prediction_rbf = classifier_rbf.predict(test_vectors)

    # print each test document with its predicted label
    for doc, label in zip(test_data, prediction_rbf):
        print '%s => %s' % (doc, label)
def load_data(do_plots=False):
    traindf = pd.read_csv('labeledTrainData.tsv.gz', compression='gzip', delimiter='\t', header=0, quoting=3)
    testdf = pd.read_csv('testData.tsv.gz', compression='gzip', delimiter='\t', header=0, quoting=3)
    #unlabeled_traindf = pd.read_csv('unlabeledTrainData.tsv.gz', compression='gzip', delimiter='\t', header=0, quoting=3)
    word_count_df = pd.read_csv('word_count.csv.gz', compression='gzip')

    cond0 = word_count_df['count'] > 500
    cond1 = word_count_df['frac'] > 0.1
    biased_word_list = {w: n for n, w in enumerate(list(word_count_df[cond0 & cond1]['word']))}

    traincleanreview = traindf['review'].apply(clean_review)
    testcleanreview = testdf['review'].apply(clean_review)
    vectorizer = CountVectorizer(analyzer='word', vocabulary=biased_word_list)
    trainwvector = vectorizer.transform(traincleanreview).toarray()
    testwvector = vectorizer.transform(testcleanreview).toarray()

    #traindf['wvector'] = traindf['review'].apply(clean_review)
    #testdf['wvector'] = testdf['review'].apply(clean_review)

    traindf = traindf.drop(labels=['review'], axis=1)
    testdf = testdf.drop(labels=['review'], axis=1)

    print traindf.shape, testdf.shape
    print traindf.columns
    print testdf.columns

    xtrain = trainwvector
    ytrain = traindf['sentiment'].values
    xtest = testwvector
    ytest = testdf['id'].values

    return xtrain, ytrain, xtest, ytest
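Passing a fixed vocabulary, as load_data does above, means the CountVectorizer never needs fitting; a minimal standalone sketch of that pattern with a hypothetical three-word vocabulary:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

vocab = {'great': 0, 'terrible': 1, 'boring': 2}   # hypothetical pre-selected word list
reviews = pd.Series(["great film great cast", "terrible and boring"])

vectorizer = CountVectorizer(analyzer='word', vocabulary=vocab)
wvector = vectorizer.transform(reviews).toarray()   # transform without fit: rows [2 0 0] and [0 1 1]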
Example #17
def classify(sumDes, ntags, t_sumDes, t_ntags, subject):

    xna = sumDes[ ntags == 'NA']
    xthing = sumDes[ntags == subject]
    xnthing = sumDes[ (ntags != 'NA') & (ntags != subject)]

    t_xna = t_sumDes [ t_ntags == 'NA' ]
    t_xthing = t_sumDes[ t_ntags == subject]
    t_xnthing = t_sumDes[ (t_ntags != 'NA') & (t_ntags != subject)]

    X = np.hstack((xthing, xnthing, t_xthing, t_xnthing))
    y = np.hstack((np.ones(xthing.shape[0]), np.zeros(xnthing.shape[0]), np.ones(t_xthing.shape[0]), np.zeros(t_xnthing.shape[0])))

    vectorizer = CountVectorizer(max_features=100, stop_words="english", strip_accents="ascii")
    X = vectorizer.fit_transform(X).toarray()
    xna = vectorizer.transform(xna).toarray()
    t_xna = vectorizer.transform(t_xna).toarray()

    from sklearn.ensemble import ExtraTreesClassifier
    pred = (ExtraTreesClassifier().fit(X,y).predict_proba(xna)[:,1] > 0.95).astype(str)
    t_pred = (ExtraTreesClassifier().fit(X,y).predict_proba(t_xna)[:,1] > 0.95).astype(str)
    
    print "Transformed ---> ", pred[pred == "True"].shape[0]
    pred[ pred == 'True'] = subject
    t_pred[ t_pred == 'True'] = subject
    pred[ pred == 'False' ] = 'NA'
    t_pred[ t_pred == 'False' ] = 'NA'

    ntags[ ntags == 'NA'] = pred
    t_ntags[ t_ntags == 'NA'] = t_pred
    
    return ntags, t_ntags
Example #18
    def generatePredictingModel(data):
        """
            Build the prediction model (based on the data set we have) in order to be able to predict the category
            of a new video from the user input
            Return a classifier able to predict the category of a video based on its title and description.
        """
        try:
            # Initialize a timer to compute the time to build the model
            start = time.time()

            # Split into train-test data set
            X = data[[x for x in data.columns if x in ('title', 'description')]]
            Y = data[[x for x in data.columns if x == 'video_category_id']]  # exact match, not substring matching
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.80, random_state = 10)

            # Build the 2 text corpus
            corpus_title = X_train['title'].values.tolist()
            corpus_description = X_train['description'].values.tolist()

            # initializes the 2 vectorizers.
            count_vectorizer_title = CountVectorizer()
            count_vectorizer_description = CountVectorizer()

            # learn the 2 vocabulary dictionary
            count_vectorizer_title.fit(corpus_title)
            count_vectorizer_description.fit(corpus_description)

            # Build the sparse matrices
            X_train_count_title = count_vectorizer_title.transform(X_train['title'])
            X_train_count_description = count_vectorizer_description.transform(X_train['description'])
            X_test_count_title = count_vectorizer_title.transform(X_test['title'])
            X_test_count_description = count_vectorizer_description.transform(X_test['description'])

            # Set and train the models (for title and description features)
            model_count_title = BernoulliNB()
            model_count_description = BernoulliNB()
            model_count_title.fit(X_train_count_title, Y_train['video_category_id'])
            model_count_description.fit(X_train_count_description, Y_train['video_category_id'])

            # Merge the title and description predictions and build a new prediction based on these 2 predictions combined
            new_df_train = pd.DataFrame()
            new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title)
            new_df_train['description_prediction'] = model_count_description.predict(X_train_count_description)
            new_df_test = pd.DataFrame()
            new_df_test['title_prediction'] = model_count_title.predict(X_test_count_title)
            new_df_test['description_prediction'] = model_count_description.predict(X_test_count_description)
            tree = DecisionTreeClassifier()
            tree.fit(new_df_train, Y_train)

            end = time.time()
            execution_time = end - start

            print "Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(execution_time)
            time.sleep(3)

            return tree, model_count_title, model_count_description,count_vectorizer_title,count_vectorizer_description

        except:
            raise VideoAnalysisException(" Error while creation of predictive model ")
class NaiveBayesClassifier(object):
    '''
    Multinomial Naive Bayes text classifier using tf-idf features and chi-squared feature selection.
    '''
        
    def __init__(self):
        
        self.classifier = MultinomialNB()
        #self.model = None
        
    def trainClassifier(self, trainingDocs, labels):
        self.trainingDocs = trainingDocs
        self.labels = labels
        
        self.count_vect = CountVectorizer(stop_words='english')
        X_train_counts = self.count_vect.fit_transform(self.trainingDocs)
        self.tf_transformer = TfidfTransformer(use_idf=True,sublinear_tf=True).fit(X_train_counts)
        X_train_tf = self.tf_transformer.transform(X_train_counts)
        
        self.ch2 = SelectKBest(chi2)
        X_train = self.ch2.fit_transform(X_train_tf, self.labels)
        
        #self.classifier.fit(X_train_tf, self.labels)
        self.classifier.fit(X_train, self.labels)
        
    def classify(self, docs_new):
        X_new_counts = self.count_vect.transform(docs_new)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        #predicted = self.model.predict(X_new_tfidf)
        #self.predicted = self.classifier.predict(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        #for doc, category in zip(docs_new, self.predicted):
        #    print '%r => %s' % (doc,category)
        return self.predicted
    
    def calculate_score(self, doc_new):
        doc_list = [doc_new]
        #doc_list.append(doc_new)
        X_new_counts = self.count_vect.transform(doc_list)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        return self.predicted
        #predicted_prob_all = self.classifier.predict_proba(X_test)
        #predicted_prob = [max(pr) for pr in predicted_prob_all]
        #return predicted_prob
    
    def score(self,docs_test,labels):
        X_new_counts = self.count_vect.transform(docs_test)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        
        X_test = self.ch2.transform(X_new_tfidf)
        #self.predicted = self.classifier.predict(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        accuracy = np.mean(self.predicted == labels)
        #accuracy = self.classifier.score(X_new_tfidf, labels)
        return accuracy
Example #20
def train_test(args):
    
    # unpack arguments and make train/test data/label dicts/lists
    train, test, features, classifier = args

    # create a tf-idf sparse matrix from training data
    if features == 'tfidf':
        fe = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_features=1290)
        trainfe = fe.fit_transform(train['data'])
    elif features == 'dict':
        fe = CountVectorizer(tokenizer=tokenize, stop_words='english', binary=True)
        trainfe = fe.fit_transform(train['data'])
    elif features == 'lsa':
        svd = TruncatedSVD(n_components=100, random_state=42)
        fe = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_df=0.115, max_features=11500)
        trainfe = svd.fit_transform(fe.fit_transform(train['data']))
    elif features == 'rule':
        hamfe = CountVectorizer(tokenizer=tokenize, stop_words='english', max_features=1150)
        spamfe = CountVectorizer(tokenizer=tokenize, stop_words='english', max_features=1150)
        hamfit = hamfe.fit_transform(train['data'].loc[train['labels'] == 0])
        spamfit = spamfe.fit_transform(train['data'].loc[train['labels'] == 1])

    # train the selected classifier on the training features
    if classifier == 'mnb':
        from sklearn.naive_bayes import MultinomialNB
        clf = MultinomialNB().fit(trainfe, train['labels'])
    elif classifier == 'gnb':
        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB().fit(trainfe.toarray(), train['labels'])
    elif classifier == 'svm':
        from sklearn.linear_model import SGDClassifier
        clf = SGDClassifier(loss='squared_hinge', penalty='l2').fit(trainfe, train['labels'])
    elif classifier == 'log':
        from sklearn.linear_model import SGDClassifier
        clf = SGDClassifier(loss='log', penalty='l2').fit(trainfe, train['labels'])
    elif classifier == 'rule':
        hamfeats = hamfe.transform(test['data'])
        spamfeats = spamfe.transform(test['data'])
        hyp = np.array(hamfeats.sum(axis=1) < spamfeats.sum(axis=1)).reshape(-1).T
        
    # extract features from test data (the 'rule' classifier already computed hyp above)
    if features == 'lsa':
        feats = svd.transform(fe.transform(test['data']))
    elif features != 'rule':
        feats = fe.transform(test['data'])
    # use trained classifier to generate class predictions from test features
    if classifier == 'gnb':
        hyp = clf.predict(feats.toarray())
    elif classifier == 'rule':
        pass
    else:
        hyp = clf.predict(feats)

    # compare predictions with test labels
    score = np.mean(hyp == test['labels'])

    return score
Example #21
def tfidf_normalize(articles_with_id):
    global NON_STOPWORD_LIMIT
    stemmed_articles_with_id = [(aid, stem_article(article)) for (aid, article) in articles_with_id]
    stemmed_articles = [article for (aid, article) in stemmed_articles_with_id]
    # test_set = train_set
    # instantiate vectorizer with English language, using stopwords and set min_df, max_df parameters and the tokenizer
    vectorizer = CountVectorizer(stop_words="english", min_df=3, max_df=0.1, token_pattern=r"\b[a-zA-Z][a-zA-Z]+\b")
    # by applying the vectorizer instance to the train set
    # it will create a vocabulary from all the words that appear in at least min_df and in no more than max_df
    # documents in the train_set
    vectorizer.fit_transform(stemmed_articles)
    # vectorizer transform will apply the vocabulary from the train set to the test set. In my case,
    # they are the same set: whole Wikipedia.
    # this means that each article will get representation based on the words from the vocabulary and
    # their TF-IDF values in the Scipy sparse output matrix
    freq_term_matrix = vectorizer.transform(stemmed_articles)
    long_articles_with_id = []
    assert freq_term_matrix.shape[0] == len(articles_with_id)
    for (i, article_with_id) in zip(xrange(freq_term_matrix.shape[0]), stemmed_articles_with_id):
        row = freq_term_matrix.getrow(i)
        if row.getnnz() >= NON_STOPWORD_LIMIT:
            long_articles_with_id.append(article_with_id)

    long_articles = [article for (aid, article) in long_articles_with_id]

    vectorizer = CountVectorizer(stop_words="english", min_df=3, max_df=0.1, token_pattern=r"\b[a-zA-Z][a-zA-Z]+\b")
    vectorizer.fit_transform(long_articles)

    freq_term_matrix = vectorizer.transform(long_articles)

    # Gabrilovich says that they threshold TF on 3 (remove word-article association if that word
    # does not appear at least 3 times in that single article
    # freq_term_matrix.data *= freq_term_matrix.data>=3
    # freq_term_matrix.eliminate_zeros() # I think this is not necessary...
    # this is a log transformation as applied in (Gabrilovich, 2009), i.e., that is
    # how he defines TF values. In case of TF = 0, this shall not affect such value
    # freq_term_matrix.data = 1 + np.log( freq_term_matrix.data )
    # instantiate the tfidf transformer
    tfidf = TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=True)
    # tfidf uses the freq_term_matrix to calculate IDF values for each word (element of the vocabulary)
    tfidf.fit(freq_term_matrix)
    # finally, tfidf will calculate TFIDF values with transform()
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    # tf_idf_matrix.data = np.log(np.log(tf_idf_matrix.data))
    tf_idf_matrix = normalize(tf_idf_matrix, norm="l2", axis=0, copy=False)
    # now we put our matrix to CSC format (as it helps with accessing columns for inversing the vectors to
    # words' concept vectors)
    tf_idf_matrix = tf_idf_matrix.tocsc()
    # we need vocabulary_ to be accessible by the index of the word so we inverse the keys and values of the
    # dictionary and put them to new dictionary word_index
    word_index = dict((v, k) for k, v in vectorizer.vocabulary_.iteritems())
    M, N = tf_idf_matrix.shape
    print "Articles: ", M
    print "Words: ", N
    return tf_idf_matrix, word_index, long_articles_with_id
def build_cv_or_tfidf(in_neg_df, in_pos_df,
                      cv_or_tfidf='CV', nlp_params={},
                      info_thresh=None):
    # get data ready
    neg_df = in_neg_df.copy()
    pos_df = in_pos_df.copy()
    all_content = pd.concat([pos_df, neg_df])

    # fit CountVectorizer
    if cv_or_tfidf == 'CV':
        cv_or_tf = CountVectorizer(**nlp_params)
    else:
        cv_or_tf = TfidfVectorizer(**nlp_params)
    cv_or_tf.fit(all_content)

    # including this because filtering based on information threshold
    # becomes too computationally punishing with larger ngrams
    if 'ngram_range' in nlp_params and nlp_params['ngram_range'] != (1, 1):
        return cv_or_tf

    # transform positive and negative content
    # (get_feature_names() keeps the column labels aligned with the matrix columns)
    neg_cv = pd.DataFrame(
        cv_or_tf.transform(neg_df).todense(), columns=cv_or_tf.get_feature_names())
    pos_cv = pd.DataFrame(
        cv_or_tf.transform(pos_df).todense(), columns=cv_or_tf.get_feature_names())

    # count up words
    neg_counts = np.sum(neg_cv, axis=0)
    # to avoid division by 0 when calculating ratios (below)
    neg_counts = neg_counts + 1
    pos_counts = np.sum(pos_cv, axis=0)
    pos_counts = pos_counts + 1

    all_counts = pd.DataFrame(neg_counts, columns=['Negative'])
    all_counts['Positive'] = pos_counts

    # calculate ratios
    all_counts['Neg_Pos'] = (
        1.0 * all_counts['Negative']) / all_counts['Positive']
    all_counts['Pos_Neg'] = (
        1.0 * all_counts['Positive']) / all_counts['Negative']

    if info_thresh is not None:
        all_counts = all_counts[
            (all_counts['Neg_Pos'] >= info_thresh) |
            (all_counts['Pos_Neg'] >= info_thresh)]

    # remake CV/TF-IDF with pruned vocabulary (this could be its own function)
    nlp_params['vocabulary'] = all_counts.index
    if cv_or_tfidf == 'CV':
        cv_or_tf = CountVectorizer(**nlp_params)
    else:
        cv_or_tf = TfidfVectorizer(**nlp_params)

    return cv_or_tf
Example #23
def train_and_predict(X_train,X_test,y_train):	
	count_vect = CountVectorizer().fit(X_train)
	X_train_counts = count_vect.transform(X_train)
	X_test_counts = count_vect.transform(X_test)

	clf = MultinomialNB().fit(X_train_counts, y_train)
	y_test = clf.predict_proba(X_test_counts)
	y_final = y_test.argmax(1)
	y_final[numpy.where(y_test.max(1)<0.5)]=-1

	return y_final
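A hypothetical call of train_and_predict, assuming the imports the function relies on (numpy, CountVectorizer, MultinomialNB) are in place; any test document whose best class probability stays below 0.5 is returned as -1:

X_train = ["free money now", "meeting at noon", "win a free prize", "lunch tomorrow maybe"]
y_train = numpy.array([1, 0, 1, 0])
X_test = ["free prize inside", "see you at lunch"]

y_hat = train_and_predict(X_train, X_test, y_train)   # low-confidence predictions come back as -1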
Example #24
class Extractor():

	def __init__(self):
		self.vectorizer = CountVectorizer(tokenizer=filter_tokenise)
		
	def opt_cfg(self,p):
		p.add_option("-K","--select-k-best",metavar = "DIM",
			dest    = "dimension", type = "int", default = 20,
			action  = "store",
			help    = "K best attributes")
	
	def train(self, records):
		self.DIM = DIM = 5
		self.feature_selector = SelectKBest(f_regression,k = DIM)
		count = sum(1 for _ in records.reset())
		print "Fitting vectorizer..."	
		self.vectorizer.fit(' '.join(window[2]) for window,_ in records.reset())
		y_vec = np.array([float(d_t) for _,d_t in records.reset()], dtype=np.float64)
		fs = self.feature_selector
		print "Fitting feature selector..."
		fs.fit(
			memmapify(
				(self.vectorizer.transform([' '.join(window[2])]).toarray()[0]
					for window,_ in records.reset()),
				dtype = np.float64,
				length = count
			),
			y_vec
		)
	def save(self):
		save_model('extractor',self)
	
	def finalise(self):
		DIM = self.DIM
		top_weights = list(np.argsort(self.feature_selector.scores_)[-DIM:])
		fn = self.vectorizer.get_feature_names()
		tokens  = [fn[i] for i in top_weights]
		tokens.reverse()
		self.vocab = tokens
	def extract(self,window):
		x = self.vectorizer.transform([' '.join(window[2])])
		x = x.toarray()
		l = np.sum(x)
		x = self.feature_selector.transform(x)
		#x = x.toarray()
		x = np.append(x, [l])

		return x
class OneClassClassifier(object):
    '''
    classdocs
    '''


    def __init__(self):
        '''
        Constructor
        '''
        self.classifier = svm.OneClassSVM( kernel="rbf", gamma=0.0)#(nu=0.1, kernel="rbf", gamma=0.1)
        
    def trainClassifier(self, trainingDocs,labels):
        #self.trainingDocs = trainingDocs
        #self.labels = labels
        
        self.count_vect = CountVectorizer(stop_words='english')
        #X_train_counts = self.count_vect.fit_transform(self.trainingDocs)
        X_train_counts = self.count_vect.fit_transform(trainingDocs)
        self.tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
        #self.tf_transformer = TfidfTransformer().fit(X_train_counts)
        X_train_tf = self.tf_transformer.transform(X_train_counts)
        
        self.ch2 = SelectKBest(chi2,k=100)
        X_train = self.ch2.fit_transform(X_train_tf, labels)
        
        #self.classifier.fit(X_train_tf, self.labels)
        self.classifier.fit(X_train)
    
    def calculate_score(self, doc_new):
        doc_list = [doc_new]
        #doc_list.append(doc_new)
        X_new_counts = self.count_vect.transform(doc_list)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        # the classifier was trained on chi2-selected features, so apply the same selection here
        X_test = self.ch2.transform(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        return self.predicted
    
    def score(self,docs_test,labels):
        '''
        Here labels are 1 and -1
        '''
        X_new_counts = self.count_vect.transform(docs_test)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        
        X_test = self.ch2.transform(X_new_tfidf)
        #X_test = X_new_tfidf
        self.predicted = self.classifier.predict(X_test)
        print self.predicted
        accuracy = np.mean(self.predicted == labels)
        #accuracy = self.classifier.score(X_new_tfidf, labels)
        return accuracy
def build_tfidf(train_data, test_data):
    stops = stopwords.words('english')
    counter = CountVectorizer(tokenizer=StemTokenizer(),
                              stop_words=stops, min_df=3,
                              dtype=np.double)
    counter.fit(train_data)
    train_tf = counter.transform(train_data)
    test_tf = counter.transform(test_data)
    transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
    train_tfidf = transformer.fit_transform(train_tf)
    test_tfidf = transformer.transform(test_tf)
    return train_tfidf, test_tfidf
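The CountVectorizer + TfidfTransformer pair in build_tfidf can be collapsed into a single TfidfVectorizer; a minimal equivalent sketch, assuming the same StemTokenizer helper used above is available:

import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

def build_tfidf_single(train_data, test_data):
    # Same pipeline as build_tfidf above, expressed with one TfidfVectorizer.
    vec = TfidfVectorizer(tokenizer=StemTokenizer(),            # assumed helper from this module
                          stop_words=stopwords.words('english'),
                          min_df=3, dtype=np.double,
                          norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
    return vec.fit_transform(train_data), vec.transform(test_data)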
Example #27
def perform_emsamble_model():
    #get data from csv file
    x , y_votes, y_comments, y_views, lat = read_train_data()
    #transform to numpy arrays for easier handling
    y_votes = np.array(y_votes)
    y_comments = np.array(y_comments)
    y_views = np.array(y_views)
    #get test data
    x_test, ids, lat = read_test_data()
    #Change the parameters from the objects with the values from gridsearch
    vec_votes = CountVectorizer(stop_words=None, strip_accents='unicode',analyzer='word',ngram_range=(1, 2), min_df=2)
    vec_comments = CountVectorizer(stop_words=None, strip_accents='unicode',analyzer='word',ngram_range=(1, 2), min_df=2)
    vec_views = CountVectorizer(stop_words=None, strip_accents='unicode',analyzer='word',ngram_range=(1, 2), min_df=2)
    #transform x and x_test into count matrices to feed the classifiers
    x_votes = vec_votes.fit_transform(x)
    x_comments = vec_comments.fit_transform(x)
    x_views = vec_views.fit_transform(x)
    x_test_transformed_votes = vec_votes.transform(x_test)
    x_test_transformed_comments = vec_comments.transform(x_test)
    x_test_transformed_views = vec_views.transform(x_test)
    print "TFIDF Matrixes generated"
    print " LSA transforming"
    lsa_votes = TruncatedSVD(500)
    lsa_comments = TruncatedSVD(500)
    lsa_views = TruncatedSVD(500)
    x_votes = lsa_votes.fit_transform(x_votes)
    print "LSA Votes Done.."
    print
    x_comments = lsa_comments.fit_transform(x_comments)
    print "LSA Comments Done.."
    print
    x_views = lsa_views.fit_transform(x_views)
    print "LSA Views Done.."
    print
    x_test_transformed_votes = lsa_votes.transform(x_test_transformed_votes)
    x_test_transformed_comments = lsa_comments.transform(x_test_transformed_comments)
    x_test_transformed_views = lsa_views.transform(x_test_transformed_views)
    print "SLA Finished.."
    ada_votes = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_comments = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_views = AdaBoostClassifier(base_estimator=RandomForestClassifier())
    ada_votes.fit(x_votes, y_votes)
    ada_comments.fit(x_comments, y_comments)
    ada_views.fit(x_views, y_views)
    print "Fitting done"
    print
    #predict number of votes 
    pred_votes = ada_votes.predict(x_test_transformed_votes)
    pred_comments = ada_comments.predict(x_test_transformed_comments)
    pred_views = ada_views.predict(x_test_transformed_views)
    #generate submission response csv file
    create_csv_response(len(x_test), ids, pred_views, pred_votes, pred_comments)
Example #28
class TextClassifier():

    def __init__(self, classifier_type='logistic', vocabulary=None, ngram_range=(1, 1), stemming=True):
        self._vocabulary = vocabulary is not None
        self._stemming = stemming
        self._stemmer = SnowballStemmer('english')
        self._vectorizer = CountVectorizer(vocabulary=vocabulary,
                                           ngram_range=ngram_range,
                                           encoding='utf8',
                                           stop_words=stopwords)
        try:
            self._classifier = classifiers[classifier_type]()
        except KeyError:
            print("Classifier type must be one of '%s'" % ", ".join(classifiers))

    def fit(self, X, y):
        """
        Fit the model
        :param X: a list of texts
        :param y: a list of labels
        :return: self
        """
        text_gen = add_negation((self._stem(text) for text in X))
        if self._vocabulary:
            _X_train = self._vectorizer.transform(text_gen)
        else:
            _X_train = self._vectorizer.fit_transform(text_gen)
        self._classifier.fit(_X_train, y)
        return self

    def predict(self, X):
        X = self._vectorize(X)
        return self._classifier.predict(X)

    def _stem(self, text):
        """
        Stem words in a single document
        :param text: a string
        :return: string of stemmed words
        """
        text = text.lower()
        text = ' '.join(map(self._stemmer.stem, text.split()))
        return text

    def _vectorize(self, X):
        """
        Vectorize the input X
        :param X: an iterable of documents
        :return: the sparse matrix representation of X
        """
        text_gen = add_negation((self._stem(text) for text in X))
        return self._vectorizer.transform(text_gen)
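A hypothetical usage sketch for TextClassifier, assuming the module-level classifiers dict (with a 'logistic' entry), stopwords list, and add_negation helper it references are available:

texts = ["I really liked this movie", "I did not like this movie at all",
         "a wonderful, touching film", "boring and far too long"]
labels = ["pos", "neg", "pos", "neg"]

model = TextClassifier(classifier_type='logistic', ngram_range=(1, 2), stemming=True)
model.fit(texts, labels)
print(model.predict(["not a film I would recommend"]))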
def clf_event_one_one(clf, X_train, Y_train, X_test, first_index, second_index):
    vec = CountVectorizer(lowercase=True, min_df=2)
    vec = vec.fit(X_train)

    X_train_trans = vec.transform(X_train)
    X_test_trans = vec.transform(X_test)

    # transformer = TfidfTransformer()
    # X_train_trans, X_test_trans = transformer.fit_transform(X_train_trans), transformer.fit_transform(X_test_trans)

    clf.fit(X_train_trans, Y_train)  # training model
    y_test_pred = convert_test_pred(clf.predict(X_test_trans), first_index, second_index)
    return y_test_pred
Example #30
def generate_submission_combined_text_lat_long():
    #get data from csv file
    x , y_votes, y_comments, y_views, lat_train = read_train_data()
    #transform to numpy arrays for easier handling
    y_votes = np.array(y_votes)
    y_comments = np.array(y_comments)
    y_views = np.array(y_views)
    lat_train = np.array(lat_train)
    #get test data
    x_test, ids, lat_test = read_test_data()
    lat_test = np.array(lat_test)
    #Change the parameters from the objects with the values from gridsearch
    #vec_votes = TfidfVectorizer(ngram_range=(1, 2), use_idf=False, stop_words=None, strip_accents='unicode', min_df=3, max_features=500, max_df=5000)
    #vec_comments = TfidfVectorizer(stop_words=None, strip_accents='unicode',analyzer='word',ngram_range=(1, 2), use_idf=False, min_df=3)
    #vec_views = TfidfVectorizer(stop_words=None, strip_accents='unicode', use_idf=False, ngram_range=(1, 2), min_df=3)
    vec_votes = CountVectorizer(stop_words=None, strip_accents='unicode',analyzer='word',ngram_range=(1, 2), min_df=2, max_features=None, max_df=1000)
    vec_comments = CountVectorizer(stop_words=None, strip_accents='unicode',analyzer='word',ngram_range=(1, 2), min_df=2, max_features=100, max_df=500)
    vec_views = CountVectorizer(stop_words=None, strip_accents='unicode',analyzer='word',ngram_range=(1, 2), min_df=1, max_features=None, max_df=500)
    #transform x and x_test into count matrices to feed the classifiers
    x_votes = vec_votes.fit_transform(x)
    x_comments = vec_comments.fit_transform(x)
    x_views = vec_views.fit_transform(x)
    x_test_transformed_votes = vec_votes.transform(x_test)
    x_test_transformed_comments = vec_comments.transform(x_test)
    x_test_transformed_views = vec_views.transform(x_test)
    print "TFIDF Matrixes generated"
    print
    print "performing data union"
    #X_all_train =  hstack((x.as_matrix(), x_text))
    # stack the latitude column (not its shape) next to the text features
    x_votes = hstack((lat_train.reshape(-1, 1), x_votes))
    x_comments = hstack((lat_train.reshape(-1, 1), x_comments))
    x_views = hstack((lat_train.reshape(-1, 1), x_views))
    x_test_transformed_votes = hstack((lat_test.reshape(-1, 1), x_test_transformed_votes))
    x_test_transformed_comments = hstack((lat_test.reshape(-1, 1), x_test_transformed_comments))
    x_test_transformed_views = hstack((lat_test.reshape(-1, 1), x_test_transformed_views))
    #Create 1 linear classifier for each target variable 
    #num_votes_clf, num_comments_cfl, num_views_cfl = SGDRegressor(alpha=0.00001), SGDRegressor(alpha=0.00001), SGDRegressor(alpha=0.001)
    num_votes_clf, num_comments_cfl, num_views_cfl = BernoulliNB(fit_prior=True, alpha=0.001), BernoulliNB(fit_prior=True, alpha=0.01), BernoulliNB(fit_prior=True, alpha=0.1)
    print "Fitting models.."
    num_votes_clf.fit(x_votes, y_votes)
    num_comments_cfl.fit(x_comments, y_comments)
    num_views_cfl.fit(x_views, y_views)
    print "Fitting done"
    print
    gc.collect()
    #predict number of votes 
    pred_votes = num_votes_clf.predict(x_test_transformed_votes)
    pred_comments = num_comments_cfl.predict(x_test_transformed_comments)
    pred_views = num_views_cfl.predict(x_test_transformed_views)
    #generate submission response csv file
    create_csv_response(len(x_test), ids, pred_views, pred_votes, pred_comments)

def get_custom_stopwords(stop_words_file):
    """Read a custom stop-word list, one word per line, from a text file."""
    with open(stop_words_file, encoding='utf-8') as f:
        custom_stopwords_list = [line.strip() for line in f]
    return custom_stopwords_list


stop_words_file = '哈工大停用词表.txt'

stopwords = get_custom_stopwords(stop_words_file)

vect = CountVectorizer(max_df=0.8,
                       min_df=3,
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',
                       stop_words=frozenset(stopwords))
#print(vect)
test = pd.DataFrame(vect.fit_transform(X_train).toarray(),
                    columns=vect.get_feature_names())
test.head()

nb = MultinomialNB()

X_train_vect = vect.fit_transform(X_train)

nb.fit(X_train_vect, y_train)
train_score = nb.score(X_train_vect, y_train)
print('Training accuracy', train_score)

# X_test_vect = vect.transform(X_test)
# print(nb.score(X_test_vect, y_test))

X_vec = vect.transform(X)
nb_result = nb.predict(X_vec)
data['nb_result'] = nb_result
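
The score above is measured on the training texts themselves; a minimal held-out evaluation sketch along the lines of the commented-out block, assuming X and y are the full text and label columns that X_train/y_train were split from:

from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=22)

X_tr_vect = vect.fit_transform(X_tr)     # fit the vocabulary on the training split only
X_te_vect = vect.transform(X_te)

nb_holdout = MultinomialNB()
nb_holdout.fit(X_tr_vect, y_tr)
print('Held-out accuracy:', nb_holdout.score(X_te_vect, y_te))
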
def main():
    # The file path where to find the data
    path_folder = "data/"

    # Opening metadata
    meta_data = pd.read_csv(path_folder + "Tobacco3482.csv")

    # Here I'm extracting the labels
    labels = np.unique(meta_data["label"])

    # Opening the data
    x = []
    y = []
    label_classes = {}
    i = 0
    for label in labels:
        path = path_folder + label + "/*.txt"
        print("Opening " + label + " data")
        files = glob.glob(path)
        for file in files:
            file_tmp = open(file, 'r')
            x.append(file_tmp.read())
            y.append(label)
            file_tmp.close()
        label_classes[i] = label
        i += 1
    print("Opened " + str(len(x)) + " documents, " + str(len(np.unique(y))) +
          " different classes")

    # Here I'm extracting the label
    labels = np.unique(meta_data["label"])

    # Treating the labels
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(y)

    # Splitting the data into train and test
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42)

    # Transforming the data into token representation
    vectorizer = CountVectorizer()
    vectorizer.fit(x_train)

    x_train_counts = vectorizer.transform(x_train)
    x_test_counts = vectorizer.transform(x_test)

    # Bayesian part

    # Creation of the model
    clf = MultinomialNB()
    print("Training Bayesian for baseline")
    # Training
    clf.fit(x_train_counts, y_train)

    print("Printing results for Bayesian")
    # Printing of the results
    print("Accuracy score : ")
    print(clf.score(x_test_counts, y_test))
    y_pred = clf.predict(x_test_counts)
    print("Confusion matrix :")
    print(confusion_matrix(y_test, y_pred))
    print("Classification report :")
    print(classification_report(y_test, y_pred))
    print("Where classes are :")
    for label in label_classes:
        print(str(label) + " : " + label_classes[label])

    # Neural Network part
    # creation of the callbacks to save the best model

    checkpointer = ModelCheckpoint(filepath="weights.hdf5",
                                   verbose=1,
                                   save_best_only=True)
    callbacks = [checkpointer]

    # Extracting the size of the data
    dimension_data = len(x_train_counts.toarray()[0])

    # Creation of the model
    NN = model_creation(dimension_data)

    print("Training neural network, this may take while")
    # Training of the data
    NN.fit(x_train_counts.toarray(),
           to_categorical(y_train),
           epochs=10,
           validation_split=0.1,
           batch_size=128,
           callbacks=callbacks)

    # Loading the best model
    NN.load_weights('weights.hdf5')

    print("Printing neural network results")
    # Printing the results
    print("Accuracy score :")
    print(NN.evaluate(x_test_counts.toarray(), to_categorical(y_test))[1])

    print("Confusion matrix :")
    confusion_matrix_NN(NN, x_test_counts.toarray(), to_categorical(y_test))

    print("Classification report :")
    y_pred = NN.predict(np.array(x_test_counts.toarray()))
    y_test_class = np.argmax(to_categorical(y_test), axis=1)
    y_pred_class = np.argmax(y_pred, axis=1)
    print(classification_report(y_test_class, y_pred_class))

    print("Where classes are :")
    for label in label_classes:
        print(str(label) + " : " + label_classes[label])

    print(
        "The model is trained and the weights are saved at weights.hdf5, closing script"
    )
Example #33
def main():
    #setting up data
    train_path = 'msd_genre_dataset.csv'
    train_df = pd.read_csv(train_path)
    print(train_df.head())

    #splitting dataset and adding labels
    metal_df = train_df[train_df['genre'] == 'metal']
    met_len = len(metal_df.index)
    met = [1] * met_len
    metal_df['label'] = met
    metal_df = metal_df.sample(434)

    classical_df = train_df[train_df['genre'] == 'classical']
    classical_len = len(classical_df.index)
    classical = [0] * classical_len
    classical_df['label'] = classical
    classical_df = classical_df.sample(434)

    rock_df = train_df[train_df['genre'] == 'classic pop and rock']
    rock_len = len(rock_df.index)
    rock = [2] * rock_len
    rock_df['label'] = rock
    rock_df = rock_df.sample(434)

    pop_df = train_df[train_df['genre'] == 'pop']
    pop_len = len(pop_df.index)
    pop = [3] * pop_len
    pop_df['label'] = pop
    pop_df = pop_df.sample(434)

    reggae_df = train_df[train_df['genre'] == 'soul and reggae']
    reggae_len = len(reggae_df.index)
    reggae = [4] * reggae_len
    reggae_df['label'] = reggae
    reggae_df = reggae_df.sample(434)

    country_df = train_df[train_df['genre'] == 'folk']
    country_len = len(country_df.index)
    country = [5] * country_len
    country_df['label'] = country
    country_df = country_df.sample(434)

    hiphop_df = train_df[train_df['genre'] == 'hip-hop']
    hiphop_len = len(hiphop_df.index)
    hiphop = [6] * hiphop_len
    hiphop_df['label'] = hiphop

    jnb_df = train_df[train_df['genre'] == 'jazz and blues']
    jnb_len = len(jnb_df.index)
    jnb = [7] * jnb_len
    jnb_df['label'] = jnb
    jnb_df = jnb_df.sample(434)

    disco_df = train_df[train_df['genre'] == 'dance and electronica']
    disco_len = len(disco_df.index)
    disco = [8] * disco_len
    disco_df['label'] = disco
    disco_df = disco_df.sample(434)

    frames = [
        metal_df, classical_df, rock_df, pop_df, reggae_df, country_df,
        hiphop_df, jnb_df, disco_df
    ]
    df = pd.concat(frames)

    print('-------------------------------------')
    print('-------------------------------------')
    print('-----------9 Genre Model-------------')
    print('-------------------------------------')
    print('-------------------------------------')
    print("Generating Model based on Artist Name")
    #Transform names to integers (bag of words)
    vectorizer = CountVectorizer().fit(df['artist_name'])

    #turn names into count vectors
    x = vectorizer.transform(df['artist_name'])

    y = df['label']

    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

    # instantiate the model as clf(classifier) and train it
    clf = MultinomialNB()
    clf.fit(xtrain, ytrain)
    acc = clf.score(xtest, ytest)
    print("Artist Accuracy on MSD Dataset: ", acc)

    #Our data set
    test_df = build_dataset()
    xtest = vectorizer.transform(test_df['artist_name'])
    ytest = test_df['label']
    acc = clf.score(xtest, ytest)
    print("Artist Accuracy on our dataset: ", acc)

    print('-------------------------------------')

    print("Generating Model based on Song Title")

    #Transform names to integers (bag of words)
    vectorizer = CountVectorizer().fit(df['title'])

    #turn names into count vectors
    x = vectorizer.transform(df['title'])

    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

    # instantiate the model as clf(classifier) and train it
    clf_title = MultinomialNB()
    clf_title.fit(xtrain, ytrain)
    acc = clf_title.score(xtest, ytest)
    print("Song Title Accuracy on MSD Dataset:", acc)

    xtest = vectorizer.transform(test_df['title'])
    ytest = test_df['label']
    acc = clf_title.score(xtest, ytest)
    print("Song Title Accuracy on our Dataset:", acc)
    print('-------------------------------------')

    ##################################################################################################################################################

    features = ['tempo', 'time_signature', 'loudness', 'key', 'duration']

    print("Generating Model based on", features)
    all_features = chain.from_iterable(
        combinations(features, r) for r in range(1, len(features) + 1))

    max_accuracys = []
    for feature_list in tqdm(all_features):
        feature = list(feature_list)

        #x = df[['tempo','time_signature','loudness','key','duration']]
        x = df[feature]
        y = df['label']

        xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

        max_acc = -1
        for k in range(3, 250, 5):
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(xtrain, ytrain)
            acc = knn.score(xtest, ytest)
            if acc > max_acc:
                max_acc = acc
        max_accuracys.append((max_acc, feature))
    acc = [item[0] for item in max_accuracys]
    feat = [item[1] for item in max_accuracys]

    print("KNN Model based on ", features, "\nAccuracy on MSD dataset:",
          max(acc), "-- uses:", feat[acc.index(max(acc))])
Example #34
class Enrich:
    '''
    Collection of methods used to enrich the initial dictionary (hand-crafted by mix-and-match) with information from the corpus

    Args:
        dictionary (dataframe): initial dictionary
        genre (array): genres of interest

    Attributes:
        genre (array): genres of interest
        dict1 (dictionary): initial dictionary stored as a dict instead of a dataframe
        seed_topic_list (list): list of seed topics for seeded topic modeling, built from the initial dictionary
        vectorizer (sklearn object): CountVectorizer fitted on the pre-processed corpus
        tf_dtm (sparse matrix): term-frequency document-term matrix
        vocab (array): feature names of the document-term matrix
        new_topic_keywords (list): additional new keywords that converged around the seeded topics after seeded topic modeling
    '''
    def __init__(self, dictionary, genre):
        self.genre = genre
        self.dict1 = {}

        for i in range(len(self.genre)):
            self.dict1[self.genre[i]] = list(
                set([
                    keyword for keyword in dictionary[self.genre[i]]
                    if not pd.isna(keyword)
                ]))

        self.seed_topic_list = [self.dict1[topic] for topic in self.genre]
        self.vectorizer = None
        self.tf_dtm = None
        self.vocab = None

        self.new_topic_keywords = None

    def get_dtm(self, corpus, min_df, max_df):
        ''' Get the TF-IDF document-term matrix from the pre-processed corpus

        Args:
            corpus (series): pandas Series containing the documents of the pre-processed corpus
            min_df (int): CountVectorizer parameter for the minimum number of documents a keyword must appear in
            max_df (int): CountVectorizer parameter for the maximum number of documents a keyword may appear in

        Returns:
            dtm (sparse matrix): TF-IDF document-term matrix
        '''
        self.vectorizer = CountVectorizer(ngram_range=(1, 1),
                                          min_df=min_df,
                                          max_df=max_df)
        self.vectorizer.fit(corpus)
        self.tf_dtm = self.vectorizer.transform(corpus)

        transformer = TfidfTransformer()
        dtm = transformer.fit_transform(self.tf_dtm)

        self.vocab = self.vectorizer.get_feature_names()

        return dtm

    def get_baseline_score(self, movies, cutoff=1):
        ''' Get baseline performance metrics (precision, recall, F1) for the keyword-matching method
        
        Args:
            movies (dataframe): pandas dataframe of pre-processed dataset
            cutoff (int): threshold number of keywords for a movie to belong to a genre
            
        Returns:
            base_yres_test (dataframe): Baseline performance metrics for each genre
        '''

        indices = np.array(range(movies.shape[0]))

        # 20% hold-out for test data
        train_idx0, test_idx = train_test_split(indices,
                                                test_size=0.2,
                                                random_state=42)

        n = len(self.dict1.keys())
        topic_list = [
            self.dict1[self.genre[i]] for i in range(len(self.genre))
        ]
        dict1_list = [[keyword for keyword in topic if not pd.isna(keyword)]
                      for topic in topic_list]
        dict1_list_idx = [[
            self.vocab.index(keyword) for keyword in topic
            if not pd.isna(keyword)
        ] for topic in topic_list]

        doc_topic = pd.DataFrame(np.zeros((self.tf_dtm.shape[0], n)),
                                 columns=self.genre)
        for i in range(n):
            keywords_all_doc = self.tf_dtm[:, dict1_list_idx[i]].toarray()
            keywords_all_doc[keywords_all_doc > 0] = 1
            doc_topic.iloc[:, i] = keywords_all_doc.sum(axis=1)

        ypred = (doc_topic >= cutoff).astype(int)
        ypred_test = ypred.iloc[test_idx, :]
        ytrue_test = movies.iloc[test_idx, 1:]

        base_yres_test = pd.DataFrame(
            np.zeros((3, len(self.genre))),
            index=['Precision', 'Recall', 'F1-score'],
            columns=self.genre)
        for i in range(len(self.genre)):

            base_yres_test.iloc[0, i] = round(
                precision_score(ytrue_test.iloc[:, i], ypred_test.iloc[:, i]),
                4)
            base_yres_test.iloc[1, i] = round(
                recall_score(ytrue_test.iloc[:, i], ypred_test.iloc[:, i]), 4)
            base_yres_test.iloc[2, i] = round(
                f1_score(ytrue_test.iloc[:, i], ypred_test.iloc[:, i]), 4)

        return base_yres_test

    def get_full_cooccurence_matrix(self):
        ''' Get the full co-occurrence (NPMI) matrix of pairwise NPMI scores for every word in the corpus vocabulary

        Args:
            None

        Returns:
            npmi_full (dataframe): pandas dataframe holding the co-occurrence (NPMI) matrix
        '''

        # word-word co-occurrence matrix
        X = self.tf_dtm
        X[X > 0] = 1  # binarize to presence/absence (note: X references self.tf_dtm, so this binarizes it in place)
        Xc = (X.T * X)  # co-occurrence matrix in sparse CSR format
        Xc.setdiag(0)  # zero out self co-occurrence on the diagonal
        Xc = Xc.toarray()

        #npmi matrix
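        # NPMI(a, b) = log(p(a, b) / (p(a) * p(b))) / -log(p(a, b)); word pairs that
        # never co-occur are masked out via valid_pab and end up with a score of 0.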
        word_count = self.tf_dtm.sum(axis=0).tolist()[0]
        corpus_word_count = self.tf_dtm.sum(axis=0).sum()

        pa = np.array(word_count).reshape(len(word_count),
                                          1) / corpus_word_count
        pb = np.array(word_count).reshape(1,
                                          len(word_count)) / corpus_word_count
        pab = Xc / corpus_word_count
        valid_pab = (pab != 0).astype(int)

        npmi = np.log((pab + 2 * (1 - valid_pab)) /
                      (pa * pb)) / -np.log(pab + 2 * (1 - valid_pab))
        npmi = npmi * valid_pab

        npmi = abs(npmi.clip(min=0))
        npmi_full = pd.DataFrame(npmi, index=self.vocab, columns=self.vocab)

        return npmi_full

    def generate_virtual_doc(self, npmi_full, percentile=70):
        ''' Generate a virtual document for each seed keyword
        Each document contains the keywords from the corpus vocabulary that most strongly co-occur with the seed keyword.
        Serves as a way to restrict the npmi matrix.

        Args:
            npmi_full (array): full npmi matrix
            percentile (int): cutoff threshold for keywords to be selected for the virtual documents

        Returns:
            vdoc (list): list of virtual documents
        '''

        # Generation of virtual documents
        npmi_sel = npmi_full.copy(deep=True)
        seed_words = [
            items for substring in self.seed_topic_list for items in substring
        ]

        npmi_sel.index = npmi_full.columns
        npmi_sel = npmi_sel.loc[seed_words, :]

        cutoff_lst = []
        for i in range(len(seed_words)):
            cutoff_lst.append(
                np.percentile(
                    npmi_sel.iloc[i, :].values[npmi_sel.iloc[
                        i, :].to_numpy().nonzero()], percentile))

        vdoc = [
            ' '.join([seed_words[i]] + [
                self.vocab[idx] for idx, val in enumerate(npmi_sel.iloc[i, :])
                if val >= np.max(cutoff_lst)
            ]) for i in range(len(seed_words))
        ]

        return vdoc

    def get_restricted_npmi_vectors(self, vdoc, npmi_full, size=300):
        ''' Restrict npmi matrix by virtual document and perform dimension reduction  

        Args:
            vdoc (list): list of virtual documents
            npmi_full (array): full npmi matrix
            size (int): final length for dimension reduction
            
        Returns:
            npmi_embed (array): restricted npmi matrix
            vdoc_vocab (array): keywords from virtual documents
        '''
        # Restrict npmi matrix
        vectorizer = CountVectorizer(ngram_range=(1, 1))
        vectorizer.fit(vdoc)
        vdoc_vocab = vectorizer.get_feature_names()

        npmi_vdoc = npmi_full.copy(deep=True)

        npmi_vdoc = npmi_vdoc.loc[:, vdoc_vocab]
        npmi_vdoc.index = npmi_full.columns
        npmi_vdoc = npmi_vdoc.loc[vdoc_vocab, :]

        #dimension reduction using matrix factorization
        nmf = NMF(n_components=size, random_state=42, alpha=1, l1_ratio=0.0)
        nmf.fit(npmi_vdoc)

        npmi_embed = nmf.transform(npmi_vdoc)

        return npmi_embed, vdoc_vocab

    def customized_nmf(self, npmi_embed, vdoc_vocab):
        ''' Non-negative matrix factorization (with customized H matrix) on restricted npmi matrix

        Args:
            npmi_embed (array): restricted npmi matrix
            vdoc_vocab (array): keywords from virtual documents
            
        Returns:
            nmf (sklearn object): fitted nmf model
        '''
        n = len(self.dict1.keys())

        nmf = NMF(n_components=n,
                  random_state=42,
                  init='custom',
                  alpha=1,
                  l1_ratio=0.0)
        dtm_used = np.transpose(npmi_embed)

        # Customize W and H matrices
        avg = np.sqrt(dtm_used.mean() / n)
        seed_idx_list = [[
            vdoc_vocab.index(seed) for seed in self.seed_topic_list[i]
        ] for i in range(len(self.seed_topic_list))]
        H = np.zeros((n, dtm_used.shape[1]))
        for i in range(len(self.seed_topic_list)):
            for idx in seed_idx_list[i]:
                H[i, idx] = avg * 100

        W = avg * np.random.RandomState(42).randn(dtm_used.shape[0], n)
        W = np.abs(W)

        nmf.fit(dtm_used, H=H, W=W)

        return nmf

    def new_words(self, nmf, vdoc_vocab, n_words=20):
        '''Get additional new words that converged around seeded keywords from nmf topic modeling

        Args:
            nmf (sklearn object): fitted nmf model
            vdoc_vocab (array): keywords from virtual documents
            n_words (int): top number of words to consider from nmf model
            
        Returns:
            new_words_df (dataframe): dataframe to check the additional new words found
        '''
        keywords = np.array(vdoc_vocab)
        self.new_topic_keywords = []
        for topic_weights in nmf.components_:
            top_keyword_locs = (-topic_weights).argsort()[:n_words + 1]
            self.new_topic_keywords.append(keywords.take(top_keyword_locs))

        # Topic - New Keywords Dataframe
        new_words = [[
            new_word for new_word in list(self.new_topic_keywords[i])
            if new_word not in self.dict1[self.genre[i]]
        ] for i in range(len(self.genre))]
        new_words_df = pd.DataFrame(new_words, index=self.genre).T

        return new_words_df

    def pruning(self, npmi_full, vdoc_vocab, cutoff=1):
        '''Prune keywords from the enriched dictionary that co-occur strongly with keywords in other topics

        Args:
            npmi_full (array): full npmi matrix
            vdoc_vocab (array): keywords from virtual documents
            cutoff (float): npmi threshold for pruning. Set cutoff = 1 if no pruning is required.

        Returns:
            dict2_sel (dictionary): enriched dictionary as a dictionary for downstream processing
            dict2_list_idx (list): list of vocabulary indices for the keywords of each genre
            enriched_dict (dataframe): enriched dictionary as a dataframe for output and checking
        '''

        # Remove dictionary words that co-occur strongly with keywords in other topics
        dict2 = {
            self.genre[i]: list(
                set(
                    list(self.dict1[self.genre[i]]) +
                    list(self.new_topic_keywords[i])))
            for i in range(len(self.genre))
        }
        dict2_sel = {}

        for i in range(len(self.genre)):
            potential = dict2[self.genre[i]]
            other_words = [[
                word for word in dict2[self.genre[n]] if word not in potential
            ] for n in range(len(self.genre))]
            other_words = [
                items for substring in other_words for items in substring
            ]
            other_words_idx = [vdoc_vocab.index(word) for word in other_words]

            dict2_sel[self.genre[i]] = []

            for j in range(len(potential)):
                npmi_val = npmi_full.iloc[vdoc_vocab.index(potential[j]),
                                          other_words_idx]
                if np.sum(npmi_val >= cutoff) == 0:
                    dict2_sel[self.genre[i]].append(potential[j])

        topic_list = [dict2_sel[self.genre[i]] for i in range(len(self.genre))]
        dict2_list_idx = [[
            self.vocab.index(keyword) for keyword in topic
            if not pd.isna(keyword)
        ] for topic in topic_list]

        enriched_dict = pd.DataFrame(
            [dict2_sel[self.genre[i]] for i in range(len(self.genre))],
            index=self.genre).T

        return dict2_sel, dict2_list_idx, enriched_dict
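

# --- Added illustration (not from the original snippet) ---
# Minimal, self-contained sketch of the NPMI co-occurrence computation used in
# Enrich.get_full_cooccurence_matrix(), run on a tiny toy corpus. The intended call
# order of the class above is roughly: get_dtm -> get_full_cooccurence_matrix ->
# generate_virtual_doc -> get_restricted_npmi_vectors -> customized_nmf ->
# new_words -> pruning. The helper name below is hypothetical.
def _npmi_toy_demo():
    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["space ship alien", "alien invasion space", "romance love story"]
    vec = CountVectorizer()
    X = (vec.fit_transform(docs) > 0).astype(int)  # binary presence document-term matrix
    Xc = (X.T @ X).toarray()  # word-word co-occurrence counts
    np.fill_diagonal(Xc, 0)  # ignore self co-occurrence

    word_count = np.asarray(X.sum(axis=0)).ravel()
    total = word_count.sum()
    pa = word_count.reshape(-1, 1) / total
    pb = word_count.reshape(1, -1) / total
    pab = Xc / total
    valid = (pab != 0).astype(int)

    # NPMI = log(p(a,b) / (p(a) p(b))) / -log(p(a,b)), with zero-count pairs masked out
    npmi = np.log((pab + 2 * (1 - valid)) / (pa * pb)) / -np.log(pab + 2 * (1 - valid))
    npmi = np.clip(npmi * valid, 0, None)

    try:
        vocab = vec.get_feature_names_out()  # scikit-learn >= 1.0
    except AttributeError:
        vocab = vec.get_feature_names()  # older scikit-learn, as used in the snippet above
    return pd.DataFrame(npmi, index=vocab, columns=vocab)

# Example: print(_npmi_toy_demo().round(2))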
Example #35
0
df_tmp['text'] = df_tmp['user_items'].apply(
    lambda x: " ".join([str(i) for i in x]))
txt1 = data.groupby('user_id')['user_items'].apply(
    lambda x: " ".join(x)).reset_index()['user_items']
X = list(txt1.values)
tfv = TfidfVectorizer(min_df=35)
tfv.fit(X)

train_uid = train_uid.merge(df_tmp, on='user_id', how='left')
test_uid = test_uid.merge(df_tmp, on='user_id', how='left')

traintext_tfidf = tfv.transform(train_uid['text'].values)
sparse.save_npz('./tf_idf_feats/traintext_tfidf3.npz', traintext_tfidf)
testtext_tfidf = tfv.transform(test_uid['text'].values)
sparse.save_npz('./tf_idf_feats/testtext_tfidf3.npz', testtext_tfidf)

# #### Build CountVectorizer features

os.system('mkdir -pv ../../data/lgb/countvec_feats')

cv = CountVectorizer(min_df=30)
cv.fit(df_tmp['text'])

train_ta = cv.transform(train_uid['text'])
sparse.save_npz('../../data/lgb/countvec_feats/traintext_countvec2.npz',
                train_ta)
test_ta = cv.transform(test_uid['text'])
sparse.save_npz('../../data/lgb/countvec_feats/testtext_countvec2.npz',
                test_ta)
Example #36
0
sdata1 = pre_sentence(arr_file1)
cla1 = arr_file1[:,-1]

sdata2 = pre_sentence(arr_file2)
cla2 = arr_file2[:,-1]
id_ =  arr_file2[:,0]

X_train = sdata1
X_test = sdata2
y_train = cla1
y_test = cla2

count = CountVectorizer(token_pattern=r'[#@_$%\w\d]{2,}', lowercase=False)
X_train_bag_of_words = count.fit_transform(X_train)

X_test_bag_of_words = count.transform(X_test)

print("----dt")
clf = tree.DecisionTreeClassifier(min_samples_leaf=int(0.01 * len(X_train)),
                                  criterion='entropy',
                                  random_state=0)
model = clf.fit(X_train_bag_of_words, y_train)
predicted_y  = predict_and_test(model, X_test_bag_of_words)
for i in range(len(X_test)):
    print(id_[:][i], predicted_y[i])
Example #37
0
def main():
    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train-path", type=str, required=True,
                        help="Path to the train jsonl file.")
    parser.add_argument("--dev-path", type=str, required=True,
                        help="Path to the dev jsonl file.")
    parser.add_argument("--serialization-dir", "-s", type=str, required=True,
                        help="Path to store the preprocessed output.")
    parser.add_argument("--vocab-size", type=int, required=False, default=10000,
                        help="Path to store the preprocessed corpus vocabulary (output file name).")
    parser.add_argument("--tokenize", action='store_true',
                        help="Path to store the preprocessed corpus vocabulary (output file name).") 
    parser.add_argument("--tokenizer-type", type=str, default="just_spaces",
                        help="Path to store the preprocessed corpus vocabulary (output file name).")
    parser.add_argument("--reference-corpus-path", type=str, required=False,
                        help="Path to store the preprocessed corpus vocabulary (output file name).")
    parser.add_argument("--tokenize-reference", action='store_true',
                        help="Path to store the preprocessed corpus vocabulary (output file name).") 
    parser.add_argument("--reference-tokenizer-type", type=str, default="just_spaces",
                        help="Path to store the preprocessed corpus vocabulary (output file name).")
    args = parser.parse_args()

    if not os.path.isdir(args.serialization_dir):
        os.mkdir(args.serialization_dir)
    
    vocabulary_dir = os.path.join(args.serialization_dir, "vocabulary")

    if not os.path.isdir(vocabulary_dir):
        os.mkdir(vocabulary_dir)

    tokenized_train_examples = load_data(args.train_path, args.tokenize, args.tokenizer_type)
    tokenized_dev_examples = load_data(args.dev_path, args.tokenize, args.tokenizer_type)

    rationale_dir = lambda x: os.path.join(os.path.dirname(x), "rationales", os.path.basename(x))
    tokenized_train_rationales = load_data(rationale_dir(args.train_path), args.tokenize, args.tokenizer_type)
    tokenized_dev_rationales = load_data(rationale_dir(args.dev_path), args.tokenize, args.tokenizer_type)

    print("fitting count vectorizer...")

    count_vectorizer = CountVectorizer(stop_words='english', max_features=args.vocab_size, token_pattern=r'\b[^\d\W]{3,30}\b')
    
    text = (tokenized_train_examples +
            tokenized_dev_examples +
            tokenized_train_rationales +
            tokenized_dev_rationales)
    
    count_vectorizer.fit(tqdm(text))

    vectorized_train_examples = count_vectorizer.transform(tqdm(tokenized_train_examples))
    vectorized_dev_examples = count_vectorizer.transform(tqdm(tokenized_dev_examples))

    vectorized_train_rationales = count_vectorizer.transform(tqdm(tokenized_train_rationales))
    vectorized_dev_rationales = count_vectorizer.transform(tqdm(tokenized_dev_rationales))

    reference_vectorizer = CountVectorizer(stop_words='english', token_pattern=r'\b[^\d\W]{3,30}\b')
    if not args.reference_corpus_path:
        print("fitting reference corpus using development data...")
        reference_matrix = reference_vectorizer.fit_transform(tqdm(tokenized_dev_examples))
    else:
        print(f"loading reference corpus at {args.reference_corpus_path}...")
        reference_examples = load_data(args.reference_corpus_path, args.tokenize_reference, args.reference_tokenizer_type)
        print("fitting reference corpus...")
        reference_matrix = reference_vectorizer.fit_transform(tqdm(reference_examples))

    reference_vocabulary = reference_vectorizer.get_feature_names()

    # add @@unknown@@ token vector
    vectorized_train_examples = sparse.hstack((np.array([0] * len(tokenized_train_examples))[:,None], vectorized_train_examples))
    vectorized_dev_examples = sparse.hstack((np.array([0] * len(tokenized_dev_examples))[:,None], vectorized_dev_examples))
    
    vectorized_train_rationales = sparse.hstack((np.array([0] * len(tokenized_train_rationales))[:,None], vectorized_train_rationales))
    vectorized_dev_rationales = sparse.hstack((np.array([0] * len(tokenized_dev_rationales))[:,None], vectorized_dev_rationales))
    
    master = sparse.vstack([vectorized_train_examples, vectorized_dev_examples, vectorized_train_rationales, vectorized_dev_rationales])

    # generate background frequency
    print("generating background frequency...")
    bgfreq = dict(zip(count_vectorizer.get_feature_names(), (np.array(master.sum(0)) / args.vocab_size).squeeze()))

    print("saving data...")
    save_sparse(vectorized_train_examples, os.path.join(args.serialization_dir, "train.npz"))
    save_sparse(vectorized_dev_examples, os.path.join(args.serialization_dir, "dev.npz"))

    save_sparse(vectorized_train_rationales, os.path.join(args.serialization_dir, "train_rationales.npz"))
    save_sparse(vectorized_dev_rationales, os.path.join(args.serialization_dir, "dev_rationales.npz"))

    if not os.path.isdir(os.path.join(args.serialization_dir, "reference")):
        os.mkdir(os.path.join(args.serialization_dir, "reference"))
    save_sparse(reference_matrix, os.path.join(args.serialization_dir, "reference", "ref.npz"))
    write_to_json(reference_vocabulary, os.path.join(args.serialization_dir, "reference", "ref.vocab.json"))
    write_to_json(bgfreq, os.path.join(args.serialization_dir, "vampire.bgfreq"))
    
    write_list_to_file(['@@UNKNOWN@@'] + count_vectorizer.get_feature_names(), os.path.join(vocabulary_dir, "vampire.txt"))
    write_list_to_file(['*tags', '*labels', 'vampire'], os.path.join(vocabulary_dir, "non_padded_namespaces.txt"))
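
    # A hypothetical invocation of this preprocessing script (the script name below is
    # assumed; the flags match the argparse definitions above):
    #   python preprocess.py --train-path train.jsonl --dev-path dev.jsonl \
    #       --serialization-dir serialized/ --vocab-size 10000 --tokenize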
Example #38
0
train_sentiments = (train_sentiments.replace({
    'positive': 1,
    'negative': 0
})).values
test_sentiments = (test_sentiments.replace({
    'positive': 1,
    'negative': 0
})).values

corpus_train = CleanUpData(train_reviews)
corpus_test = CleanUpData(test_reviews)
#corpus_train = CleanUpData(train)
#corpus_test = CleanUpData(test)

count_vec = CountVectorizer(ngram_range=(1, 3), binary=False)
count_vec_train = count_vec.fit_transform(corpus_train)
count_vec_test = count_vec.transform(corpus_test)

linear_svc_count = LinearSVC(C=0.5, random_state=42, max_iter=5000)
linear_svc_count.fit(count_vec_train, train_sentiments)

predict_count = linear_svc_count.predict(count_vec_test)

print(
    "Classification Report: \n",
    classification_report(test_sentiments,
                          predict_count,
                          target_names=['Negative', 'Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_sentiments, predict_count))
print("Accuracy: \n", accuracy_score(test_sentiments, predict_count))
Example #39
0
    def predict_posts(self):

        docs_train, docs_test, y_train, y_test = train_test_split(
            X, y, test_size=0.01, random_state=42)

        print("Number of data point is " + str(len(y)))

        ###############
        # uncomment either one of the below
        # predict unlabelled tweet OR test classifier on gold standard
        ###############

        # dataset_topredict = pd.read_csv(path_to_file_to_be_predicted, header=0, names=['tweets'])
        dataset_topredict = pd.read_csv(path_to_gold_standard_file,
                                        header=0,
                                        names=['tweets', 'class'])

        X_topredict = dataset_topredict['tweets']
        y_goldstandard = dataset_topredict['class']

        ###############
        # train classifier
        ###############

        # Get list of features
        count_vect = CountVectorizer(stop_words=stopwords,
                                     min_df=3,
                                     max_df=0.90,
                                     ngram_range=_ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print("Shape of train data is " + str(X_CV.shape))

        # tfidf transformation###

        tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        # train the classifier

        print("Fitting data ...")
        clf = SGDClassifier(loss=_loss,
                            penalty=_penalty,
                            alpha=_alpha,
                            random_state=42).fit(X_tfidf, y_train)

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 X_tfidf,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation
        # the factor two is to signify 2 sigma, which is 95% confidence level

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        ##################
        # run classifier to predict tweets
        ##################

        X_test_CV = count_vect.transform(X_topredict)

        print("Shape of test data is " + str(X_test_CV.shape))

        X_test_tfidf = tfidf_transformer.transform(X_test_CV)

        y_predicted = clf.predict(X_test_tfidf)

        ##################
        # run classifier on gold standard (tweets that were labelled by twitter insight)
        ##################

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(X_test_tfidf, y_goldstandard))

        print(metrics.classification_report(y_goldstandard, y_predicted))
        cm = metrics.confusion_matrix(y_goldstandard, y_predicted)
        print(cm)

        ##################
        # write prediction results to file
        ##################

        f = open(path_to_store_predicted_results, 'w')

        for yp in y_predicted:
            f.write(yp + '\n')

        f.close()
Example #40
0
    def completeFiltering(singleStringTxt,
                          multiLineTxt,
                          limitOnFreq,
                          limitOnDataW=10000):
        wholeText = singleStringTxt
        cleansed = wholeText.split()[:limitOnDataW]
        table = str.maketrans("", "", string.punctuation)
        cleansed = [w.translate(table) for w in cleansed]
        patched = " ".join(cleansed)
        cleansed = patched.split()
        cleansed = [
            words for words in cleansed
            if not words.lower() in stopwords.words()
        ]

        cleansedTxt = " ".join(cleansed)

        wholeText = [cleansedTxt]
        lineWiseText = multiLineTxt

        # list of text documents
        # create the transform
        vectorizer1 = CountVectorizer()
        vectorizer2 = CountVectorizer()
        # tokenize and build vocab
        vectorizer1.fit(wholeText)
        vectorizer2.fit(lineWiseText)

        # summarize
        wToInd1 = vectorizer1.vocabulary_
        wToInd2 = vectorizer2.vocabulary_
        # encode document
        vector1 = vectorizer1.transform(wholeText)
        vector2 = vectorizer2.transform(lineWiseText)
        # summarize encoded vector
        v1 = vector1.toarray()
        v2 = vector2.toarray()

        finalCount = np.sum(v1, axis=0, keepdims=False)

        countDict1 = dict()

        countDict2 = dict()
        priorities2 = dict()
        for ind in range(len(finalCount)):
            if finalCount[ind] >= limitOnFreq:
                countDict1[getKey(wToInd1, ind)] = finalCount[ind]

        for lines in range(v2.shape[0]):
            countDict = dict()
            for ind in range(v2.shape[1]):
                if v2[lines][ind] >= limitOnFreq:
                    countDict[getKey(wToInd2, ind)] = v2[lines][ind]

            priorities = sorted(countDict, key=countDict.get, reverse=True)

            countDict2[str(lines + 1)] = countDict
            priorities2[str(lines + 1)] = priorities

        contentWords = superImportant("Apple Inc")
        countDict1, misMatch = changePriorities(countDict1, contentWords)
        print("These many got mismatched : ", misMatch)

        priorities1 = sorted(countDict1, key=countDict1.get, reverse=True)

        return priorities1, priorities2, countDict1, countDict2
Example #41
0
class LogisticRegression:
    class Splitter:
        """Represents a set of training/testing data. self.train is a list of Examples, as is self.dev. 
        """
        def __init__(self):
            self.train = []
            self.dev = []
            self.test = []

    class Example:
        """Represents a document with a label. klass is 1 if 'pos' and 0 if 'neg' by convention.
           words is a string (a single movie review).
        """
        def __init__(self):
            self.klass = -1
            self.words = ''

    def __init__(self):
        """Logistic Regression initialization"""
        self.INCLUDE_LEXICON = False
        self.stopList = set(self.readFile(os.path.join('data',
                                                       'english.stop')))
        self.posWords = set(
        )  # positive opinion lexicon obtained from http://web.stanford.edu/class/cs124/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt
        self.negWords = set()  # negative opinion lexicon
        self.vect = CountVectorizer(min_df=20, ngram_range=(1, 1))

        self.X = []  #input values       (N x m) for N examples, m features
        self.Y = []  #true labels        (N x 1) for N examples
        self.weight = []  #weight vector (m x 1) for m features
        self.b = 0  #bias                (N x 1) for N examples
        #TODO: add other data structures needed in the functions you implement below

    #############################################################################
    # TODO TODO TODO TODO TODO
    # Implement a logistic regression model to classify movie review sentiment as either
    # positive or negative using logistic regression.
    #
    # If the INCLUDE_LEXICON  flag is true, add two more features to our Logistic Regression model
    # Feature 1: the total number of positive words in a review
    # Feature 2: the total number of negative words in a review.
    # We have already preprocessed the NRC emotion lexicon for you,
    # and you can access the set of lexicon words in self.posWords and self.negWords.
    # We have also provided the function addFeatures() that takes in the original feature matrix X
    # and a list of lists as the second argument; each list element is a feature vector for one input.
    # This allows the logistic regression model to include your own features along with those from CountVectorizer()
    """ Implement a function to train a logistic regression model.
        Use vectors self.X to store your inputs and self.Y your labels.
        self.vect is a countVectorizer we have created for you. Use it
        to obtain a unigram feature vector for your training data.
        Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
        It creates a sparse matrix containing counts of words in each
        review keeping only those that appear more times than the set threshold(20).
        
        Arguments:
        trainData -- the training data which a list of examples (see class Example: above to see how it's 
        initialized and what its members are). Each example is a single review and its true class.
    """

    def train(self, trainData):
        ### Start your code here

        corpus = [
            example.words for example in trainData
        ]  # returns list of strings, one string for each example/review
        self.X = self.vect.fit_transform(corpus)  # see documentation
        #.vectorize.fit_transform isolates the features (across all the strings/examples in the corpus)
        # and turns X into an array where each "example" becomes a feature-length vector of counts of each feature
        # so now self.X should have all the features (vocab) in the corpus
        self.Y = [example.klass for example in trainData]

        # HINT: Call self.X.todense()  after you've populated self.X with counts.
        # This converts it from a sparse matrix to a dense matrix so
        # we can use it to perform gradient descent.

        self.X = self.X.todense(
        )  # now self.X is a dense matrix of shape (N x m), where N = num_examples, m = num_features

        # HINT: You can use self.addFeatures to add more features.
        # The first argument should be self.X and the second
        # a list of lists. Each list is a feature vector for one document.

        if self.INCLUDE_LEXICON == True:
            # need to do .addFeatures once for each feature (since i can only add one list at a time)
            # and each list is a (list of lists) for *one* feature (the same features for all the reviews in the corpus)
            pos_feat = []
            neg_feat = []

            for review in corpus:  # for each sentence/review in the corpus (the training data):
                tokens = review.split()  # for a given review, split everything
                pos_count = 0
                neg_count = 0
                for token in tokens:  # for each word in this given review
                    if token in self.posWords: pos_count += 1
                    if token in self.negWords: neg_count += 1
                pos_feat.append([pos_count])
                neg_feat.append([neg_count])

            self.X = self.addFeatures(
                self.X, pos_feat)  # pos_feat needs to be a list of lists
            self.X = self.addFeatures(self.X, neg_feat)

        # HINT: Use np.zeros() to initialize self.weight to zeros

        m = np.shape(
            self.X)[1]  # number of columns in self.X = number of features
        self.weight = np.zeros((m, 1))  # self.weight = (m x 1) for m features

        # HINT: Call self.gradientDescent to train your model

        self.gradientDescent()  # run gradient descent to learn the weights and bias

        ###End your code here

    """
    Compute the sigmoid function for the input here.
    Arguments:
    x -- A scalar or numpy array.
    Return:
    s - sigmoid(x)
    HINT: use np.exp() because your input can be a numpy array
    """

    def sigmoid(self, x):
        ### START YOUR CODE HERE

        return 1 / (1 + np.exp(-x))

        ### END YOUR CODE HERE

    """
    Predict what class an input belongs to based on its score.
    If sigmoid(X.W+b) is greater or equal to 0.5, it belongs to
    class 1 (positive class) otherwise, it belongs to class 0
    (negative class). Use the sigmoid function you implemented above.
    HINT: Use self.weight to get W and self.b to get b
    HINT: You can use np.dot to calculate the dot product of arrays
    Arguments:
    x -- A scalar or numpy array.
    Return:
    k -- the predicted class
    """

    def predict(self, x):  # this predicts only for a given row ("example")
        assert x.shape[
            0] == 1, "x has the wrong shape. Expected a row vector, got: " + str(
                x.shape)
        ### Start your code here

        W = self.weight
        b = self.b

        if self.sigmoid(np.dot(x, W) + b) >= 0.5: return 1
        else: return 0

        ### End your code here

    """
    Classify a string of words as either positive (klass =1) or negative (klass =0)
    Hint: Use the self.predict class you implemented above
    Hint: Use self.vect to get a unigram feature vector
    Hint: self.predict takes a row vector as an argument
    Hint: self.vect.transform() takes a list of strings as inputs
    Arguments:
    words -- A string of words (a single movie review)
    Return:
    k - the predicted class. 1 = positive. 0 = negative
    """

    def classify(self, words):
        ## Start your code here

        # use self.vect to get a unigram feature vector

        uni_x = self.vect.transform([words])
        uni_x = uni_x.todense()  # same as what i do in the training

        # also account for .INCLUDE_LEXICON (same as in training)

        if self.INCLUDE_LEXICON == True:
            pos_count = 0
            neg_count = 0
            for token in words.split(
            ):  # 'words' is a string; words.split() splits on space
                if token in self.posWords: pos_count += 1
                if token in self.negWords: neg_count += 1
            uni_x = self.addFeatures(
                uni_x, [[pos_count]])  #addFeatures has to take a list of lists
            uni_x = self.addFeatures(uni_x, [[neg_count]])

        # use self.predict (which takes row vector as argument) and returns prediction

        return self.predict(
            uni_x)  # self.predict takes row vector of features, returns 0 or 1

        ###End your code here


# END TODO(Modify code beyond here with caution)#######################################################################################
## Adds features to self.X by concatenating the unigram matrix with features

    def addFeatures(self, feature1, feature2):
        assert feature1.shape[0] == len(
            feature2), "features have mismatched shape"
        return np.concatenate((feature1, np.array(feature2)), axis=1)

    ## Loss function used for logistic regression
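    # Binary cross-entropy: J = -(1/N) * sum(y*log(a) + (1-y)*log(1-a)), where a = sigmoid(X.W + b)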
    def loss(self, a, y):
        return (-1 / y.shape[0]) * (np.dot(y.T, (np.log(a))) + np.dot(
            (1 - y).T, (np.log(1 - a))))

    def gradientDescent(self, alpha=0.001, numiters=1000):
        self.Y = np.array(self.Y).reshape((-1, 1))
        loss = 0
        for i in range(numiters):
            Z = np.dot(self.X, self.weight) + self.b
            A = self.sigmoid(Z)
            grad = np.dot(self.X.T, (A - self.Y)) / self.Y.shape[0]
            db = np.sum(A - self.Y) / self.Y.shape[0]
            self.weight -= alpha * grad
            self.b -= alpha * db
            prevLoss = loss
            loss = self.loss(A, self.Y)
            stepSize = abs(prevLoss - loss)

            if stepSize[0, 0] < 0.000001:
                break

            if (i % 500 == 0):
                z = np.dot(self.X, self.weight) + self.b
                a = self.sigmoid(z)
                print("loss:" +
                      str(np.squeeze(np.array(self.loss(a, self.Y)))) +
                      "\t %d/%d iterations" % (i, numiters))

    def readFile(self, fileName):
        contents = []
        f = open(fileName, encoding='latin-1')
        contents = f.read()
        f.close()
        return contents

    def buildLexicon(self):
        filePath = os.path.join('data', 'NRC-emotion-lexicon.txt')
        lines = self.readFile(filePath).splitlines()
        for line in lines:
            word, emotion, value = line.split('\t')
            if emotion == 'positive' and int(value) == 1:
                self.posWords.add(word)
            if emotion == 'negative' and int(value) == 1:
                self.negWords.add(word)

    def buildSplit(self, include_test=True):
        split = self.Splitter()
        datasets = ['train', 'dev']
        if include_test:
            datasets.append('test')
        for dataset in datasets:
            for klass in ['pos', 'neg']:
                filePath = os.path.join('data', dataset, klass)
                dataFiles = os.listdir(filePath)
                for dataFile in dataFiles:
                    words = self.readFile(os.path.join(filePath,
                                                       dataFile)).replace(
                                                           '\n', ' ')
                    example = self.Example()
                    example.words = words
                    example.words = self.filterStopWords(example.words.split())
                    example.klass = 1 if klass == 'pos' else 0
                    if dataset == 'train':
                        split.train.append(example)
                    elif dataset == 'dev':
                        split.dev.append(example)
                    else:
                        split.test.append(example)
        return split

    def filterStopWords(self, words):
        """Filters stop words."""
        filtered = []
        for word in words:
            if not word in self.stopList and word.strip() != '':
                filtered.append(word)
        return ' '.join(filtered)
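

# --- Added usage sketch (not from the original snippet) ---
# Rough driver flow for the class above, assuming the data/{train,dev,test}/{pos,neg}
# directory layout read by buildSplit() and the NRC lexicon file read by buildLexicon()
# are present on disk:
#
#   clf = LogisticRegression()
#   clf.buildLexicon()
#   split = clf.buildSplit(include_test=False)
#   clf.train(split.train)
#   correct = sum(clf.classify(ex.words) == ex.klass for ex in split.dev)
#   print("dev accuracy: %.4f" % (correct / len(split.dev)))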
Example #42
0
    def transform(self, X):
        """Transform X using specified encoding scheme.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        """
        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            X = check_array(X, dtype=np.object)
        else:
            X = X_temp

        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            Xi = X[:, i]
            valid_mask = np.in1d(Xi, self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    Xi = Xi.copy()
                    Xi[~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(Xi)

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        if self.encoding == 'ngram-count':
            out = []
            for j, cats in enumerate(self.categories_):
                vectorizer = CountVectorizer(analyzer='char',
                                             ngram_range=(self.n, self.n))
                vectorizer.fit(cats)
                encoder = vectorizer.transform(X[:, j])
                out.append(encoder)
            return sparse.hstack(out)

        if self.encoding == 'ngram-presence':
            out = []
            for j, cats in enumerate(self.categories_):
                vectorizer = CountVectorizer(analyzer='char',
                                             ngram_range=(self.n, self.n))
                vectorizer.fit(cats)
                encoder = vectorizer.transform(X[:, j])
                encoder = (encoder > 0).astype(self.dtype)
                out.append(encoder)
            return sparse.hstack(out)

        if self.encoding == 'ngram-tfidf':
            out = []
            for j, cats in enumerate(self.categories_):
                vectorizer = TfidfVectorizer(analyzer='char',
                                             ngram_range=(self.n, self.n),
                                             smooth_idf=False)
                vectorizer.fit(cats)
                encoder = vectorizer.transform(X[:, j])
                out.append(encoder)
            return np.hstack(out)

        if self.encoding == 'similarity':
            if self.similarity == 'levenshtein-ratio':
                out = []
                for j, cats in enumerate(self.categories_):
                    unqX = np.unique(X[:, j])
                    vect = np.vectorize(lev.ratio)
                    encoder_dict = {x: vect(x, cats.reshape(1, -1))
                                    for x in unqX}
                    encoder = [encoder_dict[x] for x in X[:, j]]
                    encoder = np.vstack(encoder)
                    out.append(encoder)
                return np.hstack(out)

            if self.similarity == 'sorensen':
                out = []
                for j, cats in enumerate(self.categories_):
                    unqX = np.unique(X[:, j])
                    vect = np.vectorize(dist.sorensen)
                    encoder_dict = {x: vect(x, cats.reshape(1, -1))
                                    for x in unqX}
                    encoder = [encoder_dict[x] for x in X[:, j]]
                    encoder = 1 - np.vstack(encoder)
                    out.append(encoder)
                return np.hstack(out)

            if self.similarity == 'jaro-winkler':
                out = []
                for j, cats in enumerate(self.categories_):
                    unqX = np.unique(X[:, j])
                    vect = np.vectorize(jellyfish.jaro_distance)
                    encoder_dict = {x: vect(x, cats.reshape(1, -1))
                                    for x in unqX}
                    encoder = [encoder_dict[x] for x in X[:, j]]
                    encoder = np.vstack(encoder)
                    out.append(encoder)
                return np.hstack(out)

            if self.similarity == 'ngram':
                out = []
                for j, cats in enumerate(self.categories_):
                    encoder = ngram_similarity(X[:, j], cats,
                                               self.n, self.ngram_type,
                                               dtype=self.dtype)
                    out.append(encoder)
                return np.hstack(out)
        if self.encoding == 'target':
            def lambda_(x, n):
                # shrinkage weight for the target encoding; the empirical-Bayes weight
                # x / (x + n) is computed but the weight is currently fixed at 1.0
                out = x / (x + n)
                return 1.0
            out = []
            for j, cats in enumerate(self.categories_):
                counter = collections.Counter(X[:, j])
                unqX = np.unique(X[:, j])
                n = len(X[:, j])
                k = len(cats)
                encoder = {x: 0 for x in unqX}
                if self.clf_type in ['binary_clf', 'regression']:
                    for x in unqX:
                        if x not in cats:
                            Eyx = 0
                        else:
                            Eyx = self.Eyx_[j][x]
                        lambda_n = lambda_(counter[x], n/k)
                        encoder[x] = lambda_n*Eyx + \
                            (1 - lambda_n)*self.Ey_[j]
                    x_out = np.zeros((len(X[:, j]), 1))
                    for i, x in enumerate(X[:, j]):
                        x_out[i, 0] = encoder[x]
                    out.append(x_out.reshape(-1, 1))
            out = np.hstack(out)
            return out

        if self.encoding == 'onehot':
            encoder = []
            for j, cats in enumerate(self.categories_):
                unqX = np.unique(X[:, j])
                cats_dict = {s: i for i, s in enumerate(cats)}
                encoder_unq = sparse.lil_matrix((len(unqX), len(cats)))
                for i, s in enumerate(unqX):
                    try:
                        encoder_unq[i, cats_dict[s]] = 1
                    except KeyError:
                        continue
                unqX_dict = {s: i for i, s in enumerate(unqX)}
                index = [unqX_dict[s] for s in X[:, j]]
                encoder.append(encoder_unq[index])
            out = sparse.hstack(encoder)
            return sparse.csr_matrix(out)
        if self.encoding == 'onehot-dense':
            encoder = []
            for j, cats in enumerate(self.categories_):
                unqX = np.unique(X[:, j])
                cats_dict = {s: i for i, s in enumerate(cats)}
                encoder_unq = sparse.lil_matrix((len(unqX), len(cats)))
                for i, s in enumerate(unqX):
                    try:
                        encoder_unq[i, cats_dict[s]] = 1
                    except KeyError:
                        continue
                unqX_dict = {s: i for i, s in enumerate(unqX)}
                index = [unqX_dict[s] for s in X[:, j]]
                encoder.append(encoder_unq[index])
            out = sparse.hstack(encoder)
            return out.toarray()
        else:
            return out
Example #43
0
    tok_words = []
    while True:
        try:
            line = file.readline()
            words += line
        except UnicodeDecodeError:
            continue
        if not line:
            break
    file_words.append(words)

print('Creating Feature 3 Vector... \n')

f3_vectorizer = CountVectorizer(stop_words='english')
f3_vectorizer.fit(file_words)
X_count_f3 = f3_vectorizer.transform(articles)
X_train_f3 = X_count_f3.toarray()

print('Feature 3 created\n')
#-----------------------------------------Feature Selection-------------------------------------------------------------------------------------------------------


def feature_creation(X1, X2, X3):
    features = []
    for i in range(len(X1)):
        f_list = list(X1[i])
        f2_list = list(X2[i])
        f3_list = list(X3[i])
        for x in range(len(f2_list)):
            f_list.append(f2_list[x])
        for x in range(len(f3_list)):
Example #44
0
# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

# In[ ]:

vect.get_feature_names()[::2000]

# In[ ]:

len(vect.get_feature_names())

# In[ ]:

# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

# In[ ]:

from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# In[ ]:

from sklearn.metrics import roc_auc_score
Example #45
0
    OHE.fit(data_x[feature].values.reshape(-1, 1))
    train_a = OHE.transform(x_train[feature].values.reshape(-1, 1))
    #valid_a=OHE.transform(x_valid[feature].values.reshape(-1,1))
    test_a = OHE.transform(data_test[feature].values.reshape(-1, 1))
    data_x_train = sparse.hstack((data_x_train, train_a))
    #data_x_valid=sparse.hstack((data_x_valid,valid_a))
    data_x_test = sparse.hstack((data_x_test, test_a))
print 'one_hot finish'

CVec = CountVectorizer(analyzer='word',
                       token_pattern=r'(?u)\b\w+\b',
                       tokenizer=lambda x: x.split(' '))
#CVec=CountVectorizer()
for feature in vector_feature:
    CVec.fit(data_x[feature])
    train_a = CVec.transform(x_train[feature])
    #valid_a=CVec.transform(x_valid[feature])
    test_a = CVec.transform(data_test[feature])
    data_x_train = sparse.hstack((data_x_train, train_a))
    #data_x_valid=sparse.hstack((data_x_valid,valid_a))
    data_x_test = sparse.hstack((data_x_test, test_a))
    df_tmp = pd.DataFrame(CVec.get_feature_names(), columns=['val'])
    #feature important mapping
    df_tmp['feature'] = '%s' % feature
    df_feature_map = pd.concat([df_feature_map, df_tmp])
print ' countvec finish'
save_path = "/home/heqt/tencent/20180506/all/"
df_feature_map.to_csv(save_path + "feature_important_mapping_cut.csv")

sparse.save_npz(save_path + "data_x_train_cut.npz", data_x_train)
x_train.to_csv(save_path + "x_train_cut.csv", index=None)
Example #46
0
    def train_classifier_use_feature_selection(self):

        # Get list of features
        count_vect = CountVectorizer(stop_words=stopwords,
                                     min_df=3,
                                     max_df=0.90,
                                     ngram_range=_ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print("Shape of train data is " + str(X_CV.shape))

        # tfidf transformation###

        tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        #################
        # feature selection
        #################

        selector = SelectPercentile(score_func=_score_func,
                                    percentile=_percentile)

        print("Fitting data with feature selection ...")
        selector.fit(X_tfidf, y_train)

        # get how many features are left after feature selection
        X_features = selector.transform(X_tfidf)

        print("Shape of array after feature selection is " +
              str(X_features.shape))

        clf = SGDClassifier(loss=_loss,
                            penalty=_penalty,
                            alpha=_alpha,
                            n_iter=_n_iter,
                            random_state=42).fit(X_features, y_train)

        # get the features which are selected and write to file

        feature_boolean = selector.get_support(indices=False)

        f = open(path_to_store_feature_selection_boolean_file, 'w')

        for fb in feature_boolean:
            f.write(str(fb) + '\n')

        f.close()

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 X_features,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        ####################
        # test clf on test data
        ####################

        X_test_CV = count_vect.transform(docs_test)

        print("Shape of test data is " + str(X_test_CV.shape))

        X_test_tfidf = tfidf_transformer.transform(X_test_CV)

        # apply feature selection on test data too
        X_test_selector = selector.transform(X_test_tfidf)
        print("Shape of array for test data after feature selection is " +
              str(X_test_selector.shape))

        y_predicted = clf.predict(X_test_selector)

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(X_test_selector, y_test))

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf, count_vect
Example #47
0
targets = y_train
# print len(targets) #241221
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X_train)
# print len(X_train) #241221
classifier.fit(counts, targets)

############################################################
##                Save the prediction results             ##
############################################################
commList_Jieba_marked = []
for i in commList_Jieba:
    commList_Jieba_marked_dict = {}
    examples = [i["comments"]]
    # print i["comments"]
    example_counts = count_vectorizer.transform(examples)
    predictions = classifier.predict(example_counts)
    commList_Jieba_marked_dict["mark"] = predictions.tolist()
    # print predictions
    commList_Jieba_marked_dict["comments"] = [i["comments"]]
    commList_Jieba_marked_dict["hotel"] = [i["hotel"]]
    commList_Jieba_marked_dict["address"] = [i["address"]]
    commList_Jieba_marked_dict["noun"] = i["noun"]
    commList_Jieba_marked.append(commList_Jieba_marked_dict)

# print type(commList_Jieba_marked)
commList_Jieba_marked_json = json.dumps(commList_Jieba_marked,
                                        ensure_ascii=False)
# print type(commList_Jieba_marked_json)
with open("E:/Bayes_All_Comments_final.json", "w") as w:
    w.write(commList_Jieba_marked_json.encode("utf-8"))
Example #48
0
import re, string

re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()


n = train.shape[0]

vec = CountVectorizer(ngram_range=(1, 2), tokenizer=tokenize)

trn_term_doc = vec.fit_transform(train[COMMENT])

test_term_doc = vec.transform(test[COMMENT])
trn_term_doc, test_term_doc


def pr(y_i, y):
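    # Laplace-smoothed probability that each feature appears in a document of class y_i
    # (likely intended for the naive-Bayes log-count ratio features used in NB-SVM-style models)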

    p = x[y == y_i].sum(0)

    return (p + 1) / ((y == y_i).sum() + 1)


x = trn_term_doc.sign()

test_x = test_term_doc.sign()

Example #49
0
    def train_classifier(self):

        # Get list of features
        count_vect = CountVectorizer(stop_words=stopwords,
                                     min_df=3,
                                     max_df=0.90,
                                     ngram_range=_ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print("Shape of train data is " + str(X_CV.shape))

        # tfidf transformation###

        tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        # train the classifier

        print("Fitting data ...")
        clf = SGDClassifier(loss=_loss,
                            penalty=_penalty,
                            alpha=_alpha,
                            n_iter=_n_iter,
                            random_state=42).fit(X_tfidf, y_train)

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 X_tfidf,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        ##################
        # run classifier on test data
        ##################

        X_test_CV = count_vect.transform(docs_test)

        print("Shape of test data is " + str(X_test_CV.shape))

        X_test_tfidf = tfidf_transformer.transform(X_test_CV)

        y_predicted = clf.predict(X_test_tfidf)

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(X_test_tfidf, y_test))

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf, count_vect
Example #50
0
def main_2():
    start = time.time()
    ## Get input arguments
    (options, args) = get_args()

    ## Fetch input data to be classified
    #train_data_df = get_data_df(options, args)
    train_data_df = get_data_df_2(options, args)

    #sys.exit()
    print(train_data_df.shape)
    print(train_data_df.columns)

    # Result holder for all categories
    selected_models_by_category = {}

    # Get unique list of categories with ids
    categories = pandas.unique(
        train_data_df.loc[:, ['category_full_path_mod1', 'category_id_mod1']].
        values)

    # Indicate classifiers to be tested
    clf_dict = {}
    #clf_dict['KNeighbors']          = KNeighborsClassifier()
    #clf_dict['Logistic Regression'] = LogisticRegression(penalty='l2')
    #clf_dict['Decision Tree']       = tree.DecisionTreeClassifier()
    #clf_dict['Random Forest']       = RandomForestClassifier()
    #clf_dict['MultinomialNB']       = MultinomialNB()
    #clf_dict['SVM']                 = svm.SVC(probability=True)
    #clf_dict['AdaBoost']            = AdaBoostClassifier()

    one_class_clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)

    # Start iterating categories through classifiers
    for category in categories:
        print('=' * 100, '\nRun models for following category:\n', category[0],
              '\n', '=' * 100)

        # Get (positives, negatives) for training set
        (positives,
         negatives) = get_positives_negatives(train_data_df, category[0])

        # Create training set
        print('   Vectorizing training data ...')
        ng_range = (1, 1)
        count_vect = CountVectorizer(min_df=1,
                                     ngram_range=ng_range,
                                     binary=False)
        tfidf_vect = TfidfVectorizer(sublinear_tf=True,
                                     max_df=1.0,
                                     ngram_range=ng_range,
                                     stop_words='english')
        (X_train_counts,
         Y), count_vect_fitted = get_vectorized_data(positives,
                                                     negatives,
                                                     count_vect=count_vect,
                                                     tfidf_vect=None)
        X_train_counts_pos = count_vect.fit_transform(
            pandas.concat([positives]))

        print('   Training classifier ...')
        one_class_clf.fit(X_train_counts_pos)
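        # Note: OneClassSVM is fit on the positive class only; its predict()
        # returns +1 for points it treats as inliers and -1 for outliers.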

        print('   Predicting ...')
        X_train_counts = count_vect.transform(
            pandas.concat([positives, negatives]))
        predictions = one_class_clf.predict(X_train_counts)

        print('   Prediction accuracy score: ', accuracy_score(Y, predictions))
        print('\nConfusion matrix:')
        display(
            pandas.crosstab(pandas.Series(Y),
                            predictions,
                            rownames=['True'],
                            colnames=['Predicted'],
                            margins=True))
Example #51
0
    def use_pipeline(self):

        #####################
        # Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent
        #####################

        pipeline = Pipeline([
            ('vect',
             TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)),
            # ('clf', LinearSVC(C=1000)),
            ('clf', SGDClassifier(random_state=42))
        ])

        # Build a grid search to find the best parameter
        # Fit the pipeline on the training set using grid search for the parameters
        parameters = {
            'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
            'vect__use_idf': (True, False),
            'clf__loss': ('hinge', 'log'),
            'clf__penalty': ('l2', 'l1', 'elasticnet'),
            'clf__n_iter': (5, 10),  # interesting: a large n_iter does not perform as well as a small one here
            'clf__alpha': (0.001, 0.0001, 0.0005),
        }

        #################
        # Exhaustive search over specified parameter values for an estimator, use cv to generate data to be used
        # implements the usual estimator API: when “fitting” it on a dataset all the possible combinations of parameter values are evaluated and the best combination is retained.
        #################

        cv = StratifiedShuffleSplit(y_train,
                                    n_iter=5,
                                    test_size=0.2,
                                    random_state=42)
        grid_search = GridSearchCV(pipeline,
                                   param_grid=parameters,
                                   cv=cv,
                                   n_jobs=-1)
        clf_gs = grid_search.fit(docs_train, y_train)

        ###############
        # print the cross-validated scores for the each parameters set explored by the grid search
        ###############

        best_parameters, score, _ = max(clf_gs.grid_scores_,
                                        key=lambda x: x[1])
        for param_name in sorted(parameters.keys()):
            print("%s: %r" % (param_name, best_parameters[param_name]))

        print("Score for gridsearch is %0.2f" % score)

        # y_predicted = clf_gs.predict(docs_test)

        ###############
        # run the classifier again with the best parameters
        # in order to get 'clf' for get_important_feature function!
        ###############

        ngram_range = best_parameters['vect__ngram_range']
        use_idf = best_parameters['vect__use_idf']
        loss = best_parameters['clf__loss']
        penalty = best_parameters['clf__penalty']
        alpha = best_parameters['clf__alpha']

        # vectorisation

        count_vect = CountVectorizer(stop_words=stopwords,
                                     min_df=3,
                                     max_df=0.90,
                                     ngram_range=ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print("Shape of train data is " + str(X_CV.shape))

        # tfidf transformation

        tfidf_transformer = TfidfTransformer(use_idf=use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        # train the classifier

        print("Fitting data with best parameters ...")
        clf = SGDClassifier(loss=loss,
                            penalty=penalty,
                            alpha=alpha,
                            random_state=42).fit(X_tfidf, y_train)

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 X_tfidf,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        ##################
        # run classifier on test data
        ##################

        X_test_CV = count_vect.transform(docs_test)

        X_test_tfidf = tfidf_transformer.transform(X_test_CV)

        y_predicted = clf.predict(X_test_tfidf)

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(X_test_tfidf, y_test))

        # Print and plot the confusion matrix

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        # import matplotlib.pyplot as plt
        # plt.matshow(cm)
        # plt.show()

        return clf, count_vect
Example #52
0
x_train.head()

# In[10]:

y_train = df_train.name  # y_train is the gender label from the data

# In[11]:

print(x_train.shape)  # show the size of the training data
print(y_train.shape)

# ### Build x & y for testing data

# In[12]:

x_test = count_vect.transform(df_test["chat"])
y_test = df_test.name

# In[13]:

print(x_test.shape)  # show the size of the test data
print(y_test.shape)

# ## Training

# In[14]:

clf = BernoulliNB(fit_prior=False)
clf.fit(x_train, y_train)  # train the model with Bernoulli Naive Bayes

# ## Test
# In[13]:
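# The actual test cell is not shown here; a minimal sketch, assuming the x_test
# and y_test built above:
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))  # mean accuracy of the Bernoulli NB model on the test split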


tmp = pd.DataFrame({'y':y_test_str})
tmp['y'].value_counts()


# In[14]:


from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)
X_train


# In[15]:


# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(sentences_train)
X_train_tfidf =  tfidf_vect.transform(sentences_train)
X_test_tfidf =  tfidf_vect.transform(sentences_test)
X_train_tfidf

# # ngram level tf-idf 
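# The snippet is cut off above; a minimal sketch of what an n-gram level tf-idf
# step usually looks like (the ngram_range and variable names here are assumptions):
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                   ngram_range=(2, 3), max_features=5000)
tfidf_vect_ngram.fit(sentences_train)
X_train_tfidf_ngram = tfidf_vect_ngram.transform(sentences_train)
X_test_tfidf_ngram = tfidf_vect_ngram.transform(sentences_test)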
Example #54
0
    return result

#main 
data_list = getTrainingData()
question_list = [x for (x,y) in data_list]
label_list = [y for (x,y) in data_list]

print("\n question_list = ", question_list)
print("\n label_list = ", label_list)

vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(question_list)
print("\n X.toarray()=", X.toarray())
print("\n X.shape[0]=",X.shape[0])

test_v = vectorizer.transform(['it is a test of sales order'])
print("\n test_v.toarray()=", test_v.toarray())

training_set= [ (getContentFeature(question_list[i]), label_list[i]) for i in range(X.shape[0])]
print("\n training_set=", training_set)
    
myClassifier = nltk.SklearnClassifier(SVC()).train(training_set)
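# SklearnClassifier wraps the scikit-learn estimator so it can be trained on
# NLTK-style (feature-dict, label) pairs; the dicts are vectorized internally
# (via a DictVectorizer) before being handed to SVC.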

#Test
while True:
    query = str(input("Input query:")).strip().lower()
    feature = getContentFeature(query)
    #print feature
    print("\n result = ", myClassifier.classify(feature))

Example #55
0
	#print X_train_tfidf.shape
	#print X_train_tfidf[:,324]
	

	new_collection = []
	new_sentiment_collection = []
	for i in range(1001,1100):
		content = news[i]["text"]
		tokens = [j.lower() for j in nltk.word_tokenize(content)]
		stemmed_text = ""
		for token in tokens:
			#print token.encode("utf-8"),
			parseds = morph.parse(token)
			#print parseds[0][2].encode("utf-8")
			stemmed_text = stemmed_text+" "+parseds[0][2]
		#print stemmed_text
		new_collection.append(stemmed_text)
		new_sentiment_collection.append(news[i]["sentiment"])
	
	X_new_counts = count_vect.transform(new_collection)
	X_new_tfidf = tf_transformer.transform(X_new_counts)

	clf = MultinomialNB().fit(X_train_tfidf,sentiment_collection)
	predicted = clf.predict(X_new_tfidf)
	print predicted
	for text,category,actual_category in zip(new_collection,predicted,new_sentiment_collection):
		print category," ",actual_category," ",text[:30]
		
	print confusion_matrix(new_sentiment_collection,predicted,labels=["positive","negative","neutral"])

Example #56
0
def Classification(MalwareCorpus, GoodwareCorpus, Split, Extn):
    '''
    Comparison between different online classifiers
    :param MalwareCorpus: Directory of malicious feature files
    :param GoodwareCorpus: Directory of benign feature files
    :param Split: test set split
    :param Extn: Extension of feature files
    :return:
    '''
    if 'datatxt' in Extn:
        Type = 'Drebin'
    elif 'WL2' in Extn:
        Type = 'WLK'
    elif '_pkg_adicfg_ret_.json.ADG.DirWLWODup' in Extn:
        Type = 'CWLK'
    else:
        Type = 'Other'

    # step 1 - split all samples into training and test sets
    logger.debug("Loading positive and negative samples file basename")

    MalSamples = GetFilesWithExtn(MalwareCorpus, Extn)
    GoodSamples = GetFilesWithExtn(GoodwareCorpus, Extn)[:len(MalSamples)]

    logger.info("All Samples loaded")
    print '# mal samples:', len(MalSamples)
    print '# good samples:', len(GoodSamples)

    TrainMalSamples = MalSamples[:int(len(MalSamples) * (1 - Split))]
    TestMalSamples = MalSamples[int(len(MalSamples) * (1 - Split)):]

    TrainGoodSamples = GoodSamples[:int(len(GoodSamples) * (1 - Split))]
    TestGoodSamples = GoodSamples[int(len(GoodSamples) * (1 - Split)):]

    logger.info("Training and test sets split finished")

    TrainMalLabels = np.ones(len(TrainMalSamples)).tolist()
    TestMalLabels = np.ones(len(TestMalSamples)).tolist()
    TrainGoodLabels = np.empty(len(TrainGoodSamples))
    TrainGoodLabels.fill(-1)
    TrainGoodLabels = TrainGoodLabels.tolist()
    TestGoodLabels = np.ones(len(TestGoodSamples))
    TestGoodLabels.fill(-1)
    TestGoodLabels = TestGoodLabels.tolist()
    logger.info("All labels created")

    TrainSamples = TrainMalSamples + TrainGoodSamples
    TestSamples = TestMalSamples + TestGoodSamples
    TrainLabels = TrainMalLabels + TrainGoodLabels
    TestLabels = TestMalLabels + TestGoodLabels
    NumTestMalSamples = len(TestMalLabels)

    del MalSamples, GoodSamples
    logger.info("All Samples loaded into training and testing sets")
    print "# Train Samples", len(TrainSamples)
    print "# Train Labels", len(TrainLabels)
    print "# Test Samples", len(TestSamples)
    print "# Test Labels", len(TestLabels)

    # step 2 - feature extraction
    TFIDFTransformer = TfidfTransformer()

    NewLineCVetorizer = CountVectorizer(input=u'filename',
                                        lowercase=True,
                                        token_pattern=None,
                                        tokenizer=NewLineTokenizer,
                                        dtype=np.float64)

    print 'performing count vectorizing'
    TrainDocsTermsFVs = NewLineCVetorizer.fit_transform(TrainSamples)
    TestDocsTermsFVs = NewLineCVetorizer.transform(TestSamples)
    print 'performing tf-idf vectorizing'
    TrainFVs = TFIDFTransformer.fit_transform(TrainDocsTermsFVs)
    TestFVs = TFIDFTransformer.transform(TestDocsTermsFVs)

    print 'train term-doc matrix: ', TrainFVs.shape  # rows x cols, rows = docs, cols = features/terms
    print 'test term-doc matrix: ', TestFVs.shape

    # step 3 - classification
    logger.info("Performing Cross Validation")

    EtaList = [0, 0.1, 0.3, 0.5, 0.7, 0.9, 1]
    CWAccuracyList = []

    CList = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    AROWAccuracyList = []

    pool = MyPool(4)

    a = [
        pool.apply_async(GridSearchCV, (
            MCWVarDiag,
            5,
            e,
            len(TrainSamples),
            TrainFVs,
            TrainLabels,
        )) for e in EtaList
    ]
    CWAccuracyList = [res.get() for res in a]
    EtaBest = EtaList[CWAccuracyList.index(max(CWAccuracyList))]
    BestModel_CW = MCWVarDiag(EtaBest, epochs=50)

    a = [
        pool.apply_async(GridSearchCV,
                         args=(
                             ArowDiag,
                             5,
                             c,
                             len(TrainSamples),
                             TrainFVs,
                             TrainLabels,
                         )) for c in CList
    ]
    AROWAccuracyList = [res.get() for res in a]
    CBest = CList[AROWAccuracyList.index(max(AROWAccuracyList))]
    BestModel_AROW = ArowDiag(CBest, n_iters=50)
    pool.close()
    pool.join()

    print 'best model', BestModel_CW, max(CWAccuracyList)
    print 'best model', BestModel_AROW, max(AROWAccuracyList)

    logger.info("Applying Best Model on Testing Set")

    modeldict = {BestModel_CW: 'CW', BestModel_AROW: 'AROW'}
    for Model in [BestModel_CW, BestModel_AROW]:
        T0 = time()
        f = open(modeldict[Model] + '_' + Type + '.txt', 'w')
        f1 = open(modeldict[Model] + '_' + Type + '_Metadata.txt', 'w')

        Model.fit(TrainFVs, TrainLabels)
        PredictedLabels = []
        NewTestLabels = []
        i = 0
        for TestFV, TestLabel in zip(TestFVs, TestLabels):
            #Mal Sample
            if i < NumTestMalSamples:
                TestMalLabel = np.array([TestLabel])
                PredictedLabel = Model.predict(TestFV)
                PredictedLabels.append(float(PredictedLabel))
                NewTestLabels.append(TestLabel)
                if float(PredictedLabel) != TestLabel:
                    try:
                        Model.partial_fit(TestFV, TestLabel)  #update the model
                        logger.info("Model Partially Fitted")
                    except:
                        logger.error("Partially Fitted Failed")
                        pass
                PredictedMalLabel = np.array([float(PredictedLabel)])
                print >> f1, (metrics.classification_report(
                    TestMalLabel,
                    PredictedMalLabel,
                    target_names=['Sample', 'Sample']))
                print >> f1, "Zero-one classification loss:", metrics.zero_one_loss(
                    TestMalLabel, PredictedMalLabel)
                print >> f1, '-' * 100
            #Ben Sample
            if NumTestMalSamples + i < len(TestLabels):
                TestLabel = TestLabels[NumTestMalSamples + i]
                TestFV = TestFVs[NumTestMalSamples + i]
                TestGoodLabel = np.array([TestLabel])
                PredictedLabel2 = Model.predict(TestFV)
                PredictedLabels.append(float(PredictedLabel2))
                NewTestLabels.append(TestLabel)
                if float(PredictedLabel2) != TestLabel:
                    try:
                        Model.partial_fit(TestFVs[NumTestMalSamples + i],
                                          TestLabel)  #update the model
                        logger.info("Model Partially Fitted")
                    except:
                        logger.error("Partially Fitted Failed")
                        pass
                PredictedGoodLabel = np.array([float(PredictedLabel2)])
                print >> f1, (metrics.classification_report(
                    TestGoodLabel,
                    PredictedGoodLabel,
                    target_names=['Sample', 'Sample']))
                print >> f1, "Zero-one classification loss:", metrics.zero_one_loss(
                    TestGoodLabel, PredictedGoodLabel)
                print >> f1, '-' * 100
            i += 1

        if modeldict[Model] == 'CW':
            print >> f, 'Best Eta parameter', EtaBest
        elif modeldict[Model] == 'AROW':
            print >> f, 'Best C parameter', CBest
        print >> f, '-' * 100
        print >> f, '-' * 43 + 'Whole Database' + '-' * 43
        Accuracy = metrics.accuracy_score(PredictedLabels, NewTestLabels)
        print >> f, "Test Set Accuracy = ", Accuracy
        print >> f, 'testing time', time() - T0
        print >> f, (metrics.classification_report(
            NewTestLabels,
            PredictedLabels,
            target_names=['Goodware', 'Malware']))  # raw_input()

        print >> f, 'Classifier Top features'
        print >> f, '-' * 100
        Vocab = NewLineCVetorizer.get_feature_names()
        try:
            FeautureImportances = Model.model["mu"][1.0].toarray()[0][:-1]
        except:
            FeautureImportances = Model.model["mu"].toarray()[0]
        TopFeatureIndices = FeautureImportances.argsort()[-100:][::-1]
        for FIndex in TopFeatureIndices:
            print >> f, Vocab[FIndex], FeautureImportances[FIndex]
        print >> f, '-' * 100

        print >> f, 'before deleting rows TestFVs.shape', TestFVs.shape
        for i in xrange(len(TestSamples)):
            if -1 == TestLabels[i]:
                TestFVss = TestFVs[:i, :]
                break
        print >> f, 'after deleting rows TestFVs.shape', TestFVss.shape

        FeatureImportancesSparseArray = ssp.lil_matrix(
            (TestFVss.shape[1], TestFVss.shape[1]))
        FeatureImportancesSparseArray.setdiag(FeautureImportances)
        AllFVsTimesW = TestFVss * FeatureImportancesSparseArray

        print >> f, '-' * 100
        AvgFV = AllFVsTimesW.mean(axis=0)
        AvgFV = AvgFV.view(dtype=np.float64).reshape(AvgFV.shape[1], -1)
        AvgFV = np.array(AvgFV).reshape(-1, )
        TopRes = AvgFV.argsort()[-100:][::-1]
        print >> f, 'Top Feats of Test Positive Vector * Feature Importance Vector'
        for Sindex in TopRes:
            print >> f, Vocab[Sindex], AvgFV[Sindex]
        print >> f, '-' * 100
#https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/
essay_cols = ["essay0", "essay1", "essay2", "essay3", "essay4", "essay5", "essay6", "essay7", "essay8", "essay9"]

# Copied from the capstone instructions: format the text data by filling NaN values, joining the essays, and replacing non-word characters with spaces.
all_essays = DF[essay_cols].fillna('')
all_essays = all_essays[essay_cols].apply(lambda x: ' '.join(x), axis=1)
all_essays = all_essays.replace(regex=r'[\W]', value=' ').replace(regex=r'[\s_]+', value=' ')

# Word lists I created based on my admittedly rough intuition about what higher- and lower-income people might say
list_lower_half = ['school', 'college', 'student', 'careless', 'sex', 'sexy', 'stupid', 'butt', 'ass', 'asshole', 'youtube', 'youtuber', 'boring', 'bored', 'lame', 'bad', 'video game', 'video games', 'vidya gaems', 'dream', 'dreaming', 'dreamer', 'chill', 'shit', 'annoying', 'mom', 'mother', 'basement', 'unemployed', 'bullshit', 'drinking', 'party', 'stress', 'stressful', 'hate', 'desperate', 'difficult', 'troubled', 'fun', 'sarcastic', 'f*****g', 'f***a', 'fack', 'f****r', 'f***s', 'procrastination']
list_upper_half = ['work', 'working', 'workaholic', 'career', 'confident', 'smart', 'learn', 'learning', 'improve', 'improving', 'bank', 'banking', 'intelligence', 'intelligent', 'manager', 'competitive', 'competition', 'corp', 'corporation', 'house', 'adventure', 'adventurer', 'solve', 'solving', 'geek', 'nerd', 'optimistic', 'restaurant', 'restaurants', 'travel', 'travelling', 'employed', 'teach', 'teaching', 'honest', 'children', 'help', 'exploring', 'parent', 'son', 'daughter']

vectorizer = CountVectorizer()

vectorizer.fit(list_lower_half)
word_array = vectorizer.transform(all_essays).toarray()
counts_lower = pd.DataFrame(word_array, columns=vectorizer.get_feature_names()).sum(axis=1)
print(counts_lower)

vectorizer.fit(list_upper_half)
word_array = vectorizer.transform(all_essays).toarray()
counts_higher = pd.DataFrame(word_array, columns=vectorizer.get_feature_names()).sum(axis=1)
print(counts_higher)
#https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
#Income vs Type of Word
DF2 = pd.DataFrame()
DF2['income2'] = DF['income']
DF2['income2'] = DF2['income2'].where(DF2['income2'] != -1)  # treat -1 (income not reported) as missing so dropna() removes those rows
DF2['lower_income_words'] = counts_lower
DF2['higher_income_words'] = counts_higher
DF2 = DF2.dropna()
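# A possible follow-up (not part of the original snippet): check how the two
# word-count features relate to income before modelling anything.
print(DF2.corr())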
test_case = load_files('reuter2/training')

count_vect = CountVectorizer(decode_error='ignore', strip_accents='unicode')
X_train_counts = count_vect.fit_transform(test_case.data)
X_train_counts.shape

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

clf = MultinomialNB().fit(X_train_tfidf, test_case.target)
docs_new = [
    'I like bees',
    'Construction of a unique downtown highrise that would provide both living and working space to local artists is still at least a year away from starting, project organizers say.'
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, test_case.target_names[category]))

text_clf = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__max_df': (0.5,),
Example #59
0
one_hot = pd.get_dummies(data["Polarity_Rating"])
data.drop(['Polarity_Rating'], axis=1, inplace=True)
df_new = pd.concat([data, one_hot], axis=1)
df_new.head()

X = df_new['Tweet'].values
y = df_new.drop('Tweet', axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42)

vect = CountVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
X_train = X_train.toarray()
X_test = X_test.toarray()
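# Note: .toarray() densifies the sparse tf-idf matrices so they can be fed to
# the Dense layers below; with a large vocabulary this can be very memory-hungry.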

model = Sequential()

model.add(Dense(units=12673, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(units=4000, activation='relu'))
model.add(Dropout(0.5))
Example #60
0
print(len(twenty_test.data))
print(twenty_train.target_names)
print("\n".join(twenty_train.data[0].split("\n")))
print(twenty_train.target[0])
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(twenty_train.data)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)
X_train_tfidf.shape
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
mod = MultinomialNB()
mod.fit(X_train_tfidf, twenty_train.target)
X_test_tf = count_vect.transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
predicted = mod.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(twenty_test.target, predicted))
print(
    classification_report(twenty_test.target,
                          predicted,
                          target_names=twenty_test.target_names))
print("confusion matrix is \n", confusion_matrix(twenty_test.target,
                                                 predicted))
"""
Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
2257
1502
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']