Example #1
import re
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
''' The following definition builds character n-grams: the input string is split into overlapping n-character tuples. '''


def ngrams_m(text, n=3):
    # Remove all punctuation except '#', then slide a window of n characters over the text.
    remove = string.punctuation
    remove = remove.replace("#", "")
    pattern = r"[{}]".format(remove)
    text = re.sub(pattern, r'', text)
    ngrams = zip(*[text[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]
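
# A quick check of the analyzer above (output computed by hand for illustration):
#   ngrams_m("ab#cd!")  ->  ['ab#', 'b#c', '#cd']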


count_vect = CountVectorizer(analyzer=ngrams_m)
tfidf_transformer = TfidfTransformer()
le = LabelEncoder()
''' Cosine-similarity helper: returns the percentage similarity between text1 and each text in text2. '''


def cosinesim(text1, text2):
    # Vectorize each source/target pair with the character n-gram CountVectorizer and TF-IDF weighting.
    if isinstance(text2, list):
        cos = dict()
        for i in range(len(text2)):
            Targettxt = text2[i].lower()
            Sourcetxt = text1.lower()
            vect1 = [Sourcetxt, Targettxt]
            vect2 = count_vect.fit_transform(vect1)
            tfidf = tfidf_transformer.fit_transform(vect2)
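            # The excerpt stops here. A minimal sketch of how the loop might finish, assuming the
            # goal is a percentage similarity score per target text (not the original author's code):
            from sklearn.metrics.pairwise import cosine_similarity
            cos[i] = round(float(cosine_similarity(tfidf[0], tfidf[1])[0, 0]) * 100, 2)
        return cos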
Example #2
if __name__ == "__main__":

    #Load Dataset
    print("Loading dataset...")
    t0 = time()
    filename = '/home/acj03778/Desktop/Publication/Datasets/824.csv'
    dataset = loadCsv(filename)

    #Load Content
    content = separate_each_column(dataset)
    print("done in %0.3fs." % (time() - t0))

    #Tf Feature Extraction
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.98,
                                    min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    t0 = time()
    tf_content = tf_vectorizer.fit_transform(content)
    X = tf_content.toarray()

    print("done in %0.3fs." % (time() - t0))
    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))

    #LDA Model Definition (the older ``n_topics`` argument was renamed ``n_components`` in scikit-learn)
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
import re

import nltk.corpus
import pandas as pd
from nltk.stem.porter import PorterStemmer

# nltk.download('stopwords')  # run once if the NLTK stopword corpus is not yet installed

dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)  # quoting=3 ignores double quotes

corpus=[]
for i in range(0,len(dataset)):
    review=re.sub('[^a-zA-Z]',' ',dataset['Review'][i])
    review=review.lower()
    review=review.split() 
    ps=PorterStemmer() 
    review=[ps.stem(word) for word in review if not word in nltk.corpus.stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)  # keep only the 1500 most frequent terms
X=cv.fit_transform(corpus).toarray()
y=dataset.iloc[:,1].values



# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
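
# A quick way to evaluate the fitted model (an illustrative sketch, not part of the original snippet):
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))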
dataset = all_lines
data_samples = dataset[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting features for NMF")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                   min_df=2,
                                   max_features=n_features)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print(tfidf)
print("done in %0.3fs." % (time() - t0))

print("Extracting features for LDA")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fitting the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d" % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
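
# The excerpt ends before the topics are printed. A common way to display them (a sketch; n_top_words
# is assumed to be defined elsewhere, as in the other topic-model examples in this collection):
for topic_idx, topic in enumerate(nmf.components_):
    top_words = [tfidf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print("Topic #%d: %s" % (topic_idx, " ".join(top_words)))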
Example #5
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 5))

plt.xlabel("Words (Log)")
plt.ylabel("Frequency (Log)")
plt.title("Word Frequency Distribution (Negative)")
plt.plot(x_val, y_final)
plt.show()

# In[26]:

from sklearn.feature_extraction.text import CountVectorizer

s1 = "Senate panel moving ahead with Mueller bill despite McConnell opposition"
s2 = "Bill protecting Robert Mueller to get vote despite McConnell opposition"

vect = CountVectorizer(binary=True)
X = vect.fit_transform([s1, s2])

X.toarray()

# In[27]:

list(zip(X.toarray()[0], vect.get_feature_names()))

# In[28]:

list(zip(X.toarray()[1], vect.get_feature_names()))

# In[29]:

vect = CountVectorizer(max_features=1000, binary=True)
# Generate a list of all combinations of categories, up to a max length
category_subsets = []
max_classes = 5
for L in range(1, max_classes + 1):
    for subset in itertools.combinations(categories, L):
        category_subsets.append(subset)
# Now make a look-up table for the index corresponding to a tuple of categories
subset_index = {}
for i, category_subset in enumerate(category_subsets):
    subset_index[category_subset] = i

if do_train_coarse:
    # Coarse classifier
    coarse_classifier = Pipeline([
            ('features', CountVectorizer(ngram_range=(1,2))),
            ('classifier', ExtraTreesClassifier(max_depth=150, random_state=88,
                               n_estimators=200, n_jobs=cpu_count()-1)),
    ])
    # Fit coarse classifier
    print('Fitting coarse classifier')
    coarse_classifier.fit(train.question, train.coarse_label)


if do_train_fine:
    # Fine classifiers
    fine_classifiers = []
    for _ in range(len(category_subsets)):
        fine_classifier = Pipeline([
                ('features', CountVectorizer(ngram_range=(1,2))),
                ('classifier', ExtraTreesClassifier(max_depth=150, random_state=88*2,
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline



newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target




text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100)),
                     ])


text_clf.fit(X_train, y_train)




predicted = text_clf.predict(X_test)



Example #8
def test_non_unique_vocab():
    vocab = ['a', 'b', 'c', 'a', 'a']
    vect = CountVectorizer(vocabulary=vocab)
    assert_raises(ValueError, vect.fit, [])
def main():
    p = argparse.ArgumentParser()

    p.add_argument(
        '--train',
        help=
        'Name of training partition. "train" by default. This should be the name of a directory '
        'in "../data/essays/" as well as "../data/features/"',
        default='train')
    p.add_argument(
        '--test',
        help=
        'Name of the testing partition. "dev" by default. This should be the name of a directory '
        'in "../data/essays/" as well as "../data/features/"',
        default='dev')
    p.add_argument(
        '--preprocessor',
        help=
        'Name of directory with processed essay files. "tokenized" by default.',
        default='tokenized')
    p.add_argument(
        '--training_features',
        help=
        'Path to file containing precomputed training features. None by default. '
        'Should be located in ../data/features/<train_partition_name>/')
    p.add_argument(
        '--test_features',
        help=
        'Path to file containing precomputed test features. None by default. '
        'Should be located in ../data/features/<test_partition_name>/')
    p.add_argument(
        '--feature_outfile_name',
        help=
        'Custom name, if desired, for output feature files to be written to '
        '../data/features/essays/<train_partition_name>/ and '
        '../data/features/essays/<test_partition_name>/. '
        'If none provided, feature files will be named using the date and time. '
        'If precomputed feature files are provided, this argument will be ignored.'
    )
    p.add_argument(
        '--predictions_outfile_name',
        help=
        'Custom name, if desired, for predictions file to be written to ../predictions/essays/. '
        'If none provided, the predictions file will be named using the date and time.'
    )
    args = p.parse_args()
    train_partition_name = args.train
    test_partition_name = args.test
    preprocessor = args.preprocessor
    feature_file_train = args.training_features
    feature_file_test = args.test_features
    feature_outfile_name = args.feature_outfile_name
    predictions_outfile_name = args.predictions_outfile_name

    #
    # Define Vectorizer and Transformer
    #
    vectorizer = CountVectorizer(input="filename")
    transformer = Normalizer()  # Normalize frequencies to unit length

    #
    # Load the training and test features and labels
    #
    training_and_test_data = get_features_and_labels(train_partition_name,
                                                     test_partition_name,
                                                     feature_file_train,
                                                     feature_file_test,
                                                     baseline=BASELINE,
                                                     preprocessor=preprocessor,
                                                     vectorizer=vectorizer,
                                                     transformer=transformer)

    train_matrix, encoded_train_labels, original_training_labels = training_and_test_data[0]
    test_matrix, encoded_test_labels, original_test_labels = training_and_test_data[1]

    #
    # Write features to feature files if they are new
    #
    if not (feature_file_train and feature_file_test):
        write_feature_files(train_partition_name, feature_outfile_name,
                            BASELINE, train_matrix, encoded_train_labels)
        write_feature_files(test_partition_name, feature_outfile_name,
                            BASELINE, test_matrix, encoded_test_labels)

    #
    # Run the classifier
    #
    clf = LinearSVC()
    print("Training the classifier...")
    clf.fit(train_matrix, encoded_train_labels)  # Linear kernel SVM
    predicted = clf.predict(test_matrix)

    #
    # Write predictions and display report
    #
    write_predictions_file(predicted, test_partition_name,
                           predictions_outfile_name, BASELINE)
    display_classification_results(encoded_test_labels, predicted)
Example #10
def test_fit_countvectorizer_twice():
    cv = CountVectorizer()
    X1 = cv.fit_transform(ALL_FOOD_DOCS[:5])
    X2 = cv.fit_transform(ALL_FOOD_DOCS[5:])
    assert_not_equal(X1.shape[1], X2.shape[1])
Example #11
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert_equal(counts_test[0, vocabulary["salad"]], 1)
        assert_equal(counts_test[0, vocabulary["tomato"]], 1)
        assert_equal(counts_test[0, vocabulary["water"]], 1)

        # stop word from the fixed list
        assert_false("the" in vocabulary)

        # stop word found automatically by the vectorizer DF thresholding:
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction
        # artifacts)
        assert_false("copyright" in vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, vocabulary["coke"]], 0)
        assert_equal(counts_test[0, vocabulary["burger"]], 0)
        assert_equal(counts_test[0, vocabulary["beer"]], 0)
        assert_equal(counts_test[0, vocabulary["pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert_equal(len(t1.idf_), len(v1.vocabulary_))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert_equal(t2.idf_, None)

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    assert_raises(ValueError, t3.transform, counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5], [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3], [1, 3]]
    assert_raises(ValueError, t3.transform, X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert_false(tv.fixed_vocabulary_)
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    assert_equal(v3.build_preprocessor(), strip_accents_ascii)

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    assert_raises(ValueError, v3.build_preprocessor)

    # error with bad analyzer type
    v3.set_params(analyzer='_invalid_analyzer_type_')
    assert_raises(ValueError, v3.build_analyzer)
Example #12
def test_countvectorizer_custom_vocabulary_gap_index():
    vocab = {"pizza": 1, "beer": 2}
    try:
        CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert_in("doesn't contain index", str(e).lower())
Example #13
def test_countvectorizer_custom_vocabulary_repeated_indeces():
    vocab = {"pizza": 0, "beer": 0}
    try:
        CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert_in("vocabulary contains repeated indices", str(e).lower())
def baseline_classify():
    sentiment_corpus, sentiment_labels = get_sentiment_corpus_and_labels()
    sentiment_corpus = np.array(sentiment_corpus)
    sentiment_labels = np.array(sentiment_labels)
    X_train, X_test, Y_train, Y_test = train_test_split(sentiment_corpus,
                                                        sentiment_labels,
                                                        test_size=0.3,
                                                        random_state=10)
    #print(X_test.shape) (900,)
    #print(X_train.shape) (2100,)
    # this section adapted from https://towardsdatascience.com/sentiment-classification-with-logistic-regression-analyzing-yelp-reviews-3981678c3b44
    if args.kernel == 'count':
        cv = CountVectorizer(
            binary=True, analyzer='word', min_df=10, max_df=0.95
        )  # binary count matrix; drops words that appear in fewer than 10 docs or in more than 95% of docs
    elif args.kernel == 'tfidf':
        cv = TfidfVectorizer()
    cv.fit_transform(X_train)  # returns document-term matrix
    pickle.dump(cv, open("tfidf.pickle", "wb"))
    train_feature_set = cv.transform(
        X_train
    )  # (2100, 4135) document-term matrix: rows are documents, columns are terms
    test_feature_set = cv.transform(
        X_test)  # (900, 4135) type is scipy.sparse.csr.csr_matrix
    # build the appropriate model
    if args.model == "logistic":
        lr = LogisticRegression(
            solver='liblinear', random_state=42,
            max_iter=1000)  # a simple logistic-regression baseline classifier
        lr.fit(train_feature_set, Y_train)  # fit model
        y_pred = lr.predict(test_feature_set)  # make predictions
        print("Accuracy: ", round(accuracy_score(Y_test, y_pred),
                                  7))  # get accuracy to 7 decimal places
        report = classification_report(Y_test, y_pred, output_dict=True)
        print('positive: ', report['1'])
        print('negative: ', report['0'])

        # set hyperparameters tuning grid
        param_grid = {
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
            'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
            'C': [0.25, 0.5, 1.0],
            'max_iter': [i for i in range(10, 5000, 20)]
        }
        model = LogisticRegression()

    elif args.model == "svm":
        # Perform classification with SVM, kernel=linear
        classifier_linear = svm.SVC(kernel='linear')
        classifier_linear.fit(train_feature_set, Y_train)
        y_pred = classifier_linear.predict(test_feature_set)
        print("Accuracy: ", round(accuracy_score(Y_test, y_pred), 7))
        report = classification_report(Y_test, y_pred, output_dict=True)
        print('positive: ', report['1'])
        print('negative: ', report['0'])

        # set hyperparameters tuning grid
        param_grid = {
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'gamma': ['auto', 1e-3, 100, 10, 'scale'],
            'tol': [0.0001, 0.001, 0.01],
            'C': [0.1, 1, 10, 100],
            'degree': [i for i in range(1, 500, 2)],
            'max_iter': [i for i in range(10, 5000, 20)],
            'probability': [True]
        }
        model = svm.SVC()

    else:
        rf = RandomForestClassifier(max_depth=2, random_state=0)
        rf.fit(train_feature_set, Y_train)
        y_pred = rf.predict(test_feature_set)
        print("Accuracy: ", round(accuracy_score(Y_test, y_pred), 7))
        report = classification_report(Y_test, y_pred, output_dict=True)
        print('positive: ', report['1'])
        print('negative: ', report['0'])

        # set hyperparameters tuning grid
        param_grid = {
            'criterion': ['gini', 'entropy'],
            'n_estimators': [10, 100, 1000],
            'max_depth': [50, 100, 200, None],
            'min_samples_leaf': [2, 5]
        }
        model = RandomForestClassifier()

    # Perform a hyperparameter search using training data
    hp_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=param_grid,
                                   n_iter=50,
                                   random_state=12345,
                                   cv=5)
    # hp_search = BaggingClassifier(base_estimator=hp_search, n_estimators=10, random_state=0)
    #hp_search = GridSearchCV(estimator=model, param_grid=param_grid)
    hp_search = hp_search.fit(train_feature_set, Y_train)
    predictions = hp_search.predict(test_feature_set)
    # save model
    if args.model == "svm":
        dump(hp_search, 'models/svm.joblib')
    elif args.model == "rf":
        dump(hp_search, 'models/rf.joblib')
    #print('tuned parameters: {}'.format(hp_search.best_params_))
    print('tuned parameters: {}'.format(hp_search.get_params()))
    #print('best score is {}'.format(hp_search.best_score_))
    print('best score is {}'.format(hp_search.score(test_feature_set, Y_test)))

    # np.savetxt(args.predictions_file, predictions)

    with open(args.predictions_file, 'w+') as w:
        w.write('sentence' + '\t' + 'prediction' + '\t' + 'true label' + '\n')
        for i in range(len(predictions)):
            if predictions[i] != Y_test[i]:
                w.write(X_test[i] + '\t')
                w.write(predictions[i] + '\t')
                w.write(Y_test[i] + '\n')

    return hp_search
Example #15
def suitabledata(group):
    df1 = df.get_group(group)
    df1 = df1.drop_duplicates()
    return df1


#def bagofwords(group):

df1 = pd.read_csv("log.csv")
df = pd.read_csv('log1.csv')
#df = df.drop_duplicates()
#np1= np.array()
#for i in set(df['JD'].values):
#     if(i!= 'JD'):
#       a = suitabledata(i)
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(df1['Qualifications'].values)
lis = list()
classifier = MultinomialNB()
targets = df1['JD'].values
classifier.fit(counts, targets)

a = pdfcontent(
    "/home/shubhi/09ac1673-6c31-43ab-891c-d8fa1b736c67-170105194520.pdf")
lis.append(a)
a1 = pdfcontent("/home/shubhi/Shubham's Resume (13).pdf")
lis.append(a1)
b = checkJD(lis)
print(b)
#np1=np.array(b)
#np1 = np1.reshape(2,1)
def feature_extraction(X_train):
    vectorizer = CountVectorizer(ngram_range=(1, 1),max_features=10000)
    vectorizer.fit_transform(X_train)
    return vectorizer
Example #17
    def get(self, Query):
        mg = MagicGoogle()
        urls = []
        search = str(Query + ' language:english file:html')
        print(search)
        for url in mg.search_url(query=search):
            urls.append(str(url))
        tel = len(urls)
        pool = ThreadPool(tel)
        result = pool.map(get_web_data, urls)
        df1 = pd.DataFrame(result)
        df1 = df1[df1['tekst'].notnull()]
        print(len(df1.index))
        tekst = df1.tekst.values.tolist()
        df1.drop(['tekst'], axis=1, inplace=True)
        #cat = df1['label'].values
        aantal = len(tekst)
        #print(aantal)
        #print(aantal)

        n_samples = 5000
        n_features = 2000
        n_components = aantal
        n_top_words = 5

        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        max_features=n_features,
                                        stop_words='english')
        tf = tf_vectorizer.fit_transform(tekst)
        tf_feature_names = tf_vectorizer.get_feature_names()
        lda = LatentDirichletAllocation(n_components=n_components,
                                        max_iter=30,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)

        lda.fit(tf)
        ozzy = []

        def print_top_words(model, feature_names, n_top_words):
            # ozzy = []
            for topic_idx, topic in enumerate(model.components_):
                oz = (" ".join([
                    feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]
                ]))
                ozzy.append(oz)

        print_top_words(lda, tf_feature_names, n_top_words)
        df1['topic'] = ozzy

        true_k = int(aantal * 0.3)
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    max_iter=100,
                    n_init=1)
        km.fit(tf)
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = tf_vectorizer.get_feature_names()
        jk = []
        for i in range(true_k):
            j = []
            jk.append(j)
            for ind in order_centroids[i, :7]:
                za = str(' %s' % terms[ind])
                j.append(za)

        cols = {'clusters': jk}
        df2 = pd.DataFrame.from_dict(cols)
        df2['clusters'] = df2['clusters'].astype(str).str.replace(
            r"[\[\]']", '', regex=True)  # strip list brackets and quotes from the terms
        df2.insert(0, 'clusterid', range(0, 0 + len(df2)))

        labels = km.labels_
        df1['clusterid'] = labels
        dfs = pd.merge(df1, df2)
        dfs.drop(['clusterid'], axis=1, inplace=True)
        multiS = dfs.to_dict()
        return multiS
        # from sklearn.naive_bayes import MultinomialNB
        # # clf = MultinomialNB().fit(tf, cat)
        # from sklearn.externals import joblib
        # # joblib.dump(clf, 'filename.pkl')
        # clf = joblib.load('filename.pkl')
        # # ttf = tf
        # cats = clf.predict(tf)
        # # # acc=np.mean(predicted == cat)
        # cm = metrics.confusion_matrix(cat, predicted)
        # sim = cosine_similarity(tf)
        '''Fetch a multi given its Query identifier.'''
        api.abort(404)
Example #18
        d1_training_X.append(row[1])
        d1_training_Y.append(row[2])

for file in d2_files:
    dataframe = pd.read_csv(file, header=0)
    for index, row in dataframe.iterrows():
        d2_training_X.append(row[1])
        d2_training_Y.append(row[4])

for file in sentiment_files:
    dataframe = pd.read_csv(file, header=0)
    for index, row in dataframe.iterrows():
        sentiment_X.append(row[5])
        sentiment_Y.append(row[0])

tweet_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()), ('clf', svm.LinearSVC())])

sentiment_clf = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', svm.LinearSVC())])

tweet_clf_extra = Pipeline([('vect', CountVectorizer()),
                            ('tfidf', TfidfTransformer()),
                            ('clf', svm.LinearSVC())])

tweet_clf.fit(d1_training_X, d1_training_Y)
tweet_clf_extra.fit(d2_training_X, d2_training_Y)
sentiment_clf.fit(sentiment_X, sentiment_Y)

Example #19
test_questions = []
with open('labeler_sample.in', 'r') as f:
    first_line = f.readline().strip().split(" ")
    num_training, num_testing = int(first_line[0]), int(first_line[1])
    for index, line in enumerate(f):
        if index < num_training * 2:
            if index % 2 == 0:
                topics.append(list(map(int, line.strip().split(" "))))
            else:
                questions.append(line.strip())
        else:
            test_questions.append(line.strip())

mlb = MultiLabelBinarizer()
topics = mlb.fit_transform(topics)
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()


def top_10_elements_helper(array):
    ind = np.argpartition(array, -10)[-10:]
    return ind[np.argsort(array[ind])]


parameters = {
    "C": [0.5, 1., 2., 3., 4., 5.],
    "class_weight": ['auto', 'balanced'],
    "k": [500, 750, 1000, 1500, 2000, 2500]
}

scores = []
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

train_df['features'] = train_df['features'].apply(lambda x: x.lower())
train_df['features'] = train_df['features'].apply(lambda x: feature_map_func(x))
print(train_df["features"].head())

test_df['features'] = test_df['features'].apply(lambda x: x.lower())
test_df['features'] = test_df['features'].apply(lambda x: feature_map_func(x))
print(test_df["features"].head())

train_df["meanColorSpace"].head()
train_df = train_df.fillna(0)   
test_df = test_df.fillna(0)   

tfidf = CountVectorizer(stop_words='english', max_features=100)  # note: a plain CountVectorizer despite the name
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

print(tr_sparse.shape)
print(te_sparse.shape)

#Stfidf = CountVectorizer(stop_words='english', max_features=200)
#tr_Ssparse = Stfidf.fit_transform(train_df[ "street_address"])
#te_Ssparse = Stfidf.transform(test_df[ "street_address"])
#
#print(tr_Ssparse.shape)
#print(te_Ssparse.shape)

#train_X = sparse.hstack([train_df[features_to_use], tr_sparse, tr_Ssparse]).tocsr()
#test_X = sparse.hstack([test_df[features_to_use], te_sparse,te_Ssparse]).tocsr()
Example #21

def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(
        x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])


# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

meta = metadata.head(10000)

# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(meta['soup'])

# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Reset index of your main DataFrame and construct reverse mapping as before
meta = meta.reset_index()
indices = pd.Series(meta.index, index=meta['title'])

title = ''


def get_recommendations(title, cosine_sim=cosine_sim2):
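    # The function body is cut off in this excerpt. A typical implementation for this kind of
    # recommender (a sketch, not the original author's code):
    idx = indices[title]                                   # row index for the requested title
    sim_scores = list(enumerate(cosine_sim[idx]))          # similarity of this movie to all others
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]  # top 10, skipping itself
    movie_indices = [i for i, _ in sim_scores]
    return meta['title'].iloc[movie_indices]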
Example #22
 def __init__(self, stopwords=None):
     self.stopwords = stopwords
     self.vectorizer = CountVectorizer()
     self.transformer = TfidfTransformer()
Example #23
from sklearn.datasets import fetch_20newsgroups

Training_data = fetch_20newsgroups(subset='train', shuffle=True)
Training_data.target_names

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

svm_classification = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm',
     SGDClassifier(loss='hinge',
                   penalty='l2',
                   alpha=1e-3,
                   max_iter=5,  # was ``n_iter`` in older scikit-learn releases
                   random_state=42)),
])
svm_classification = svm_classification.fit(Training_data.data,
                                            Training_data.target)

import numpy as np

Testing_data = fetch_20newsgroups(subset='test', shuffle=True)
svm_prediction = svm_classification.predict(Testing_data.data)
print("Accuracy of Support Vector Machine in percentage :",
      np.mean(svm_prediction == Testing_data.target) * 100)
len(clean_reviews_final)
# Transforming Reviews into DOCUMENT-TERM-MATRIX using CountVectorizer.

clean_reviews_series = clean_reviews_final.without_stopwords  #vectorizer needs a series object.

## N-gram Analysis
# Note: CountVectorizer can handle n-grams via the 'ngram_range' argument.
"""max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:
max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
max_df = 25 means "ignore terms that appear in more than 25 documents"."""
"""min_df is used for removing terms that appear too infrequently. For example:
min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
min_df = 5 means "ignore terms that appear in less than 5 documents"."""
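
# A tiny illustration of the two thresholds on a made-up corpus (illustrative only):
#   docs = ["good food", "good service", "good value", "good price"]
#   CountVectorizer(max_df=0.75).fit(docs).vocabulary_  # drops "good" (in 100% > 75% of the docs)
#   CountVectorizer(min_df=2).fit(docs).vocabulary_     # keeps only "good" (the other terms appear in 1 doc)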

vectorizerng = CountVectorizer(
    ngram_range=(1, 2),
    min_df=0.01)  #  one and two grams( i.e unigrams and bigrams).

document_term_matrix_ng = vectorizerng.fit_transform(
    clean_reviews_series)  # DOCUMENT-TERM-MATRIX, page 15 in LMS

document_term_matrix_ng = pd.DataFrame(
    document_term_matrix_ng.toarray(),
    columns=vectorizerng.get_feature_names())  # DTM to Dataframe.

document_term_matrix_ng.shape

document_term_matrix_ng.head(10)

#word cloud frequencies of words
words = dict(document_term_matrix_ng.apply(
    for ingr in item:
        ingr = ingr.lower()  # lower() returns a new string; the result must be reassigned
        ingr = re.sub("[^a-zA-Z]", " ", ingr)
        ingr = re.sub((r'\b(oz|ounc|ounce|pound|lb|inch|inches|kg|to)\b'), ' ',
                      ingr)
        newitem.append(ingr)
    featurs_test_processed.append(newitem)

# Binary representation of the training set will be employed

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 1),  # unigrams only
    binary=True,  # record presence/absence rather than raw counts
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_df=0.99)  # discard terms that appear in more than 99% of documents
train_X = vectorizer.fit_transform([str(i) for i in featurs_processed])
test_X = vectorizer.transform([str(i) for i in featurs_test_processed])

target_val = train_data['cuisine']

from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
train_Y = lb.fit_transform(target_val)

from sklearn.model_selection import train_test_split
Example #26
import pickle

import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import CountVectorizer

corpus = pd.read_csv(r"C:\temp\training.csv", encoding='latin-1')
corpus['text'] = [entry.lower() for entry in corpus['text']]

#instantiate CountVectorizer()

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(corpus['text'], corpus['label'], test_size=0.10)

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)  # reuse the encoder fitted on the training labels
fi=open( 'c:\\temp\\vocabulary.pkl','rb')
vocabulary=pickle.load(fi)

count_vect=CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', vocabulary = vocabulary)
print(count_vect.vocabulary)
xvalid_count =  count_vect.transform(valid_x)


fi2=open( 'c:\\temp\\tfidfvocabulary.pkl','rb')
tfidf_vect=pickle.load(fi2) 

#tfidf_vect.vocabulary_=pll
#print(vocabulary_)

filename = 'c:\\temp\\svm_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))
#tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', vocabulary =vocabulary, max_features=5000)

print(tfidf_vect.vocabulary_)
Example #27
def get_count_feat(sents, max_features=2000):
    stop_words = get_stop_words()
    ct = CountVectorizer(max_features=max_features, stop_words=stop_words, binary=False)
    feat = ct.fit_transform(sents)
    feat = feat.toarray()
    return feat
import string

from nltk.corpus import stopwords


def text_process(mess):
    # Strip punctuation from the message (used below as the CountVectorizer analyzer).
    no_punctuation = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    no_punctuation = ''.join(no_punctuation)

    # Now just remove any stopwords
    return [word for word in no_punctuation.split() if word.lower() not in stopwords.words('english')]

# creating character count feature as "char_count"
sms_dataset['char_count'] = sms_dataset.texts.apply(len)
#sms_dataset['texts'] = sms_dataset['texts'].apply(text_process)

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
# Might take awhile...
bow_transformer = CountVectorizer(analyzer=text_process).fit(sms_dataset['texts'])
# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

# Let's take one text message and get its bag-of-words counts as a vector, putting to use our new bow_transformer:
text_4 = sms_dataset['texts'][3]
print('Text at index 3: {}'.format(text_4))

# Now let's see its vector representation:
bow_4 = bow_transformer.transform([text_4])
print(bow_4)
print(bow_4.shape)

'''
The output above shows that there are seven unique words in message number 4 (after removing common
stop words). Two of them appear twice, the rest only once.
'''
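
# To see which words those counts correspond to, the non-zero column indices can be mapped back to
# vocabulary terms (a small sketch using the bow_transformer fitted above):
feature_names = bow_transformer.get_feature_names()
for idx, count in zip(bow_4.indices, bow_4.data):
    print(feature_names[idx], count)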
corpus = []  # empty list that will hold all 1000 cleaned reviews
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [
        ps.stem(word) for word in review
        if not word in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model: one column per unique word, so rows correspond to reviews and columns to distinct words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

# Fitting classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10,
                                    criterion='entropy',
                                    random_state=0)
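
# The excerpt stops before the forest is trained; fitting and predicting would follow the usual
# pattern (a sketch, not part of the original snippet):
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)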
# In[6]:


X_train, X_test, y_train, y_test = train_test_split(dftext['text'], y, test_size=0.33, random_state=53)


# # building vectorizer classifiers
# 

# In[7]:


# Initialize the `count_vectorizer`   
# ngram_range=(1, 2) means include 1-grams and 2-grams  
count_vectorizer = CountVectorizer(stop_words='english',lowercase=True,ngram_range=(1, 2)) 

# Fit and transform the training data 
count_train = count_vectorizer.fit_transform(X_train) 

# Transform the test set 
count_test = count_vectorizer.transform(X_test)


# In[8]:


# Initialize the `tfidf_vectorizer`
# The parameter use_idf=True enables inverse-document-frequency reweighting by taking the log of the ratio of the
# total number of documents to the number of documents containing the term. smooth_idf=True adds 1 to document
# frequencies to avoid division by zero, so term weights can be compared across documents.
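
# The excerpt ends before the vectorizer itself is created. A minimal sketch of the step the comment
# describes (parameter values here are illustrative assumptions, not the original author's choices):
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)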