Example #1
def recommend(search_word):
    # `pre_process` and `vectorizer` come from the surrounding module;
    # `vectorizer` is used here like sklearn's TfidfVectorizer.
    movie_df = pre_process()

    tfv = vectorizer(min_df=3,
                     max_features=None,
                     strip_accents='unicode',
                     analyzer='word',
                     token_pattern=r'\w{1,}',
                     ngram_range=(1, 3),
                     stop_words='english')
    # Vectorize the bag-of-words column, then score every movie pair
    # with a sigmoid kernel (sklearn.metrics.pairwise.sigmoid_kernel).
    tfv_matrix = tfv.fit_transform(movie_df['bow'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    # Map titles to row positions for lookups by name.
    index = pd.Series(movie_df.index,
                      index=movie_df['original_title']).drop_duplicates()

    try:
        idx = index[search_word]
        # Rank all movies by similarity to the query, skip the query
        # itself (position 0), and keep the next 14 matches.
        sig_scores = sorted(enumerate(sig[idx]),
                            key=lambda x: x[1], reverse=True)
        sig_scores = sig_scores[1:15]
        movie_indices = [i[0] for i in sig_scores]

        return list(movie_df['original_title'].iloc[movie_indices])
    except KeyError:
        # Unknown title: nothing to recommend.
        return None
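
A minimal call sketch, assuming `pre_process()` returns a DataFrame with 'bow' and 'original_title' columns; the query title is a placeholder:

matches = recommend('Inception')  # hypothetical title
if matches:
    for title in matches:
        print(title)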
Example #2
def __init__(self, doc_dict):
    # Build a TF-IDF document-term matrix over the documents' text;
    # `vectorizer` is used here like sklearn's TfidfVectorizer.
    self._tfidf_vectorizer = vectorizer(tokenizer=self.tokens,
                                        stop_words='english')
    self._doc_term_matr = self._tfidf_vectorizer.fit_transform(
        doc_dict.values())
    # Vocabulary size equals the number of matrix columns.
    self.vect_length = self._doc_term_matr.shape[1]
    # Keep document ids in the same order as the matrix rows
    # (dicts preserve insertion order in Python 3.7+).
    self._id_list = list(doc_dict)
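
A minimal driving sketch; the enclosing class name (`DocIndex`) and its `tokens` tokenizer method are not shown in the snippet, so both are assumptions:

docs = {'a': 'the quick brown fox', 'b': 'jumped over the lazy dog'}
idx = DocIndex(docs)                  # hypothetical class name
print(idx.vect_length, idx._id_list)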
Example #3
def process():
    # Fit once and cache the results in module-level globals so that
    # later lookups can reuse the similarity matrix without refitting.
    global movie_df, sig, index
    movie_df = pre_process()
    tfv = vectorizer(min_df=3,
                     max_features=None,
                     strip_accents='unicode',
                     analyzer='word',
                     token_pattern=r'\w{1,}',
                     ngram_range=(1, 3),
                     stop_words='english')
    tfv_matrix = tfv.fit_transform(movie_df['overview'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    # Map titles to row positions for lookups by name.
    index = pd.Series(movie_df.index,
                      index=movie_df['title']).drop_duplicates()
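
A minimal sketch of the intended call order; the title used for the lookup is a placeholder:

process()                # fit once, cache the globals
print(sig.shape)         # square matrix: one row/column per movie
print(index['Avatar'])   # hypothetical title -> row position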
Example #4
def recommend(search_word):

    movie_df = pre_process()
    tfv = vectorizer(min_df=3,
                     max_features=None,
                     strip_accents='unicode',
                     analyzer='word',
                     token_pattern=r'\w{1,}',
                     ngram_range=(1, 3),
                     stop_words='english')
    tfv_matrix = tfv.fit_transform(movie_df['overview'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    index = pd.Series(movie_df.index,
                      index=movie_df['title']).drop_duplicates()

    try:
        # Fuzzy-match the query against every title and keep the
        # closest one (`sm` is used like difflib.SequenceMatcher).
        title = search_word.lower()
        max_se = 0.0
        name = ''
        for i in list(movie_df['title']):
            ratio = sm(None, title, i).ratio()
            if ratio > max_se:
                name = i
                max_se = ratio
        idx = index[name]
        if isinstance(idx, pd.Series):
            # Duplicate titles map to several rows; pick one at random.
            idx = idx.iloc[random.randint(0, len(idx) - 1)]
        # Rank by sigmoid-kernel similarity, skip the movie itself,
        # and return the next 14 titles.
        sig_scores = sorted(enumerate(sig[idx]),
                            key=lambda x: x[1], reverse=True)
        sig_scores = sig_scores[1:15]
        movie_indices = [i[0] for i in sig_scores]
        return list(movie_df['title'].iloc[movie_indices])
    except Exception:
        return None
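
A quick check of the fuzzy matcher this example leans on, assuming `sm` aliases `difflib.SequenceMatcher`, which is what the call signature suggests:

from difflib import SequenceMatcher as sm

# ratio() returns a similarity in [0, 1]; 1.0 means an exact match.
print(sm(None, 'inception', 'Inception').ratio())     # close, but case differs
print(sm(None, 'inception', 'Interstellar').ratio())  # much lower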
Example #5
print("Getting Contents From Test Files")
path = folder + test_folder + pos_folder
onlyfiles = listdir(path)
pro_test = [open(path + f, 'r').read() for f in listdir(path)]
path = folder + test_folder + neg_folder
neg_test = [open(path + f, 'r').read() for f in listdir(path)]


def bigram(text_file):
    text = open(text_file, 'r').read()
    tokens = nltk.word_tokenize(text)
    return list(nltk.bigrams(tokens))


vect = vectorizer()

#vect.set_params(tokenizer=tokenizer.tokenize)

# remove English stop words
vect.set_params(stop_words='english')
# include 1-grams and 2-grams
print("Making vocabulary")
vect.set_params(ngram_range=(2, 2))
X = vect.fit_transform(pro_train + neg_train)
print("Vocabulary Made")
X_Train = (X.toarray())
y = [1] * len(pro_train) + [0] * len(neg_train)
print("Making Counts for Test Data")
X_test = vect.transform(pro_test + neg_test)
bayes = MultinomialNB()
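
The snippet stops right after constructing the classifier; a minimal sketch of the likely next steps, assuming the test labels follow the same convention as the training labels:

y_test = [1] * len(pro_test) + [0] * len(neg_test)  # assumed labeling
bayes.fit(X_Train, y)
predictions = bayes.predict(X_test)
accuracy = sum(p == t for p, t in zip(predictions, y_test)) / len(y_test)
print("Accuracy:", accuracy)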
Example #6
def cluster(self):
    # TF-IDF-weight the cleaned documents, then cluster them;
    # `algorithm` is used here like sklearn's KMeans.
    self.vector = vectorizer(use_idf=True)
    self.matrix = self.vector.fit_transform(self.clean)
    self.model = algorithm(n_clusters=self.clusters, n_init=1000)
    self.model.fit(self.matrix)
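
The same idea outside the class, as a minimal sketch assuming `vectorizer` is sklearn's TfidfVectorizer and `algorithm` is sklearn's KMeans:

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['cats purr softly', 'dogs bark loudly', 'kittens purr too']  # toy corpus
matrix = TfidfVectorizer(use_idf=True).fit_transform(docs)
model = KMeans(n_clusters=2, n_init=10).fit(matrix)
print(model.labels_)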
Example #7
# The head of this snippet is truncated in the source; judging by the
# chars pipeline below, it is a word-level counterpart (the name and
# the analyzer setting here are assumptions):
words = Pipeline([('features',
                   FeatureUnion([('word',
                                  tfidf(analyzer='word',
                                        lowercase=True))])),
                  ('clf', LinearSVC())])

chars = Pipeline([('features',
                   FeatureUnion([('char',
                                  tfidf(analyzer='char',
                                        ngram_range=(3, 6),
                                        binary=False,
                                        max_df=1.0,
                                        min_df=2,
                                        norm='l2',
                                        sublinear_tf=True,
                                        use_idf=True,
                                        lowercase=True))])),
                  ('clf', LinearSVC())])

simple = Pipeline([('features',
                    vectorizer(lowercase=False,
                               token_pattern=r'\b\w+\b',
                               ngram_range=(1, 2))),
                   ('clf', LogisticRegression())])


def neural():
    # Placeholder model; not implemented.
    raise NotImplementedError


# Uniform-random baseline for comparison.
random = Pipeline([('features', tfidf()),
                   ('clf', DummyClassifier(strategy='uniform',
                                           random_state=42))])
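
A minimal sketch of driving one of these pipelines, with toy texts and labels standing in for the real data:

texts = ['good movie', 'bad movie', 'great film', 'awful film']  # hypothetical
labels = [1, 0, 1, 0]
simple.fit(texts, labels)
print(simple.predict(['fine film']))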
Example #8
# As in Example #7, the head of this snippet is truncated in the source;
# the visible tail matches a word-level counterpart of the chars pipeline
# below (the name and the analyzer setting here are assumptions):
words = Pipeline([('features',
                   FeatureUnion([('word',
                                  tfidf(analyzer='word',
                                        norm='l2',
                                        sublinear_tf=True,
                                        use_idf=True,
                                        lowercase=True))])),
                  ('clf', LinearSVC())])

chars = Pipeline([('features', FeatureUnion([('char',
                                              tfidf(analyzer='char',
                                                    ngram_range=(3, 6),
                                                    binary=False,
                                                    max_df=1.0,
                                                    min_df=2,
                                                    norm='l2',
                                                    sublinear_tf=True,
                                                    use_idf=True,
                                                    lowercase=True))])),
                  ('clf', LinearSVC())])

simple = Pipeline([('features', vectorizer(lowercase=False,
                                           token_pattern=r'\b\w+\b',
                                           ngram_range=(1, 2))),
                   ('clf', LogisticRegression())])

# Uniform-random baseline for comparison.
random = Pipeline([('features', tfidf()),
                   ('clf', DummyClassifier(strategy='uniform',
                                           random_state=42))])