def check_review(reviewText):  # Function for Checking the User Entered Reviews
    """Classify a single user-entered review with the pre-trained model.

    The text is count-vectorized against the persisted vocabulary
    (``vocab``) so feature columns line up with training, re-weighted
    with a TF-IDF transformer, then passed to ``pickle_model``.
    """
    # Rebuild the vectorizer on the saved vocabulary; decode errors are
    # replaced rather than raised so arbitrary user input is tolerated.
    vectorizer = tfidfV(decode_error="replace", vocabulary=vocab)
    counts = vectorizer.fit_transform([reviewText])

    # TF-IDF re-weighting on top of the raw counts, then predict.
    weighted = tfidfT().fit_transform(counts)
    return pickle_model.predict(weighted)
Esempio n. 2
0
# 2nd: punctuation-percentage feature, applied to every description
def punct_percentage(description):
    """Return the percentage of punctuation characters in *description*.

    The denominator counts non-space characters only.  Returns 0.0 for an
    empty or all-space string — the original divided by zero in that case.
    """
    count = sum(1 for symbol in description if symbol in string.punctuation)
    total = len(description) - description.count(" ")  # non-space chars
    if total == 0:
        # Empty / whitespace-only input: no meaningful ratio, avoid
        # ZeroDivisionError and report zero punctuation.
        return 0.0
    return 100 * round(count / total, 3)


# Add the punctuation-percentage feature column for every description.
train['punctuation-percentage'] = train['Description'].apply(punct_percentage)

# Quick sanity check of the enriched frame.
print(train.head(3))
# ------------------------------------------------------------------------------------
# Exploring parameters using GridSearchCV
# TF-IDF text features, combined with the hand-crafted numeric columns.
tfidf_vectorizer = tfidfV(analyzer=cleandata)
X_tfidf = tfidf_vectorizer.fit_transform(train['Description'])
tfidf_cols = pd.DataFrame(X_tfidf.toarray())
X_tfidf_clf = pd.concat(
    [train['length'], train['punctuation-percentage'], tfidf_cols],
    axis=1,
)

# CountVectorizer (bigrams only): same numeric columns, count-based text vectors.
cv_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer=cleandata)
X_cv = cv_vectorizer.fit_transform(train['Description'])
cv_cols = pd.DataFrame(X_cv.toarray())
X_cv_clf = pd.concat(
    [train['length'], train['punctuation-percentage'], cv_cols],
    axis=1,
)
Esempio n. 3
0
        ]).split()
        if aList == ['deleted']:
            continue
        aList = [c for c in aList if 'http' not in c]
        aStr += ' '.join(aList)
        aStr += " "
    pd = postDate[cmt]
    cmtsDate[pd] += aStr

# Build the TF-IDF matrix over the per-date comment strings, in date order.
# FIX: the original used the Python-2-only idiom `zip(...)` + `.sort()` and
# `zip(*cmts)[1]` — on Python 3 `zip` returns an iterator, which has no
# `.sort()` and cannot be subscripted.  `sorted(dict.items())` is the
# portable equivalent and yields the same (date, text) pairs sorted by date.
cmts = sorted(cmtsDate.items())
cmtsOnly = tuple(text for _date, text in cmts)  # texts in date order

tfV = tfidfV(ngram_range=NGRAM_RNG,
             max_features=MAX_FEAT,
             stop_words='english',
             strip_accents='unicode')
tfidfData = tfV.fit_transform(cmtsOnly).toarray()


# Per-date feature vector: bias term, upvote count, opening price, then
# that date's TF-IDF weights ('Close' price intentionally not included).
def feature(date):
    """Assemble the feature vector for *date*."""
    tfidf_row = tfidfData[Xdates.index(date)].tolist()
    return [1, upvotes[date], float(prices[date]['Open'])] + tfidf_row

Esempio n. 4
0
    # Encode the categorical targets as integer ids for both splits.
    # NOTE(review): dictLabToId is defined outside this view — presumably a
    # label -> id mapping; confirm it covers every category in both frames.
    lTrainY = dfTrain["cat"].apply(lambda x: dictLabToId[x])
    lTestY = dfTest["cat"].apply(lambda x: dictLabToId[x])

    # Drop intermediates that are no longer needed before vectorizing.
    del lTrainH, lTrainS, lTestH, lTestS, lTest, lTrain

    # Count features: one column per word token; the token pattern keeps any
    # run of word characters, including single letters and digits.
    countVect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    countVect.fit(dfTrain["email"])

    cvTrainX = countVect.transform(dfTrain["email"])
    cvTestX = countVect.transform(dfTest["email"])

    # Word-level TF-IDF, capped at the 5000 highest-weight terms.
    # NOTE(review): fitted on dfData but applied to dfTrain/dfTest —
    # presumably dfData is the union of both splits; verify this does not
    # leak test vocabulary into training features.
    tfidfVect = tfidfV(analyzer='word',
                       token_pattern=r'\w{1,}',
                       max_features=5000)
    tfidfVect.fit(dfData["email"])
    XtrV = tfidfVect.transform(dfTrain["email"])
    XteV = tfidfVect.transform(dfTest["email"])

    # Bigram/trigram TF-IDF, also capped at 5000 features (same fit caveat).
    tfidfVectNgram = tfidfV(analyzer='word',
                            token_pattern=r'\w{1,}',
                            ngram_range=(2, 3),
                            max_features=5000)
    tfidfVectNgram.fit(dfData["email"])
    XtrVN = tfidfVectNgram.transform(dfTrain["email"])
    XteVN = tfidfVectNgram.transform(dfTest["email"])

    # Character-level tf-idf (continues past this view)