def get_name_features(names):

    name = []
    for i in names:
        s = re.findall('(?i)[a-z]{2,}', i)
        name.append(' '.join(s))

    cv = CV(analyzer='char_wb', ngram_range=(3, 4))
    fn = cv.fit_transform(name).toarray()

    return fn
    def __init__(self,
                 vectorizer=None,
                 clf_model=None,
                 use_tokens=False,
                 forecast_attribute_name: str = "prediction",
                 forecast_prob_attribute_name: str = "score"):

        super().__init__(
            forecast_attribute_name=forecast_attribute_name,
            forecast_prob_attribute_name=forecast_prob_attribute_name)
        if vectorizer is None:
            print("Initializing default unigram CountVectorizer...")
            if use_tokens:
                self.vectorizer = CV(decode_error='ignore',
                                     min_df=10,
                                     max_df=.5,
                                     ngram_range=(1, 1),
                                     binary=False,
                                     max_features=15000,
                                     tokenizer=lambda x: x,
                                     preprocessor=lambda x: x)
            else:
                self.vectorizer = CV(decode_error='ignore',
                                     min_df=10,
                                     max_df=.5,
                                     ngram_range=(1, 1),
                                     binary=False,
                                     max_features=15000)
        else:
            self.vectorizer = vectorizer

        if clf_model is None:
            print(
                "Initializing default classification model (standard scaled logistic regression)"
            )
            self.clf_model = Pipeline([
                ("standardScaler", StandardScaler(with_mean=False)),
                ("logreg", LogisticRegression(solver='liblinear'))
            ])
        else:
            self.clf_model = clf_model
Example #3
def string_vectorize(Xs_list):
    vc = CV(analyzer='char_wb',
            ngram_range=(3, 4),
            min_df=1,
            token_pattern='[a-z]{2,}')
    name = []
    for i in Xs_list:
        s = re.findall('(?i)[a-z]{2,}', "".join(str(x) for x in i))
        name.append(' '.join(s))
    vc.fit(name)
    vec = vc.transform(name).toarray()
    dictionary = vc.get_feature_names()
    return vec, dictionary
    def __init__(self,
                 obj_type="utterance",
                 text_func=None,
                 cv=None,
                 ngram_range=None,
                 prior=0.1,
                 class1_attribute_name='fighting_words_class1',
                 class2_attribute_name='fighting_words_class2'):
        assert obj_type in ["speaker", "utterance", "conversation"]
        self.obj_type = obj_type

        if text_func is None:
            if obj_type == 'utterance':
                self.text_func = lambda utt: FightingWords.clean_text(utt.text)
            elif obj_type == 'conversation':
                self.text_func = lambda convo: \
                    FightingWords.clean_text(' '.join([utt.text for utt in convo.iter_utterances()]))
            else:
                self.text_func = lambda spkr: \
                    FightingWords.clean_text(' '.join([utt.text for utt in spkr.iter_utterances()]))
        else:
            self.text_func = text_func

        self.ngram_range = ngram_range
        self.prior = prior
        self.cv = cv
        self.ngram_zscores = None
        self._count_matrix = None
        if self.cv is None and type(self.prior) is not float:
            raise ValueError(
                "If using a non-uniform prior, you must pass a count vectorizer with "
                "the vocabulary parameter set.")
        if self.cv is None:
            print("Initializing default CountVectorizer", end=" ")
            if self.ngram_range is None:
                self.ngram_range = (1, 3)
            print("with ngram_range {}...".format(self.ngram_range), end=" ")
            self.cv = CV(decode_error='ignore',
                         min_df=10,
                         max_df=.5,
                         ngram_range=self.ngram_range,
                         binary=False,
                         max_features=15000)
            print("Done.")

        self.class1_attribute_name = class1_attribute_name
        self.class2_attribute_name = class2_attribute_name
Example #5
    def __init__(self):
        self.db = self.connectDB()
        self.colList = [
            "api_busan",
            "api_herald",
            "api_nocut",
            "api_ohmynews",
            "api_wikitree",
            "api_donga",
            "api_hangook",
            "api_joseon",
            "api_yeonhap",
            "api_joongang",
        ]
        self.Threshold = 1.7
        self.data = []
        self.newData = []
        self.okt = Okt()
        self.vectorizer = CV(min_df=1)
        self.contents_all = []
Example #6
def load_data():

    # load the raw patient data file
    with open("patientdata.txt") as allData:
        data = allData.readlines()

    cv = CV()

    data = cv.fit_transform(data)
    fitted = data.toarray()
    fitted = np.column_stack((fitted, labels))  # `labels` is expected to be defined elsewhere in the module

    training, validation, test = split(fitted)
    trainingLabel = training[:, -1]
    training = training[:, :-1]
    validationLabel = validation[:, -1]
    validation = validation[:, :-1]
    testLabel = test[:, -1]
    test = test[:, :-1]
    return training, validation, test, trainingLabel, validationLabel, testLabel, cv
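# Note: `labels` and `split` used above are not defined in this snippet. `labels` would be a
# 1-D array of per-patient class labels aligned with the rows of patientdata.txt, and a
# hypothetical `split` helper (names and the 60/20/20 ratios below are assumptions, not part
# of the original) might look like this:
def split(rows, seed=0):
    rng = np.random.default_rng(seed)
    shuffled = rows[rng.permutation(len(rows))]
    n_train, n_val = int(0.6 * len(rows)), int(0.8 * len(rows))
    return shuffled[:n_train], shuffled[n_train:n_val], shuffled[n_val:]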
    def __init__(self,
                 obj_type: str,
                 vectorizer=None,
                 vector_name="bow_vector",
                 text_func: Callable[[CorpusObject],
                                     str] = lambda utt: utt.text):

        if vectorizer is None:
            print("Initializing default unigram CountVectorizer...")
            self.vectorizer = CV(decode_error='ignore',
                                 min_df=10,
                                 max_df=.5,
                                 ngram_range=(1, 1),
                                 binary=False,
                                 max_features=15000)
        else:
            self.vectorizer = vectorizer

        self.obj_type = obj_type
        self.vector_name = vector_name
        self.text_func = text_func
    def __init__(self, class1_selector: Callable[[Utterance], bool],
                 class2_selector: Callable[[Utterance], bool], cv=None,
                 ngram_range=None, prior=0.1, threshold=1, top_k=10, annot_method="top_k",
                 string_sanitizer=lambda str_: FightingWords.clean_text(str_)):
        """

        :param class1_selector: selector function for identifying utterances that belong to class 1
        :param class2_selector: selector function for identifying utterances that belong to class 2
        :param cv: optional CountVectorizer. default: an sklearn CV with min_df=10, max_df=.5, and ngram_range=(1,3) with max 15000 features
        :param ngram_range: range of ngrams to use if using default cv
        :param prior: either a float describing a uniform prior, or a vector describing a prior
        over vocabulary items. If you're using a predefined vocabulary, make sure to specify that
        when you make your CountVectorizer object.
        :param threshold: the z-score threshold for annotating utterances with identified ngrams
        :param top_k: the top_k threshold for which ngrams to annotate utterances with
        :param annot_method: "top_k" or "threshold" to specify which annotation method to use in transform()
        :param string_sanitizer: optional function for cleaning strings prior to fighting words analysis: uses default
        string sanitizer otherwise
        """
        self.class1_selector = class1_selector
        self.class2_selector = class2_selector
        self.ngram_range = ngram_range
        self.prior = prior
        self.cv = cv
        self.threshold = threshold
        self.top_k = top_k
        assert annot_method in ["top_k", "threshold"]
        self.annot_method = annot_method
        self.ngram_zscores = None
        self.string_sanitizer = string_sanitizer
        self._count_matrix = None
        if self.cv is None and type(self.prior) is not float:
            raise ValueError("If using a non-uniform prior, you must pass a count vectorizer with "
                             "the vocabulary parameter set.")
        if self.cv is None:
            print("Initializing default CountVectorizer...")
            if self.ngram_range is None:
                self.ngram_range = (1, 3)
            self.cv = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=self.ngram_range,
                         binary=False, max_features=15000)
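# A hedged usage sketch (not part of the original source): instantiate the transformer with
# two selector functions. The metadata key 'group' and the utt.meta access below are
# assumptions for illustration only.
fw = FightingWords(
    class1_selector=lambda utt: utt.meta.get('group') == 'class1',
    class2_selector=lambda utt: utt.meta.get('group') == 'class2',
    ngram_range=(1, 2),
    top_k=20,
    annot_method='top_k',
)
# fw would then be fit on a corpus and applied via transform(), per the docstring above.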
def score_word_similiarity(i):
    seg_model = CV(binary=True,
                   min_df=1,
                   ngram_range=(1, n_gram_size_2),
                   max_features=2000,
                   lowercase=lowercase,
                   tokenizer=tokenizer,
                   token_pattern=token_pattern)
    vec_seg = seg_model.fit_transform(segments_sen[i]).toarray()
    similiarity_index = 0
    seg_size = len(segments_sen[i])
    print "here", seg_size - neighbour_limit, is_pure_seg[i],
    for j in range(seg_size - neighbour_limit):
        temp_similiarity_index = 0
        for k in range(j, j + neighbour_limit):
            similiarity_index += sum(x[0] * x[1]
                                     for x in zip(vec_seg[j], vec_seg[k]))
            temp_similiarity_index += sum(x[0] * x[1]
                                          for x in zip(vec_seg[j], vec_seg[k]))
        print(temp_similiarity_index, end=" ")
    print("end")
    if similiarity_index > threshold_2:
        pure_segments.append(segments[i])
        pure_data.extend(segments_sen[i])
    else:
        mixed_segments.append(segments[i])
        mixed_data.extend(segments_sen[i])
    if is_pure_seg[i]:
        score_true.append(similiarity_index)
        if similiarity_index > threshold_2:
            pure.append(1)
        else:
            mixed.append(0)
    else:
        score_false.append(similiarity_index)
        if similiarity_index > threshold_2:
            pure.append(0)
        else:
            mixed.append(1)
Example #10
    def genvec_from_features(self, features):
        # Vectorize the feature lists
        for em in features:
            self.vectors.append(" ".join(em))
        #cv = CV(min_df=0.1, max_df=0.9)
        cv = CV()
        #tfidf = TFIDF()
        tf = cv.fit_transform(self.vectors)
        #matrix = tfidf.fit_transform(tf).toarray()
        matrix = tf.toarray()
        # Normalize each vector
        x = np.array(list(map(np.linalg.norm, matrix)))
        # If x == 0 a divide-by-zero error would occur; a zero norm means the row has no features, so drop that row.
        nonzero = (x != 0)
        n_matrix = (matrix[nonzero].T / x[nonzero]).T
        self.dirlist = list(np.array(self.dirlist)[nonzero])
        # Run principal component analysis
        pca = PCA(n_components=self.pca_ncomponents)
        pca.fit(n_matrix)
        # Project the dataset onto the fitted principal components
        transformed = pca.transform(n_matrix)

        return self.dirlist, transformed
Example #11
    def __init__(self,
                 obj_type: str,
                 vector_name="bow_vector",
                 text_func: Callable[[CorpusComponent], str] = None,
                 vectorizer=None):

        if vectorizer is None:
            print("Initializing default unigram CountVectorizer...", end="")
            self.vectorizer = CV(decode_error='ignore',
                                 min_df=10,
                                 max_df=.5,
                                 ngram_range=(1, 1),
                                 binary=False,
                                 max_features=15000)
            print("Done.")
        else:
            self.vectorizer = vectorizer

        self.obj_type = obj_type
        self.vector_name = vector_name

        if text_func is None:
            if obj_type == "utterance":
                self.text_func = lambda utt: utt.text
            elif obj_type == "conversation":
                self.text_func = lambda convo: " ".join(
                    utt.text for utt in convo.iter_utterances())
            elif obj_type == "speaker":
                self.text_func = lambda speaker: " ".join(
                    utt.text for utt in speaker.iter_utterances())
            else:
                raise ValueError(
                    "Invalid corpus object type. Use 'utterance', 'conversation', or 'speaker'"
                )
        else:
            self.text_func = text_func
Example #12
    for sentence in sentences:
        # lower-case and keep only alphabetic tokens (drops numbers and special characters)
        doc = re.findall(r'[a-zA-Z]+', sentence.lower())
        doc = [w for w in doc if w not in stop]  # remove stopwords
        stemmer = SnowballStemmer('english')
        clean = " ".join(stemmer.stem(w) for w in doc)  # stem each word individually
        clean_text.append(clean)
    return clean_text


clean_text = cleaner(text)

from sklearn.feature_extraction.text import CountVectorizer as CV
cv = CV(ngram_range=(1, 2), encoding='latin', max_features=20000)  # ngram_range lower bound must be >= 1
X = cv.fit_transform(clean_text)
features = cv.get_feature_names()
dtm = pd.DataFrame(X.toarray(), columns=features)
print(dtm.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dtm,
                                                    tags,
                                                    test_size=0.1,
                                                    random_state=40)

from sklearn.ensemble import RandomForestClassifier
print "Running Model"
clfrf = RandomForestClassifier(200, n_jobs=-1, bootstrap=True)
clfrf.fit(X_train, y_train)
Example #13
# GROUPING AND CORRELATION
print(df.groupby('stars').mean().head(5))
print(df.groupby('stars').mean().head(5).corr())

# sns.heatmap(df.groupby('stars').mean().head(5).corr(), annot=True)
# plt.show()

# CREATE YELP DF ONLY FOR 1 AND 5 STAR REVIEWS
yelp_class = df[(df['stars'] == 1) | (df['stars'] == 5)]

X = yelp_class['text']
y = yelp_class['stars']

# CREATE COUNT VECTORIZER AND FIT TO X
cv = CV().fit(X)

# OVERWRITE X WITH TRANSFORM
X = cv.transform(X)

# TRAIN TEST SPLIT
X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.3, random_state=64)

# CREATE NAIVE BAYES OBJECT AND FIT
nb = MNB().fit(X_train, y_train)
pred = nb.predict(X_test)

print(CR(y_test, pred))

# PIPELINE
pipe = Pipeline([('CV', CV()), ('TFIDF', TT()), ("BAYES", MNB())])
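# Hedged usage sketch (not in the original): because the pipeline starts with a
# CountVectorizer, it can be fit directly on the raw review text instead of the
# pre-vectorized X above. Variable names reuse yelp_class and y from this example.
text_train, text_test, y_train2, y_test2 = TTS(yelp_class['text'], y,
                                               test_size=0.3, random_state=64)
pipe.fit(text_train, y_train2)
print(CR(y_test2, pipe.predict(text_test)))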
Example #14
# demonstrates ways to count unique words in Python
from collections import Counter

import nltk
from sklearn.feature_extraction.text import CountVectorizer as CV

text = 'ah list of ah words'
t = text.split()
c = Counter(t)
# get unique words
c.keys()
c.most_common(10)

fd = nltk.FreqDist(t)
# unique words
fd.keys()
# you can plot the distribution easily
fd.plot()
# get words that occur at least a certain number of times
more_than_once = [(f, c) for f, c in fd.items() if c > 1]
fd.most_common(10)

# this method is more useful for multiple documents
vec = CV()
res = vec.fit_transform([text])
# vocabulary_ has the same unique words as Counter/FreqDist, mapped to column indices (the counts live in res)
vec.vocabulary_
# get unique words
vec.vocabulary_.keys()
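# Sketch of the multi-document case mentioned above (illustrative strings, not from the
# source): each row of the resulting matrix is one document, each column one vocabulary
# term, and the entries are per-document counts.
docs = ['ah list of ah words', 'another list of words', 'more words here']
vec_multi = CV()
counts = vec_multi.fit_transform(docs)
print(vec_multi.vocabulary_)   # term -> column index
print(counts.toarray())        # shape: (n_documents, n_terms)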
Example #15
def count_init(s140_train):
    # Fit the corpus to the CountVectorizer (bag of words)
    eng_words = np.genfromtxt("data/corpus.txt", dtype="str")
    CVec = CV()
    CVec.fit(eng_words)
    return CVec
Example #16
        review["overall"]
        reviews.append(Review(review["reviewText"], review["overall"]))

# Split data

from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size=.33, random_state=42)
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]
test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

# Bags of words vectorization

from sklearn.feature_extraction.text import CountVectorizer as CV
vectorizer = CV()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)  #just transform for test data

# Classification
# there are many different classification methods

from sklearn import svm
clf_svm = svm.SVC(kernel="linear")  # linear svm
clf_svm.fit(train_x_vectors, train_y)

from sklearn.tree import DecisionTreeClassifier  # decision tree
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

from sklearn.naive_bayes import GaussianNB  #Naive Bayes
Example #17
        xTest = []
        yTest = []

        for J in range(5):
            if J != I:
                xTrain.extend(XkFold[J])
                yTrain.extend(YkFold[J])
            else:
                testIndex = J

        xTrain = transform(xTrain)
        xTest = transform(XkFold[testIndex])

        assert len(xTrain) == len(yTrain)
        xTrainNew, yTrainNew = balancedTrain(xTrain, yTrain, 'CV')
        counterList.append(CV(ngram_range=(2, 2), min_df=5))
        trainVector = counterList[-1].fit_transform(xTrainNew)
        testVector = counterList[-1].transform(xTest)

        selectList.append(SelectKBest(chi2, k=min(10000,
                                                  trainVector.shape[1])))

        trainVector = selectList[-1].fit_transform(trainVector, yTrainNew)
        testVector = selectList[-1].transform(testVector)

        mreTotal.append(0)
        for J in clfOption:
            J.fit(trainVector, yTrainNew)
            prediction = J.predict(testVector)
            mreTotal[-1] += mrc(prediction, YkFold[testIndex])
Example #18
def bayes_compare_language(l1, l2, ngram=1, prior=.01, cv=None):
    '''
    Arguments:
    - l1, l2; a list of strings from each language sample
    - ngram; an int describing up to what n gram you want to consider (1 is unigrams,
    2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.
    - prior; either a float describing a uniform prior, or a vector describing a prior
    over vocabulary items. If you're using a predefined vocabulary, make sure to specify that
    when you make your CountVectorizer object.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.

    Returns:
    - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple.'''
    if cv is None and type(prior) is not float:
        print("If using a non-uniform prior:")
        print(
            "Please also pass a count vectorizer with the vocabulary parameter set."
        )
        quit()
    l1 = [basic_sanitize(l) for l in l1]
    l2 = [basic_sanitize(l) for l in l2]
    if cv is None:
        cv = CV(decode_error='ignore',
                min_df=10,
                max_df=.5,
                ngram_range=(1, ngram),
                binary=False,
                max_features=15000)
    counts_mat = cv.fit_transform(l1 + l2).toarray()
    # Now sum over languages...
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))
    if type(prior) is float:
        priors = np.array([prior for i in range(vocab_size)])
    else:
        priors = prior
    z_scores = np.empty(priors.shape[0])
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis=0)
    count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis=0)
    a0 = np.sum(priors)
    n1 = 1. * np.sum(count_matrix[0, :])
    n2 = 1. * np.sum(count_matrix[1, :])
    print("Comparing language...")
    for i in range(vocab_size):
        #compute delta
        term1 = np.log((count_matrix[0, i] + priors[i]) /
                       (n1 + a0 - count_matrix[0, i] - priors[i]))
        term2 = np.log((count_matrix[1, i] + priors[i]) /
                       (n2 + a0 - count_matrix[1, i] - priors[i]))
        delta = term1 - term2
        #compute variance on delta
        var = 1. / (count_matrix[0, i] +
                    priors[i]) + 1. / (count_matrix[1, i] + priors[i])
        #store final score
        z_scores[i] = delta / np.sqrt(var)
    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    sorted_indices = np.argsort(z_scores)
    return_list = []
    for i in sorted_indices:
        return_list.append((index_to_term[i], z_scores[i]))
    return return_list
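# Minimal usage sketch (illustrative sentences, not from the source). The default
# CountVectorizer uses min_df=10, so a tiny corpus needs a custom cv; basic_sanitize is
# assumed to be defined elsewhere in the same module.
sample1 = ["i love this movie", "i really love it", "what a great film"]
sample2 = ["i hate this movie", "a terrible film", "i really hate it"]
results = bayes_compare_language(sample1, sample2, ngram=1, cv=CV(min_df=1))
# results is sorted by z-score: the first entries lean toward sample2, the last toward sample1
print(results[:3])
print(results[-3:])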
'''finding features and vectorising segments'''
'''#########################################'''
'''
model = CountVectorizer fitted on feature words with frequency >= 3 (~11000 words)
vec_seg (sparse matrix) = [ [0,0,1,1,0,1,1,1,1,0,0,0,0,1,1,... number of feature words = 11000],
                            [0,0,1,0,0,1,1,0,1,0,0,1,1,0,0,... 1/0 = whether the word is present],
                            ...
                            one row per segment
                          ]
number_f_w = number of feature words extracted from merged data
'''

model = CV(binary=True,
           min_df=3,
           ngram_range=(1, n_gram_size),
           max_features=20000,
           lowercase=lowercase,
           tokenizer=tokenizer,
           token_pattern=token_pattern)
model = model.fit(merged_data)
vec_seg = model.transform(segments)
number_f_w = len(model.vocabulary_)
vec_seg = vec_seg.toarray()
max_features = min(max_features, number_f_w)
print "number of feature words:", number_f_w
print "STEP 2 done"
'''######'''
'''Step 2'''
'''######'''
'''############################################'''
'''#################Step 3#####################'''
ps = PorterStemmer()
clnd_msgs = []
#wnl = WordNetLemmatizer()
for i in range(len(messages)):
    temp_msg = re.sub('[^a-zA-Z]',' ',messages['message'][i])
    temp_msg = temp_msg.lower()
    temp_msg = temp_msg.split()
    temp_msg = [ps.stem(word) for word in temp_msg if word not in set(stopwords.words('english'))]
#    temp_msg = [wnl.lemmatize(word) for word in temp_msg if word not in set(stopwords.words('english'))]
    temp_msg = ' '.join(temp_msg)
    clnd_msgs.append(temp_msg)

#Creating BagOfWords model
from sklearn.feature_extraction.text import CountVectorizer as CV
cv = CV(max_features=5000)  # keep only the 5000 most frequent features (columns/words)
X = cv.fit_transform(clnd_msgs).toarray()

'''
creating TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer as TV
tv = TV(max_features=5000)  # keep only the 5000 most frequent features (columns/words)
X = tv.fit_transform(clnd_msgs).toarray()
'''

#Output data
Y = pd.get_dummies(messages['label'])
Y = Y.iloc[:,1].values

#Train Test split
from sklearn.model_selection import train_test_split
Example #21
            continue
        else:
            if len(word) > 1:
                features.append(ps.stem(word))
    return features, sentiment


with open(opath + "sentiment.txt", encoding='cp1252') as f:
    train_x = []
    train_y = []
    for line in f:
        x, y = getFeatures(line)
        train_x.append(' '.join(x))
        train_y.append(1.0 if y == '+1' else 0.0)

    cv = CV()
    train_x_cv = cv.fit_transform(train_x)
    model = LogisticRegression()
    model.fit(train_x_cv, train_y)

with open(opath + "sentiment.txt", encoding='cp1252') as f:
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for text in f:
        x = cv.transform([text])
        y = model.predict(x)
        y_p = model.predict_proba(x)

        label = text[:2]
Example #22
te_data_y = tr_data.target
target_name = tr_data.target_names
from sklearn.svm import SVC
# def feature_work(data=None,vb=None,stop_words=None,max_df=1):
#     cv=CV(stop_words=stop_words,max_df=max_df,vocabulary=vb)
#     #print(cv.vocabulary)
#     tr_vb=cv.vocabulary_
#
#     tf=TF()
#     tf_idf=tf.fit_transform(cv.fit_transform(data))  # term counts and tf-idf values
#     print('0:',cv.fit_transform(data).shape)
#     print('1:', tf_idf.shape)
#     #word=cv.get_feature_names()  # the feature terms (keywords)
#     #weight=tf_idf.toarray()
#     return tr_vb,tf_idf
cv = CV(stop_words='english', max_df=0.8)
tf = TF()
tr_idf = tf.fit_transform(cv.fit_transform(tr_data_x))  # term counts and tf-idf values for the training data
print('0:', cv.transform(tr_data_x).shape)
# use transform (not fit_transform) on the test data so it shares the training vocabulary
te_idf = tf.transform(cv.transform(te_data_x))
print('1:', cv.transform(te_data_x).shape)
# training features: tf_tr is built from tr_data_x; the training targets tr_data_y are unchanged
#tr_vb,tf_tr=feature_work(tr_data_x,stop_words='english',max_df=0.5)
#test feature
#te_vb,tf_te=feature_work(te_data_x,vb=tr_vb)


def getaccuracy(model=None, x=None, y_test=None, tar_name=None):

    y_pre = model.predict(x)
    print(classification_report(y_test, y_pre, target_names=tar_name))
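# Hedged usage sketch (not in the original): train the imported SVC on the tf-idf features
# and report per-class metrics. tr_data_y is assumed to hold the training targets, as
# referenced in the comments above.
svc = SVC(kernel='linear')
svc.fit(tr_idf, tr_data_y)
getaccuracy(model=svc, x=te_idf, y_test=te_data_y, tar_name=target_name)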
Example #23
# CLEAN UP MESS
clean_mess = [word for word in nopunc.split() if word.lower() not in sw]
print(clean_mess)

# CREATE FUNCTION FOR CLEANING UP MESSAGES
def text_process(mess):
    nopunc = ''.join([c for c in mess if c not in string.punctuation])
    return [word.lower() for word in nopunc.split() if word.lower() not in sw]


# EXAMPLE CLEAN OF HEAD OF MESSAGES
print(df['msg'].head(5).apply(text_process))

# INSTANTIATE COUNT VECTORIZER AND FIT
bow_transformer = CV().fit(df['msg'])

# LENGTH OF VOCABULARY IN VECTORIZER
print(len(bow_transformer.vocabulary_))

# GET MESSAGE NUMBER 4 UNALTERED
mess4 = df['msg'].iloc[3]
print(mess4)

# GET BAG OF WORDS FROM THE FITTED TRANSFORMER (VECTORIZER)
bow4 = bow_transformer.transform([mess4])
print(bow4)

print(bow4.shape)

# SEE THE WORDS WHICH WERE IN THE MESSAGE TWICE (THESE RESULTS ARE DIFFERENT FROM THE LECTURE - I SUSPECT THAT SCIKIT-LEARN WAS SIMPLY UPDATED)
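# A possible way to inspect those repeated words (sketch, not from the original): map the
# nonzero columns of bow4 back to vocabulary terms and keep those with a count of 2.
feature_names = bow_transformer.get_feature_names_out()  # use get_feature_names() on older sklearn
_, cols = bow4.nonzero()
for col in cols:
    if bow4[0, col] == 2:
        print(feature_names[col], bow4[0, col])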
Example #24
そのためには、記事を読むこと、記事を書くことを通して、
読む側・書く側それぞれがお互いに関わり合って、
再利用性・汎用性の高い情報を育てていきましょう。
'''

txt = '''
Python is an interpreted high-level programming language for general-purpose programming.
Created by Guido van Rossum and first released in 1991, Python has a design philosophy that emphasizes code readability, and a syntax that allows programmers to express concepts in fewer lines of code, notably using significant whitespace.
It provides constructs that enable clear programming on both small and large scales.
Python features a dynamic type system and automatic memory management.
It supports multiple programming paradigms, including object-oriented, imperative, functional and procedural, and has a large and comprehensive standard library.
Python interpreters are available for many operating systems.
CPython, the reference implementation of Python, is open source software and has a community-based development model, as do nearly all of its variant implementations.
CPython is managed by the non-profit Python Software Foundation.
'''

source_list = []
for x in txt.split('\n'):
    if x != '':
        source_list.append(x)


# print(source_list)

# cv = CV()
cv = CV(stop_words="english", ngram_range=(1, 2))
matrix = cv.fit_transform(source_list)
print(matrix)

print(cv.get_feature_names())
from nltk.stem.porter import PorterStemmer as PS
corpus = []

for i in range(0, 1000):
    review = re.sub('[^A-Za-z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PS()
    review = [ps.stem(word) for word in review if word
              not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer as CV
cv = CV(max_features=1500)  # Keeping only the 1500 most used words
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=0)

# Using Naives Bayes model
# Fitting the Classifier to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
Example #26

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

import re
# s = 'RT @bugwannostra: @Louuu_ thx		#FFFFs People power -_-      works	❤signing…		https://t.co/pl2bquE5Az'
s = 'RT @bugwannostra: @Louuu_432 thx 6.3 #FF-FFs, People power -_-  https:/ 2.34  234w.orks	❤signing… ht.tp:   https:'
re.findall(r'([a-zA-Z_-]+|\d+\.\d+|\d+)', s)

from sklearn.feature_extraction.text import CountVectorizer as CV

cv = CV(analyzer='word',
        token_pattern=r'([a-zA-Z_-]+|\d+\.\d+|\d+)',
        stop_words=stop_words,
        max_df=0.8,
        min_df=1e-5)
# cv.fit_transform(textarr)

# This line produces the same output as running cProfile.run("foo()") directly
# p.strip_dirs().sort_stats(-1).print_stats()
# strip_dirs(): strip irrelevant path information from every module name
# sort_stats(): sort the printed entries by the standard module/name/line string
# print_stats(): print all of the profiling information

# sort by the cumulative time spent inside each function
# print_stats(3): print only the first 3 functions; the argument can also be a float, meaning the top fraction of functions

# python3.5 -m cProfile -o res event_extractor.py
import pstats
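# Sketch based on the comments above (not from the original): load the profile file written
# by `python -m cProfile -o res event_extractor.py` and show the top entries.
p = pstats.Stats('res')
p.strip_dirs().sort_stats('cumulative').print_stats(3)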
Example #27
# combine train and test sets
train_set = train_neg_docs + train_pos_docs
test_set = test_neg_docs + test_pos_docs

# combine pos and neg sets
pos_rev = clean_train_pos + clean_test_pos
neg_rev = clean_train_neg + clean_test_neg

# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer as CV
import numpy as np
import pandas as pd 

# get frequency counts
word_frequency_vectorizer = CV(binary = False)

# apply it to positive reviews
pos_freq_dtm = word_frequency_vectorizer.fit_transform(pos_rev)

# convert dtm to df
pos_freq_df = pd.DataFrame(pos_freq_dtm.toarray(), columns = word_frequency_vectorizer.get_feature_names())
pos_freq_df.shape


# get the most frequent words in pos and neg reviews
from nltk import FreqDist
from nltk.corpus import stopwords

all_pos_words = []
filt_pos_words = []
Example #28
def bayes_compare_language(l1, l2, ngram_range=(1, 3), prior=.01, cv=None, counts_mat=None):
    """
    Parameters
    ----------
    l1, l2 : Iterable[str]
        list of strings from each language sample

    ngram_range : Tuple[int, int], default=(1,3)
        the (min_n, max_n) range of n-gram sizes to consider
        ((1, 1) is unigrams, (1, 2) is bigrams + unigrams, etc). Ignored
        if a custom CountVectorizer is passed.

    prior : Union[float, array[float]]
        a float describing a uniform prior, or a vector describing a
        prior over vocabulary items. If you're using a predefined
        vocabulary, make sure to specify that when you make your
        CountVectorizer object.

    cv : Optional[sklearn.feature_extraction.text.CountVectorizer], default=None
        Pass this if you have pre-defined vocabulary. If None, by
        default an sklearn CV with min_df=10, max_df=.5, and
        ngram_range=(1,3) with max 15000 features.

    counts_mat : Optional[np.ndarray[len(l1 + l2), k]], default=None
        Counts matrix with size equal to length of `l1 + l2` (must
        also be in that order) and with k features. Pass this if
        you already have a dataset vectorized. If given, then the
        vectorizer must also be passed to `cv`.

    Returns
    -------
    z_scores : pd.DataFrame
        A pandas DataFrame of shape (|Vocab|, 2) with (n-gram, z-score) pairs.

    array[array[float]]:
        A 2-row matrix of counts of terms in l1 and l2 respectively.
    """
    if cv is None and type(prior) is not float:
        raise ValueError("If using a non-uniform prior, please also pass a count "
                         "vectorizer with the vocabulary parameter set.")
    if counts_mat is not None:
        assert isinstance(cv, CV)

    # clean the text
    if counts_mat is None:
        logger.info('Basic cleaning of the text')
        l1 = [basic_sanitize(l) for l in l1]
        l2 = [basic_sanitize(l) for l in l2]

    # initialize count vectorizer
    if counts_mat is None:
        logger.info('Vectorizing documents with CountVectorizer')
        if cv is None:
            cv = CV(decode_error='ignore', min_df=10, max_df=.5,
                    ngram_range=ngram_range, binary=False,
                    max_features=15000)
        counts_mat = cv.fit_transform(l1 + l2).toarray()
    vocab_size = len(cv.vocabulary_)
    logger.info("Vocab size is {}".format(vocab_size))

    # Now sum over languages...
    if type(prior) is float:
        priors = np.array([prior for i in range(vocab_size)])
    else:
        priors = prior
    z_scores = np.empty(priors.shape[0])
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis=0)
    count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis=0)
    a0 = np.sum(priors)
    n1 = 1. * np.sum(count_matrix[0,:])
    n2 = 1. * np.sum(count_matrix[1,:])

    logger.info("Comparing language...")
    for i in range(vocab_size):
        # compute delta
        term1 = np.log((count_matrix[0, i] + priors[i]) / (n1 + a0 - count_matrix[0, i] - priors[i]))
        term2 = np.log((count_matrix[1, i] + priors[i]) / (n2 + a0 - count_matrix[1, i] - priors[i]))        
        delta = term1 - term2

        # compute variance on delta
        var = 1. / (count_matrix[0, i] + priors[i]) + 1. / (count_matrix[1, i] + priors[i])

        # store final score
        z_scores[i] = delta / np.sqrt(var)

    index_to_term = {v:k for k, v in cv.vocabulary_.items()}
    sorted_indices = np.argsort(z_scores)
    z_scores = pd.DataFrame([(index_to_term[i], z_scores[i]) for i in sorted_indices], columns=['term', 'z-score'])
    logger.info("Done")

    return z_scores, count_matrix
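# Hedged usage sketch (illustrative sentences, not from the source). A custom cv with
# min_df=1 is passed because the default requires min_df=10; basic_sanitize and logger are
# assumed to be defined elsewhere in this module.
z_df, counts = bayes_compare_language(
    ["good movie", "great plot", "loved it"],
    ["bad movie", "awful plot", "hated it"],
    cv=CV(min_df=1),
)
print(z_df.head())    # terms most associated with the second sample come first
print(counts.shape)   # (2, vocab_size)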
Example #29
        if settings.ALGORITHM:
            print("Data berhasil difetch")
            print("Data Train Unknown: %d Data Train: %d Data Test: %d" %
                  (len(train_data_unknown), len(train_data), len(test_data)))

        filtered = []
        for i in range(0, len(train_data_unknown), len(train_data)):
            X = [j[3] for j in train_data]
            y = [int(1) for j in train_data]
            to_evaluate = [
                j[3] for j in train_data_unknown[i:i + len(train_data)]
            ]
            X += to_evaluate
            y += [int(0) for j in to_evaluate]

            counter = CV()
            vector = counter.fit_transform(clean_text(X))
            to_evaluate_vector = counter.transform(clean_text(to_evaluate))

            bayes = NB()
            bayes.fit(vector, y)
            predict = bayes.predict_proba(to_evaluate_vector)

            for j in range(len(predict)):
                if predict[j][1] > 0.9:
                    filtered.append(train_data_unknown[i + j])

        if settings.DEBUG_MODE:
            print("Data berhasil difilter")
            print("Data Filtered: ", len(filtered))
Example #30
    # object runs stem() func
    # set() will make bigger texts run faster
    review = [
        ps.stem(word) for word in review
        if not word in set(stopwords.words('english'))
    ]
    # join the result into one string, by space
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model:
# After the "review" string, create cols for each word, and mark 1 of that word occurs in a string
from sklearn.feature_extraction.text import CountVectorizer as CV
# CountVectorizer can also remove stop words, and could do the cleaning above as well
# the full vocabulary would be 1565 words; we keep only the 1500 most frequent
cv = CV(max_features=1500)
# fit_transform returns a sparse matrix; toarray() converts it to a dense matrix
X = cv.fit_transform(corpus).toarray()
# The dependent variable: 1 (positive review) or 0 (negative)
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer versions
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()