Example #1
def neural_network_classification(metrics):

    training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
        metrics)

    activation = "relu"
    model = Sequential()
    model.add(
        Dense(len(metrics) * 2,
              activation=activation,
              kernel_regularizer=regularizers.l2(0.1),
              input_shape=(len(metrics), )))
    model.add(
        Dense(30,
              activation=activation,
              kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss="binary_crossentropy")
    model.fit(training_data,
              training_labels,
              epochs=30,
              batch_size=300,
              validation_data=(test_data, test_labels),
              verbose=0)

    data = np.concatenate((training_data, test_data))
    labels = np.concatenate((training_labels, test_labels))

    predictions = model.predict(data)
    predictions = np.squeeze(predictions, axis=1)

    return data, predictions, labels, categories, mappings
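The sigmoid output layer produces probabilities in [0, 1]. A minimal sketch of turning the returned predictions into hard labels and scoring them (the 0.5 threshold, the accuracy_score call, and the metrics list are my additions, not part of the original):

from sklearn.metrics import accuracy_score

data, predictions, labels, categories, mappings = neural_network_classification(metrics)
hard_labels = (predictions >= 0.5).astype(int)  # threshold the sigmoid probabilities
print("Accuracy:", accuracy_score(labels, hard_labels))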
Example #2
def infer(model, fnImg):
    "recognize text in image provided by file path"
    img = preprocess(cv2.imread(fnImg, cv2.IMREAD_GRAYSCALE), Model.imgSize)
    batch = BatchImages(None, [img])
    (recognized, probability) = model.inferBatch(batch, True)
    print('Recognized:', '"' + recognized[0] + '"')
    print('Probability:', probability[0])
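A hypothetical invocation, assuming model is an already-trained Model instance and the path points to a grayscale word image (both are assumptions, not from the original):

infer(model, 'data/word.png')  # prints the recognized text and its probability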
Example #3
def neural_network_classification(metrics_):
    """This function takes as input  a list of metrics and uses a neural network to perform the classification.
    :param metrics_: List containing the metrics that we want to use for classification."""

    training_data, training_labels, test_data, test_labels, categories_, mappings_ = preprocess(
        metrics_)

    activation = "relu"
    model = Sequential()
    model.add(
        Dense(len(metrics_) * 2,
              activation=activation,
              kernel_regularizer=regularizers.l2(0.1),
              input_shape=(len(metrics_), )))
    model.add(
        Dense(30,
              activation=activation,
              kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss="binary_crossentropy")
    model.fit(training_data,
              training_labels,
              epochs=30,
              batch_size=300,
              validation_data=(test_data, test_labels),
              verbose=1)

    data_ = np.concatenate((training_data, test_data))
    labels_ = np.concatenate((training_labels, test_labels))

    predictions_ = model.predict(data_)
    predictions_ = np.squeeze(predictions_, axis=1)
    print("End")
    return data_, predictions_, labels_, categories_, mappings_
Example #4
    def predict_SKLearn(self, vectoriser, model, text):
        # Predict the sentiment of a single text
        preprocessed, _ = preprocess([text])
        preprocessed_text = ' '.join([str(word) for word in preprocessed])
        textdata = vectoriser.transform([preprocessed_text])
        prediction = model.predict(textdata)
        return prediction[0]
Example #5
    def nextBatch(self):
        rangeOfBatch = range(self.currIdx, self.currIdx + self.batchSize)
        trueText = [self.samples[i].trueText for i in rangeOfBatch]

        imgs = [
            preprocess(cv2.imread(self.samples[i].filePath, cv2.IMREAD_GRAYSCALE), self.imgSize, self.dataAugmentation) for i in rangeOfBatch]

        self.currIdx += self.batchSize
        return BatchImages(trueText, imgs)   
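A natural companion guard (my sketch, not in the original) to stop iterating before nextBatch indexes past the sample list, assuming self.samples holds all samples:

    def hasNext(self):
        # True while a full batch can still be sliced from the remaining samples
        return self.currIdx + self.batchSize <= len(self.samples)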
Example #6
def predict(vectoriser, model, text):
    # Predict the sentiment
    preprocessed, _ = preprocess(text)
    preprocessed_text = ' '.join([str(word) for word in preprocessed[0]])
    textdata = vectoriser.transform(word_tokenize(preprocessed_text))
    prediction = model.predict(textdata)

    print(prediction[0])
Example #7
def SVM_classification(metrics):
    training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
        metrics, recalculate=False, causal=False)

    np.random.seed(42)
    SVR = svm.LinearSVR(C=1.0 / float(len(test_data)), max_iter=5000)
    SVR.fit(training_data, training_labels)

    data = np.concatenate((training_data, test_data))
    labels = np.concatenate((training_labels, test_labels))

    predictions = SVR.predict(data)
    return data, predictions, labels, categories, mappings
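Note that svm.LinearSVR is a regressor, so the predictions returned here are continuous scores rather than class labels. A minimal sketch of binarizing them (the 0.5 cutoff and the metrics list are my assumptions):

data, predictions, labels, categories, mappings = SVM_classification(metrics)
binary_predictions = (predictions > 0.5).astype(int)  # assumed cutoff for 0/1 labels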
Example #8
def naive_bayes_classification(metrics):
    training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
        metrics)

    NBC = MultinomialNB()
    NBC.fit(training_data, training_labels)

    data = np.concatenate((training_data, test_data))
    labels = np.concatenate((training_labels, test_labels))

    class_predictions = NBC.predict_proba(data)
    predictions = []

    for i in range(len(labels)):
        predictions.append(class_predictions[i][1])

    return data, predictions, labels, categories, mappings
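The loop above copies the positive-class column of predict_proba element by element. An equivalent, vectorized one-liner (a sketch; column 1 of predict_proba holds the probability of class 1):

predictions = NBC.predict_proba(data)[:, 1]  # P(class 1) for every row at once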
Example #9
def SVM_classification(metrics_):
    """This function takes as input a list of metrics and uses a support vector machine to perform the classification.
    :param metrics_: List containing the metrics that we want to use for classification."""

    training_data, training_labels, test_data, test_labels, categories, mappings = \
        preprocess(metrics_, recalculate=False, causal=False)

    np.random.seed(42)
    SVR = svm.LinearSVR(C=1.0 / float(len(test_data)), max_iter=5000)
    SVR.fit(training_data, training_labels)

    data = np.concatenate((training_data, test_data))
    labels = np.concatenate((training_labels, test_labels))

    predictions = SVR.predict(data)

    return data, predictions, labels, categories, mappings
Example #10
def naive_bayes_classification(metrics):
    """This function takes as input a list of metrics and performs the Naive Bayes classification.
    :param metrics: List containing the metrics that we want to use for classification."""

    training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
        metrics)

    NBC = MultinomialNB()
    NBC.fit(training_data, training_labels)

    data = np.concatenate((training_data, test_data))
    labels = np.concatenate((training_labels, test_labels))

    class_predictions = NBC.predict_proba(data)
    predictions = []

    for i in range(len(labels)):
        predictions.append(class_predictions[i][1])

    return data, predictions, labels, categories, mappings
Example #11
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
import plotly.graph_objects as go
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt


# In[ ]:


# Instantiate the preprocessing helper
preprocessing = preprocess()


# In[ ]:


# Preprocess the file data to generate the word list
def generateWordList(fileData):  # Generate a list of preprocessed words from the file
    finalSet = preprocessing.checkEmail(fileData)  # Check for emails in the file and add them to the word list
    fileData = preprocessing.removeEmail(fileData)  # Remove emails
    finalSet = finalSet + preprocessing.checkWebsite(fileData)  # Check for websites and decimal numbers and add them to the word list
    fileData = preprocessing.removeWebsite(fileData)  # Remove websites and decimal numbers
    fileData = preprocessing.contractionsExpand(fileData)  # Expand contracted words
    fileData = preprocessing.caseChange(fileData)  # Convert the text to lower case
    fileData = preprocessing.removePunctuations(fileData)  # Remove punctuation
    wordList = preprocessing.wordSeperator(fileData)  # Separate words by whitespace
    return finalSet + wordList  # Assumed return value; the original snippet is truncated here
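Hypothetical usage, assuming generateWordList returns the combined word list as sketched above and that the path points to one raw message file (both are assumptions, not from the original):

email_text = open('data/email_001.txt', encoding='latin-1').read()
words = generateWordList(email_text)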
Example #12
from sklearn import svm
from Preprocessing import preprocess
from Postprocessing import *
from utils import *

metrics = [
    "race", "sex", "age", 'c_charge_degree', 'priors_count', 'c_charge_desc'
]
training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
    metrics)

SVR = svm.LinearSVR(C=1.0 / float(len(test_data)), max_iter=10000)
SVR.fit(training_data, training_labels)

training_class_predictions = SVR.predict(training_data)
training_predictions = []
test_class_predictions = SVR.predict(test_data)
test_predictions = []

for i in range(len(training_labels)):
    training_predictions.append(training_class_predictions[i])

for i in range(len(test_labels)):
    test_predictions.append(test_class_predictions[i])

training_race_cases = get_cases_by_metric(training_data, categories, "race",
                                          mappings, training_predictions,
                                          training_labels)
test_race_cases = get_cases_by_metric(test_data, categories, "race", mappings,
                                      test_predictions, test_labels)
Example #13
from sklearn import svm
from Preprocessing import preprocess
from Report_Results import *
import numpy as np
from utils import *

metrics = ["sex", "age", 'race', 'c_charge_degree', 'priors_count', 'c_charge_desc'] # age_cat
training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(metrics, recalculate=False, causal=False)

def SVM_classification(metrics):

    np.random.seed(42)
    SVR = svm.LinearSVR(C=1.0/float(len(test_data)), max_iter=5000) # 5600
    SVR.fit(training_data, training_labels)

    #data = np.concatenate((training_data, test_data))
    #labels = np.concatenate((training_labels, test_labels))

    training_predictions = SVR.predict(training_data) # data
    testing_predictions = SVR.predict(test_data)
    return training_data, test_data, training_predictions, testing_predictions, training_labels, categories, mappings # labels

#######################################################################################################################

training_data, test_data, training_predictions, testing_predictions, labels, categories, mappings = SVM_classification(metrics)


training_race_cases = get_cases_by_metric(training_data, categories, "race", mappings, training_predictions, training_labels)
test_race_cases = get_cases_by_metric(test_data, categories, "race", mappings, testing_predictions, test_labels)

Example #14
import pandas

from DB import db
from Preprocessing import preprocess

count = 0
db = db()

allTweets = db.getAll()

hashtagDatetimes = []
for tweet in allTweets:
    # Hashtags list
    terms_hash = [
        term for term in preprocess(tweet['text'].lower())
        if term.startswith('#')
    ]
    if '#kathygriffin' in terms_hash:
        hashtagDatetimes.append(tweet['created_at'])
    # if count == 60000:
    #     break
    count += 1
    print("\rLive number of processed tweets: " + str(count), end="")

print("\n")
print("length of occurence array : " + str(len(hashtagDatetimes)))

# a list of "1" to count the hashtags
ones = [1] * len(hashtagDatetimes)
# the index of the series
idx = pandas.DatetimeIndex(hashtagDatetimes)
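A plausible continuation (my sketch, not in the original snippet): bucket the hashtag occurrences into per-minute counts with pandas resampling.

hashtag_series = pandas.Series(ones, index=idx)
per_minute = hashtag_series.resample('1Min').sum().fillna(0)  # tweets per minute using the hashtag
print(per_minute.head())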
Example #15
import time

import pandas as pd
from nltk.corpus import stopwords

from Preprocessing import preprocess, bigrams, trigrams

#       --- IMPORTING DATASET ---
# Using Sentiment140 dataset with 1.6 million tweets
# https://www.kaggle.com/kazanova/sentiment140/data
DATASET_COLUMNS = ['target', 'ids', 'date', 'flag', 'user', 'text']
tweets_raw = pd.read_csv(r'.\dataset\tweets.csv', sep=',', quotechar='"', encoding='latin-1', names=DATASET_COLUMNS)
tweets_raw['target'] = tweets_raw['target'].replace(4, 'pos')
tweets_raw['target'] = tweets_raw['target'].replace(0, 'neg')
tweets_raw, label = list(tweets_raw['text']), list(tweets_raw['target'])

neg_tweets_raw = tweets_raw[:800000]
pos_tweets_raw = tweets_raw[800000:]

t = time.time()
pos_tweets, pos_words_ngrams = preprocess(pos_tweets_raw)
neg_tweets, neg_words_ngrams = preprocess(neg_tweets_raw)
print(f'Text Preprocessing complete.')
print(f'Time Taken: {round(time.time()-t)} seconds')

while len(neg_tweets) > len(pos_tweets):
    del neg_tweets[0]
    del label[-1]
while len(pos_tweets) > len(neg_tweets):
    del pos_tweets[0]
    del label[0]

t = time.time()
bgram = bigrams(neg_words_ngrams,stopwords.words('english'))
tgram = trigrams(neg_words_ngrams,stopwords.words('english'))
print(f'Ngrams processing complete.')
Example #16
from LSTM import lstm_model
from Ploting import plot
from Preprocessing import preprocess
from Trading_Algorithm import trading_algorithm, compute_earnings

csv_path = 'data/tickers_data/femeli-daily.csv'
history_points = 50

x_train, y_train, x_test, y_test, x_val, y_val, y_test_real, scale_back, tech_ind_train, tech_ind_test, tech_ind_val = preprocess(
    csv_path, history_points)
y_test_predicted, model = lstm_model(history_points, x_train, y_train, x_test,
                                     y_test, x_val, y_val, y_test_real,
                                     scale_back, tech_ind_train, tech_ind_test,
                                     tech_ind_val)
buys, sells = trading_algorithm(x_test, tech_ind_test, scale_back, model)
plot(y_test_real, y_test_predicted, buys, sells)
purchase_amt = 1000000000  # 100 million Toman
print(
    "{} Rials after trading over trading days of the test data will make profit {} Rials"
    .format(purchase_amt,
            compute_earnings(buys, sells, purchase_amt) - purchase_amt))
Example #17
from collections import Counter

from DB import db
from Preprocessing import preprocess, stop

count = 0
db = db()

allTweets = db.getAll()

count_all_hashtags = Counter()
count_all_terms = Counter()
dates_hashtag = []
for tweet in allTweets:
    tweetText = tweet['text'].lower()
    # Bigrams list
    termsWithoutStopwords = [
        term for term in preprocess(tweetText) if term not in stop
    ]
    # termsBigrams = bigrams(termsWithoutStopwords)

    # Hashtags list
    terms_hash = [
        term for term in preprocess(tweetText) if term.startswith('#')
    ]
    if '#marchfortruth' in terms_hash:
        dates_hashtag.append(tweet['created_at'])

    # Update the counter(s)
    count_all_terms.update(termsWithoutStopwords)
    count_all_hashtags.update(terms_hash)

    count += 1
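A plausible continuation (my sketch, not in the original snippet): report the most frequent terms and hashtags collected by the counters above.

print("Top terms:", count_all_terms.most_common(10))
print("Top hashtags:", count_all_hashtags.most_common(10))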