Example #1
def getFlair():
    link = request.args.get('link')
    reddit = praw.Reddit(client_id='',
                         client_secret='',
                         user_agent='',
                         username='',
                         password='')
    submission = reddit.submission(url=link)

    posts = []
    submission.comments.replace_more(limit=None)
    commentList = ''
    for comment in submission.comments:
        commentList += " " + comment.body
    posts.append([
        submission.link_flair_text, submission.title, submission.id,
        submission.url, submission.created, commentList, submission.author
    ])
    data = pandas.DataFrame(posts,
                            columns=[
                                'link_flair_text', 'title', 'id', 'url',
                                'created', 'comments', 'author'
                            ])
    data[['title']] = data.apply(lambda x: cleaning.clean(x['title']), axis=1)
    data[['comments']] = data.apply(lambda x: cleaning.clean(x['comments']),
                                    axis=1)

    X = data['title'] + data['url'] + data['comments'] + data['id']

    loaded_model = pickle.load(open("log_reg_combined_model.sav", 'rb'))
    predicted_flair = loaded_model.predict(X)
    return str(predicted_flair)
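Since getFlair() reads request.args and returns a string, it is presumably a Flask view function. A minimal wiring sketch follows; the app object and route path are assumptions for illustration, not part of the original example, and the snippet above still needs its own imports (flask.request, praw, pandas, pickle, cleaning).

# Hypothetical Flask wiring for getFlair(); the route path is made up.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/getFlair', view_func=getFlair)

if __name__ == '__main__':
    app.run()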
Example #2
def start(maxTerm, keyword, pbarData):
    print("Your search term is: " + keyword)
    DATA_PATH = './input_data/tweets.csv'
    FOLDER = './input_data'
    SAVE_PATH = './outputs/results/results.csv'
    # Remove any old tweets.csv
    try:
        os.remove('./input_data/tweets.csv')
        print("Removed tweets.csv")
    except OSError as e:
        logger.error("tweets.csv has not been created yet. " + str(e))

    # Scrape Twitter for tweets and save them in input_data/tweets.csv
    print("Scraping...")
    scrape(
        maxTerm, keyword
    )  # fakeuseragent error always happens here; it does not affect function.
    sleep(2)  # Give time for the file to be created

    # Assert there is a new tweets.csv
    assert os.path.exists(DATA_PATH), "tweets.csv does not exist."
    print("Scraping finished. Translation being performed...")
    pbarData.incrementStage()

    # Translate tweet column.
    translate.translate(DATA_PATH, pbarData)
    print("Translation complete. Cleaning in progress...")
    # Clean the data
    cleaning.clean(DATA_PATH)

    # # for BERT model
    # print("Cleaning complete. Converting to tsv...")
    # # Change it into correctly formatted .tsv
    # # In case there is already an old dev.tsv
    # try:
    #        os.remove('./input_data/dev.tsv')
    #        print("Removed dev.tsv")
    # except OSError as e:
    #        pass
    #
    # dataprocessing.change_to_tsv(DATA_PATH, FOLDER)
    # print("Preprocessing complete. Processing...")
    # # Put it into BERT. >> Remember to make BERT save the predictions somewhere
    # predicted = bert_eval.predict(FOLDER)

    # LinearSVC model
    print("Cleaning complete. Predicting...")
    processed = SVCmodel.generate(DATA_PATH)
    predicted = SVCmodel.predict(processed, DATA_PATH, SAVE_PATH)
    print("View results at ./outputs/results/results.csv")
    return predicted
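start() only ever calls incrementStage() on pbarData in this snippet (translate.translate may expect more), so a minimal driver sketch could stub it as shown below; the keyword and tweet limit are made up for illustration.

# Hypothetical driver; ProgressStub assumes incrementStage() is the only
# method required, which is the single call visible in start() above.
class ProgressStub:
    def __init__(self):
        self.stage = 0

    def incrementStage(self):
        self.stage += 1
        print('Finished stage', self.stage)

if __name__ == '__main__':
    start(maxTerm=100, keyword='train delay', pbarData=ProgressStub())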
Example #3
def parse(bio, name, extras=[], translations=[], paragraphs=False, url_context='/'):
    # check we've got a string here
    assert type(bio) == str

    # run the new and improved cleaning
    bio = cleaning.clean(bio, name)

    # remove html special characters
    bio = html.unescape(bio)

    # convert formula to katex
    regex = re.compile(r'\\(?P<math>.+?)\\\\', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, mathreplace, bio)

    # convert special sets
    regex = re.compile(r'`(.)`', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, r'<latex>\\mathbb{\1}</latex>', bio)  # double backslash: \m is a bad escape in a replacement template

    # check that the link location is correct
    regex = re.compile(r'<a\s+href\s*=\s*[\'"]?(?P<href>.+?)[\'"]?\s*>(?P<text>.*?)<\/a>')
    bio = re.sub(regex, lambda match: urlreplace(match, url_context), bio)

    # convert m links to w links
    regex = re.compile(r'<w(?:\s+(?P<name>.+?))?>(?P<text>.*?)\<\/w\>', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, mreplace, bio)

    # convert <font color=...>...</font> to f+, etc.
    regex = re.compile(r'<font color\s*=\s*[\'"]?(?P<color>\w+)[\'"]?\s*>(?P<text>.*?)</font>', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, fontreplace, bio)

    # check that the image location is correct
    regex = re.compile(r'<d\s+(?P<content>.+?)>', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, lambda match: dreplace(match, url_context, name), bio)

    # convert to normal diagrams
    regex = re.compile(r'(?P<tag><img\s+.+?>)', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, lambda match: imgreplace(match, url_context), bio)

    # remove manual italics from numbers/brackets
    regex = re.compile(r'<\/i>([\d\[\]\(\)]+)<i>', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, r'\1', bio)

    # remove size from glossary entry
    regex = re.compile(r'<g\s+(?P<glossary>.+?)(?:,\d+(?:,\d+)?)?>(?P<text>.*?)\<\/g\>', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, r'<g \1>\2</g>', bio)

    # convert symbolgifs to unicode
    bio = symbolreplace.symbols_to_unicode(bio)

    # normalize smart quotes to straight quotes
    bio = bio.replace('’', "'")
    bio = bio.replace('‘', "'")
    bio = bio.replace('“', '"')
    bio = bio.replace('”', '"')

    # remove any print-only tags
    bio = bio.replace('<pr>', '')
    bio = bio.replace('</pr>', '')

    return bio
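As a stand-alone check of the special-sets rule (and of the doubled backslash needed so re.sub does not reject \m as a bad escape in the replacement template), the following runs independently of the rest of parse():

# Isolated demo of the `X` -> <latex>\mathbb{X}</latex> substitution.
import re

regex = re.compile(r'`(.)`')
text = 'the reals `R` and the integers `Z`'
print(re.sub(regex, r'<latex>\\mathbb{\1}</latex>', text))
# prints: the reals <latex>\mathbb{R}</latex> and the integers <latex>\mathbb{Z}</latex>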
Example #4
File: datasets.py Project: kembo91/sal-mac
def __init__(self, labels, root, transform, salient=False, n_negatives=5):
    self.root = root
    self.transforms = transform
    self.df = clean(root, labels)
    self.data = None
    self.n_negatives = n_negatives
    self.salient = salient
Example #5
def nn_results():
    global results
    if request.method == 'POST':
        file = request.files['file']
        df = pandas.read_csv(file)
        #set months and clean the file
        months = df['month']
        clean_df = clean(df).values

        loaded_model = pickle.load(open("nn_model.pkl", "rb"))
        #use predict_classes not predict
        print("model loaded")
        ynew = loaded_model.predict_classes(clean_df)
        print("prediction step ")

        #call getResults function or module : Returns json data for charts
        # create var results or data for getResults()

        results = getResults(ynew, months)
        print("results : ", results)

        #df = pandas.DataFrame(nn_results)
        #dlist = df.values.list()
        #return render_template("examples/exp.html", data=results )
        return jsonify({'results': results})
    else:
        print('GET the problem')
        return jsonify({'results': results})
Example #6
def analyze_message(value):
    col_names = [
        'Station name', 'Train name', 'Category', 'Platform number', 'Is spam',
        'If delay'
    ]
    output = pd.DataFrame(columns=col_names)
    #print(value)
    a, b, d, t = cleaning.clean(value)
    c, e, f = np.nan, np.nan, np.nan
    tex = pd.Series(value)
    numtext = tokenize.texts_to_matrix(tex)

    ee = fit_model1.predict(numtext)
    index = np.argmax(ee)
    if index == 0:
        e = False
    else:
        e = True
    if t is not np.nan:
        predicted = fit_model2.predict(t)
        index = np.argmax(predicted)
        c = encoder.inverse_transform([index])
        f = cleaning.get_delay_time(c, value)
        #print(encoder.inverse_transform([index]))
    output = output.append(pd.Series([a, b, c, d, e, f], index=col_names),
                           ignore_index=True)
    print(tabulate(output, headers=col_names, tablefmt='psql'))
Example #7
def accept_generic_model(name=None):
    global results
    if request.method == 'POST':
        file = request.files['file']
        df = pandas.read_csv(file)
        #generic cleaning
        clean_df = clean(df).values
        #Build neural network // parameters ??
        nn = build_nn()
        #use predict_classes not predict

        print("results : ", results)
        return jsonify({'results': results})
    else:
        print('GET the problem')
        return jsonify({'results': results})
Example #8
def create_files(lines):
    for line in lines:
        text = line[1]
        text = clean(text)

        label = line[0]
        label = re.sub("[^a-zA-Z]", '', label)

        if (len(label) > 4):
            continue

        directory_name = 'data/input/' + label + '/'
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        with open(directory_name + "train.txt", "a+") as f:
            f.write(text)
Example #9
def train():
    #Loads the data from the local storage
    synopses = []
    for filename in os.listdir('cnn-stories'):
        with open('cnn-stories/' + filename, 'r') as infile:
            synopses.append(infile.read())

    #Cleans the data
    corpus, dictionary = clean(synopses)

    #Saves the model and the dictionary in local storage
    corpora.Dictionary.save(dictionary, 'dictionary.dict')
    lda = models.LdaModel(corpus,
                          num_topics=10,
                          id2word=dictionary,
                          update_every=5,
                          chunksize=10000,
                          passes=100)
    lda.save('lda.model')
Example #10
def normalize(motion, translate='', rotate='', scale='', clean=True):
    """!
    Apply normalization to a motion.

    The input motion is not modified.

    @param motion numpy.array: The motion to normalize
    @param translate: The normalization for translating the motions
    @param rotate: The normalization for rotating the motions
    @param scale: The normalization for scaling the motions
    @param clean: Remove duplicate points and large jumps, default True
    @return: The normalized motion and the normalization parameters
    """
    out = motion
    translationRef = translation.translate(out[:, 1:4], translate)
    rotationRef = rotation.rotate(out[:, 1:8], rotate)
    scalingRef = scaling.scale(out[:, 1:4], scale)
    out, removedPoints = cleaning.clean(motion)
    return out, translationRef, rotationRef, scalingRef
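A usage sketch for normalize(); the row layout [t, x, y, z, qw, qx, qy, qz] is only inferred from the [:, 1:4] and [:, 1:8] slices above, and the project's translation/rotation/scaling/cleaning modules are assumed to be importable.

# Hypothetical call with default normalization arguments; the column layout
# and the availability of the helper modules are assumptions.
import numpy as np

motion = np.zeros((50, 8))
motion[:, 0] = np.linspace(0.0, 1.0, 50)  # timestamps
motion[:, 4] = 1.0                        # quaternion w component (assumed order)
out, translationRef, rotationRef, scalingRef = normalize(motion)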
Example #11
    def create_examples(self, lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            if (i == 0): continue
            id_num = "%s-%s" % (set_type, i)
            text = line[1]
            text = clean(text)

            label = line[0]
            label = re.sub("[^a-zA-Z]", '', label)
            label = label.lower()
            if (len(label) > 4): continue
            
            if (self.mode == "E/I" or self.mode == "I/E"): label = label[0]
            elif (self.mode == "N/S" or self.mode == "S/N"): label = label[1]
            elif (self.mode == "T/F" or self.mode == "F/T"): label = label[2]
            elif (self.mode == "J/P" or self.mode == "P/J"): label = label[3]

            examples.append(InputExample(guid=id_num, text=text, label=label))
        return examples
Example #12
reddit = praw.Reddit(client_id='',
                     client_secret='',
                     user_agent='',
                     username='',
                     password='')
submission = reddit.submission(url="link_you_want_to_check")

posts = []
submission.comments.replace_more(limit=None)
commentList = ''
for comment in submission.comments:
    commentList += " " + comment.body
posts.append([
    submission.link_flair_text, submission.title, submission.id,
    submission.url, submission.created, commentList, submission.author
])
data = pandas.DataFrame(posts,
                        columns=[
                            'link_flair_text', 'title', 'id', 'url', 'created',
                            'comments', 'author'
                        ])
data[['title']] = data.apply(lambda x: cleaning.clean(x['title']), axis=1)
data[['comments']] = data.apply(lambda x: cleaning.clean(x['comments']),
                                axis=1)

X = data['title'] + data['url'] + data['comments'] + data['id']

loaded_model = pickle.load(open("model.sav", 'rb'))
predicted_flair = loaded_model.predict(X)
print(predicted_flair, submission.link_flair_text)
Example #13
def prediction(vectorized_data):
    target_column = vectorized_data['Number of Votes']
    predictor_columns = vectorized_data.drop('Number of Votes', 1)
    vector_columns = vectorized_data[vector_headers]
    
    
    # reindex returns a new frame; assign it back so the shuffle takes effect
    vectorized_data = vectorized_data.reindex(np.random.permutation(vectorized_data.index))
    NUM_ROWS = vectorized_data.shape[0]
    NUM_TEST = int(NUM_ROWS*.15)
    
    train_data = vectorized_data[NUM_TEST:]
    train_target = train_data['Number of Votes']
    train_data = train_data[vector_header]

    test_data = vectorized_data[:NUM_TEST]
    test_target = test_data['Number of Votes']
    test_data = test_data[vector_header]
    
    #(train_data, test_data, train_target, test_target) =  ms.train_test_split(predictor_columns, target_column, test_size = 0.15)    
    
    classifier = RandomForestClassifier(n_estimators=10)
    classifier = classifier.fit(train_data[vector_headers], train_target)
    results = classifier.predict(test_data[vector_headers])
    
    output = pd.DataFrame(data={"Candidate":test_data['Candidate'], "County":test_data['County'], "Estimated Votes":results, "Actual Votes":test_target})    
    return output

cleaned_tweets = clean()
vectorized_data = vectorize(cleaned_tweets)
results = prediction(vectorized_data)
print(results)
Example #14
import pandas as pd
import cleaning as cl
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

df_train = pd.read_csv('datasets/train.csv', encoding = "ISO-8859-1")
df_test = pd.read_csv('datasets/test.csv', encoding = "ISO-8859-1")

address =  pd.read_csv('datasets/addresses.csv')
latlons = pd.read_csv('datasets/latlons.csv')

X_train, X_test, y_train = cl.clean(df_train, df_test, address, latlons)

X_train.info()

X_train.shape
X_test.shape
y_train.shape

grid_values = {'learning_rate': [0.01, 0.1, 1], 'max_depth': [3,5]}
clf = GradientBoostingClassifier(random_state = 0)
grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
grid_clf_auc.fit(X_train, y_train)

grid_clf_auc.best_score_

grid_values2 = {'n_estimators': [5, 10, 20], 'max_features': [2,3,4], 'max_depth': [3,5]}
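The snippet stops after defining grid_values2; a plausible continuation, mirroring the GradientBoosting search above, would grid-search a RandomForestClassifier (that estimator choice is an assumption based on the parameter names).

# Hypothetical continuation using the imports already present above.
clf2 = RandomForestClassifier(random_state=0)
grid_clf2_auc = GridSearchCV(clf2, param_grid=grid_values2, scoring='roc_auc')
grid_clf2_auc.fit(X_train, y_train)
grid_clf2_auc.best_score_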
Example #15
modelFileLoad2 = open('models/model_m1', 'rb')
encoder = LabelEncoder()
encoder.classes_ = np.load('labelencoder/encoder_m1.npy')

fit_model1 = pickle.load(modelFileLoad1)
fit_model2 = pickle.load(modelFileLoad2)

col_names = [
    'Message', 'Station name', 'Train name', 'Category', 'Platform number',
    'Is spam', 'If delay'
]
output = pd.DataFrame(columns=col_names)

for idx, value in enumerate(train.m):
    #print(value)
    a, b, d, t = cleaning.clean(value)
    c, e, f = np.nan, np.nan, np.nan
    tex = pd.Series(value)
    numtext = tokenize.texts_to_matrix(tex)

    ee = fit_model1.predict(numtext)
    index = np.argmax(ee)
    if index == 0:
        e = False
    else:
        e = True
    if t is not np.nan:
        predicted = fit_model2.predict(t)
        index = np.argmax(predicted)
        c = encoder.inverse_transform([index])
        f = cleaning.get_delay_time(c, value)
Example #16
def apply_clean(text):
    cleaned = clean_text(text)
    cleaned = clean(cleaned)
    cleaned = stemming(cleaned)
    cleaned = remove_lemmatization(cleaned)
    return cleaned
Example #17
def main(Args):
    '''
    Main function for stock price prediction.
    :param Args: arguments acquired from the command line (refer to ParseArgs() for the list of args)
    '''

    stockFilePath = Args.stockfilepath
    newsFilePath = Args.newsfilepath
    nCpuCores = Args.ncpucores
    testStockFilePath = Args.teststockfilepath
    testNewsFilePath = Args.testnewsfilepath
    modelTimeSeries = Args.modeltimeseries
    modelNews = Args.modelnews

    Logger.debug(
        "StockDataPath: {}, NewsDataPath: {}, NCpuCores: {}, TestStockFilePath: {}, TestNewsFilePath: {}"
        .format(stockFilePath, newsFilePath, nCpuCores, testStockFilePath,
                testNewsFilePath))

    #Time Series Analysis
    dataFrame = pd.read_csv(stockFilePath, parse_dates=True, index_col="date")
    testDataFrame = pd.read_csv(testStockFilePath,
                                parse_dates=True,
                                index_col="date")

    colsToInterpolate = ['open', 'high', 'low', 'close', 'adj_close', 'volume']

    #Interpolate
    dataFrame = interpolate(dataFrame, colsToInterpolate)
    testDataFrame = interpolate(testDataFrame, colsToInterpolate)

    #Preprocessing
    dataFrame = preprocessing(dataFrame)
    attributes = dataFrame.drop('close', axis=1)
    target = dataFrame.loc[:, 'close']
    testDataFrame = preprocessing(testDataFrame)
    testAttributes = testDataFrame.drop('close', axis=1)
    testTarget = testDataFrame.loc[:, 'close']

    #Normalization - converting values to comparable range
    attributes['volume'] /= 100000
    testAttributes['volume'] /= 100000

    #Predictions and errors of different algorithms for time series
    errorTimeSeries, predictionTimeSeries = predictFromTimeSeries(
        attributes, target, testAttributes, testTarget, nCpuCores)

    #News Analysis
    newsDataFrame = pd.read_csv(newsFilePath,
                                parse_dates=True,
                                index_col='date')
    testNewsDataFrame = pd.read_csv(testNewsFilePath,
                                    parse_dates=True,
                                    index_col='date')

    #Cleaning of textual data
    newsAttributes, newsTarget = clean(newsDataFrame)
    newsTestAttributes, newsTestTarget = clean(testNewsDataFrame)

    #Embeddings
    newsAttributes, newsTestAttributes = embeddings(newsAttributes,
                                                    newsTestAttributes)

    #Predictions and errors of different algorithms for news
    errorNews, predictionNews = predictFromNews(newsAttributes, newsTarget,
                                                newsTestAttributes,
                                                newsTestTarget, nCpuCores)

    combinePredictions(errorTimeSeries, errorNews, predictionTimeSeries,
                       predictionNews, testTarget)
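main() reads its inputs from an Args object and the docstring points at ParseArgs(); a hypothetical reconstruction with argparse, matching only the attributes accessed above (flag names and defaults are assumptions), could look like this:

# Hypothetical ParseArgs(); only the attribute names come from main() above.
import argparse

def ParseArgs():
    parser = argparse.ArgumentParser(description='Stock price prediction')
    parser.add_argument('--stockfilepath', required=True)
    parser.add_argument('--newsfilepath', required=True)
    parser.add_argument('--ncpucores', type=int, default=1)
    parser.add_argument('--teststockfilepath', required=True)
    parser.add_argument('--testnewsfilepath', required=True)
    parser.add_argument('--modeltimeseries', default=None)
    parser.add_argument('--modelnews', default=None)
    return parser.parse_args()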
Example #18
"""
  Runs the project.
"""

# Import local methods.
import loading
import cleaning
import features
import training

# Load raw data into datasets.
loading.load()

# Clean data.
cleaning.clean()

# Create features.
features.create_features()

# Train neural network.
training.train_neural_network()

# Train logistic regression.
training.train_logistic_regression()