Example #1
def pipeline(website, filename):

    print("Crawling website: ", website)

    web_name = url_cleaning(website)
    website2mcc = read_website2mcc()

    # use a context manager so the output file is always closed
    with open(filename, "w", encoding="utf-8") as out_file:
        print("url_check: ", url_check(website))
        if url_check(website):
            content = get_content(website)

            if content is not None:
                print("web content is not None")
                content_processed = clean_data(content)
                out_file.write('%s, %s\n' % (website, content_processed))

            # get the list of URLs in the same domain
            urls = get_urls(website)
            print("number of urls: ", len(urls))
            for url in urls:
                if url_check(url):
                    content = get_content(url)
                    if content is not None:
                        content_processed = clean_data(content)
                        out_file.write('%s, %s\n' % (url, content_processed))
                        print("preprocessed ", url)
def main():

    tweets = tweet_dict(twitterData)  # contains tweets
    sentiment = arg_dict.sentiment_dict(sentimentData)  # contains dictionary of scores

    for index in range(len(tweets)):
        tweet_word = tweets[index]["text"].split()  # tokenizing every word of the tweet
        tweet_word = preprocess.clean_stopwords(tweet_word)  # removing stopwords from the list of words
        sent_score = 0  # initially the sentiment score is 0
        for word in tweet_word:  # accessing the tweet word by word
            word = word.lower()  # converting the word to lower case because all words in the sentiment file are lower case
            word = preprocess.clean_data(word)  # removing punctuation and URLs from the tweets

            if word != "":  # skip words that were cleaned away entirely
                if word in sentiment:  # checking if the word from the tweet is present in the sentiment file
                    sent_score = sent_score + int(sentiment[word])  # calculating the sentiment score of the word

                print(word, int(sent_score))  # printing the result to stdout
Example #3
async def predict(text: TextSample, request: Request):
    # text = request.json["text"]
    # TextSample (assumed to hold a single text field) stringifies as "text='...'",
    # so the raw string is recovered by splitting on '=' and stripping the quotes
    text = str(text)
    text_str = text.split('=')[1].replace("'", "")
    # print('text is  :',text_str)
    # print(type(text_str))
    try:
        text_cleaned = preprocess.clean_data(text_str)
        # print(text_cleaned)
        out = model.predict(text_cleaned)
        # print(type(out))
        # return jsonify({"result":out})
        words = {}
        a1 = []
        a2 = []
        a3 = []
        s = []
        c = []
        ps = []
        # print(out)
        for item in out:

            tag = item['tag'].split('-')
            word = item['word']

            if len(tag) == 2:
                if tag[1] == 'A1':
                    # print(word)
                    a1.append(word)
                    # words['A1'] = a1.append(word)
                elif tag[1] == 'A2':
                    a2.append(word)
                    # words['A2'] = a2.append(word)
                elif tag[1] == 'A3':
                    a3.append(word)
                    # words['A3'] = a3.append(word)
                elif tag[1] == 'C':
                    c.append(word)
                    # words['C'] = c.append(word)
                elif tag[1] == 'S':
                    s.append(word)
                    # words['S'] = s.append(word)
                elif tag[1] == 'PC':
                    ps.append(word)
                    # words['PS'] = ps.append(word)

        words['A1'] = " ".join(a1)  # address1
        words['A2'] = " ".join(a2)
        words['A3'] = " ".join(a3)
        words['C'] = " ".join(c)
        words['S'] = " ".join(s)
        words['PS'] = " ".join(ps)
        # print(words)
        json_compatible_item_data = jsonable_encoder(words)
        return JSONResponse(content=json_compatible_item_data)

    except Exception as e:
        print(e)
        return {"result": "Model Failed"}
def predict():
    text = request.json["text"]
    print(text)
    print(type(text))
    try:
        text_cleaned = preprocess.clean_data(text)
        # print(text_cleaned)
        out = model.predict(text_cleaned)
        # print(type(out))
        # return jsonify({"result":out})
        words = {}
        a1 = []
        a2 = []
        a3 = []
        s = []
        c = []
        ps = []
        # print(out)
        for item in out:

            tag = item['tag'].split('-')
            word = item['word']

            if len(tag) == 2:
                if tag[1] == 'A1':
                    # print(word)
                    a1.append(word)
                    # words['A1'] = a1.append(word)
                elif tag[1] == 'A2':
                    a2.append(word)
                    # words['A2'] = a2.append(word)
                elif tag[1] == 'A3':
                    a3.append(word)
                    # words['A3'] = a3.append(word)
                elif tag[1] == 'C':
                    c.append(word)
                    # words['C'] = c.append(word)
                elif tag[1] == 'S':
                    s.append(word)
                    # words['S'] = s.append(word)
                elif tag[1] == 'PC':
                    ps.append(word)
                    # words['PS'] = ps.append(word)

        words['A1'] = " ".join(a1)  #address1
        words['A2'] = " ".join(a2)
        words['A3'] = " ".join(a3)
        words['C'] = " ".join(c)
        words['S'] = " ".join(s)
        words['PS'] = " ".join(ps)
        print(words)
        print(type(words))
        return words
    except Exception as e:
        print(e)
        return jsonify({"result": "Model Failed"})
Example #5
def computeSentiment(tweets, sentiments):

    tweet_scores = []
    term_sentiments = {}  # term -> known or inferred sentiment score

    for tweet in tweets:
        tweet_score = 0  # For every tweet set score as 0
        tweet_words = tweet.split()  # Tokenize every tweet
        tweet_words = preprocess.clean_stopwords(tweet_words)  # Remove all stopwords from the list of words

        for word in tweet_words:  # For every word in tweet
            word = word.lower()  # Convert it to lower case
            word = preprocess.clean_data(word)  # Preprocess data

            if word in sentiments:  # If the word is present in the sentiment file
                word_score = sentiments[word]  # Set the word score to its sentiment score
                tweet_score += word_score  # Add the score to the corresponding tweet score

            else:
                word_score = 0
                tweet_score += word_score

            # Add the term and its sentiment to the dictionary
            if word not in term_sentiments:
                term_sentiments[word] = word_score

        # Add the tweet and its score to tweet_scores
        tweet_scores.append([tweet, tweet_score])

    # Print tweet_scores
    for term in term_sentiments:

        # Now for every term in dictionary of terms check if term is in known sentiments
        if term not in sentiments:

            # Unknown terms have a base score of zero, and we assume they have occurred once
            new_score = 0
            occur = 1

            # Find all tweets that contain new term
            for i in range(0, len(tweet_scores)):
                if term in tweet_scores[i][0]:
                    new_score += tweet_scores[i][1]
                    occur += 1

            # Normalize the new score by the number of occurrences
            new_score /= occur
            term_sentiments[term] = new_score

        print(term + " " + format(term_sentiments[term], '.3f'))
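A tiny worked example of the inference step above (invented inputs; preprocess.clean_stopwords and preprocess.clean_data are assumed to pass these lower-case words through unchanged):

tweets = ["love pycon", "hate mondays"]
sentiments = {"love": 3, "hate": -3}
computeSentiment(tweets, sentiments)
# "love" and "hate" keep their known scores (3.000 and -3.000); "pycon" is unknown,
# appears in one tweet whose score is 3, and is normalized to 3 / (1 + 1) = 1.500,
# while "mondays" likewise ends up at -1.500.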
def predict(input):

    results = []  # collect one result dict per input record
    for text in input:
        text_cleaned = preprocess.clean_data(text['address'])
        print(text_cleaned)
        out = model.predict(text_cleaned)
        print(out)
        # print(type(out))
        # return jsonify({"result":out})
        words = {}
        a1 = []
        a2 = []
        a3 = []
        s = []
        c = []
        ps = []

        for item in out:

            tag = item['tag'].split('-')
            word = item['word']

            if len(tag) == 2:
                if tag[1] == 'A1':
                    # print(word)
                    a1.append(word)
                    # words['A1'] = a1.append(word)
                elif tag[1] == 'A2':
                    a2.append(word)
                    # words['A2'] = a2.append(word)
                elif tag[1] == 'A3':
                    a3.append(word)
                    # words['A3'] = a3.append(word)
                elif tag[1] == 'C':
                    c.append(word)
                    # words['C'] = c.append(word)
                elif tag[1] == 'S':
                    s.append(word)
                    # words['S'] = s.append(word)
                elif tag[1] == 'PC':
                    ps.append(word)
                    # words['PS'] = ps.append(word)

        words['A1'] = " ".join(a1  )  # address1
        words['A2'] = " ".join(a2)
        words['A3'] = " ".join(a3)
        words['C'] = " ".join(c)
        words['S'] = " ".join(s)
        words['PS'] = " ".join(ps)
        print(words)
        results.append(words)

    return results
Example #7
def process_data(training_data_path=TRAINING_DATA_PATH, data_frame=None):
    """Load data training data from the provided path or take the given data frame and add all features"""
    if data_frame is None:
        data_frame = load_training_data(training_data_path)

    load_config()
    data_frame = clean_data(data_frame)
    data_frame = load_features_async(data_frame)
    for text_feature in ["description", "readme"]:
        if text_feature in data_frame.columns:
            data_frame[text_feature].fillna("", inplace=True)
    data_frame.fillna(0, inplace=True)
    return data_frame
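A minimal usage sketch for the function above (hedged: TRAINING_DATA_PATH must point at readable training data, and load_training_data, load_config, clean_data and load_features_async are assumed to be importable from the same module; the in-memory frame below is purely illustrative):

import pandas as pd

# featurize the default training set read from TRAINING_DATA_PATH
features = process_data()

# or featurize an already-loaded frame instead of reading from disk
raw = pd.DataFrame({"description": ["a sample repo"], "readme": [None]})
features_from_df = process_data(data_frame=raw)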
def main():                                                                  # main begins

    tweets = tweet_dict(twitterData)                                         # contains tweets
    sentiment = sentiment_dict(sentimentData)                                # contains dictionary of scores

    for index in range(len(tweets)):                                         # checking every index in the tweets file
        tweet_word = tweets[index]['text'].split()                           # tokenizing every word of the tweet
        tweet_word = preprocess.clean_stopwords(tweet_word)                  # removing stopwords from the list of words
        sent_score = 0                                                       # initially the sentiment score is 0
        for word in tweet_word:                                              # accessing the tweet word by word
            word = word.lower()                                              # converting the word to lower case because all words in the sentiment file are lower case
            word = preprocess.clean_data(word)                               # removing punctuation and URLs from the tweet

            if word != "":                                                   # skip words that were cleaned away entirely
                if word in sentiment:                                        # checking if the word from the tweet is present in the sentiment file
                    sent_score = sent_score + int(sentiment[word])           # calculating the sentiment score of the word

            print("word:", word, "sentiment_score:", int(sent_score))        # printing the result to stdout
    plst = list(params.items())

    return plst
    
def apply_offset(data, bin_offset, sv, scorer=eval_wrapper):
    # data has the format of pred=0, offset_pred=1, labels=2 in the first dim
    data[1, data[0].astype(int)==sv] = data[0, data[0].astype(int)==sv] + bin_offset
    score = scorer(data[1], data[2])
    return score
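To make the layout comment in apply_offset concrete, here is a hedged sketch of building such an array and calling it (the prediction and label values are invented, and eval_wrapper is assumed to be the scorer defined elsewhere in the original script):

import numpy as np

preds = np.array([1.2, 3.7, 1.9, 7.4])           # row 0: raw predictions
labels = np.array([1, 4, 2, 8])                  # row 2: true labels
data = np.vstack((preds, preds.copy(), labels))  # row 1 starts as a copy of row 0

# shift every prediction whose integer part equals 1 by +0.5, then re-score rows 1 vs 2
score = apply_offset(data, bin_offset=0.5, sv=1)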

# global variables
xgb_num_rounds = 500
num_classes = 8

# preprocess data
M = clean_data()
train, test = M.data_split()
columns_to_drop = M.columns_to_drop

# convert data to xgb data structure
xgtrain = xgb.DMatrix(train.drop(columns_to_drop, axis=1), train['Response'].values)
xgtest = xgb.DMatrix(test.drop(columns_to_drop, axis=1), label=test['Response'].values)    

# get the parameters for xgboost
plst = get_params()

# train model
model = xgb.train(plst, xgtrain, xgb_num_rounds)

# get preds
train_preds = model.predict(xgtrain, ntree_limit=model.best_iteration)
Example #10
    params["eta"] = 0.05
    params["min_child_weight"] = 60
    params["subsample"] = 0.7
    params["colsample_bytree"] = 0.5
    params["silent"] = 1
    params["max_depth"] = 9
    plst = list(params.items())

    return plst


# global variables
xgb_num_rounds = 300

# preprocess data
M = clean_data()
train, test = M.data_split()
columns_to_drop = M.columns_to_drop

# get the parameters for xgboost
plst = get_params()

skf = StratifiedKFold(train['Response'].values, n_folds=3, random_state=1234)
scores = []

for train_index, test_index in skf:
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    xgtrain = xgb.DMatrix(X_train.drop(columns_to_drop, axis=1),
                          X_train['Response'].values)
    xgtest = xgb.DMatrix(X_test.drop(columns_to_drop, axis=1),
                         X_test['Response'].values)
Example #11
if __name__ == "__main__":
    stat = None

    print("reading statistic.json")

    with open("statistic.json", "r") as f:
        stat = json.load(f)

    print("reading data.csv")

    df = pd.read_csv("data.csv")

    df = df[:19]
    df = df.astype(np.float64)

    df = clean_data(df)

    print("testing dataframe.....")

    test_data_na(df)

    drop_1 = []
    drop_2 = []
    drop_3 = []

    cols = []

    # start of the 3-set columns
    # df.columns[17]

    for i, col in enumerate(df.columns[17:]):
import preprocess
import numpy as np
import math
import matplotlib.pyplot as plt

num_iterations = 100
num_features = 100
alpha = 0.01
llambda = 1.5

df, probe_df = preprocess.clean_data()
# train_df, test_df = preprocess.split_train_test(df)

train_dict_user_id_to_index = {int(user_id): index for index, user_id in enumerate(df["user_id"].unique())}
train_dict_index_to_user_id = {index: int(user_id) for index, user_id in enumerate(df["user_id"].unique())}

train_dict_movie_id_to_index = {int(movie_id): index for index, movie_id in enumerate(df["movie_id"].unique())}
train_dict_index_to_movie_id = {index: int(movie_id) for index, movie_id in enumerate(df["movie_id"].unique())}


train_num_movies = len(df["movie_id"].unique())
train_num_users = len(df["user_id"].unique())
print("The number of movies in train data: ", train_num_movies)
print("The number of users in train data: ", train_num_users)

# test_num_movies = len(df["movie_id"].unique())
# test_num_users = len(df["user_id"].unique())
# print("The number of movies in test data: ", test_num_movies)
# print("The number of users in test data: ", test_num_users)

train_data = df.values