# Imports assumed by the examples below (the source listing omits them).
# `pr` (text preprocessing), VideoData, imgprocess, NaiveBayesClassifier,
# Build_Model_DNN_Text and the vocabulary/model globals are repo-local
# and not reproduced here.
import json
import multiprocessing as mp
import os
import urllib.request
from decimal import Decimal
from random import randint
from urllib.parse import parse_qs, urlparse

import joblib
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from flask import jsonify, request
from imblearn.over_sampling import SMOTE
from keras.utils import np_utils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn_pandas import DataFrameMapper


def main():
    print("Number of processors: ", mp.cpu_count())
    pool = mp.Pool(mp.cpu_count())  # for parallel processing
    df = pd.DataFrame(
        columns=['comment', 'otherMetadata', 'likeDislikeRatio', 'sentiment'])

    i = 0
    for x in range(751):
        with open("../output/" + str(x) + ".json", encoding='utf-8') as f:
            data = json.load(f)
            # global commentsText
            commentsText = ""
            # tags = data[0]["tags"]
            sentiment = data[0]['sentiment']
            if 'tags' in data[0].keys():
                tags = str(' '.join(data[0]["tags"]))
                # print("tags" + tags)
                otherMetaData = pr.process(data[0]["title"] + " " +
                                           data[0]["description"] + " " + tags)
            else:
                otherMetaData = pr.process(data[0]["title"] + " " +
                                           data[0]["description"])

            likes = int(data[0]["likeCount"])
            dislikes = int(data[0]["dislikeCount"])
            likeDislikeRatio = str(likes / max(dislikes, 1))  # guard: dislikeCount can be 0
            results = pool.map(preprocess, data[0]["comments"])
            for result in results:
                if result is not None:
                    commentsText += result[0]
            df.loc[i] = [commentsText] + [otherMetaData] + [likeDislikeRatio
                                                            ] + [sentiment]
            # print(str(i)+" : "+df['posToNegCommentRatio'].loc[i])

        # print(df['otherMetadata'].iloc[0])
        print(i)
        i += 1

    df['sentiment_one_hot'] = df['sentiment'].apply(lambda x: 0
                                                    if x == 'N' else 1)

    df['data'] = df['comment'] + ' ' + df['otherMetadata']

    traindf, testdf = train_test_split(df, test_size=0.2)

    x_train, x_test, y_train, y_test = train_test_split(
        df['data'], df['sentiment_one_hot'], test_size=0.2)

    NBModel = NaiveBayesClassifier()
    NBModel.train(x_train, y_train, alpha=1)

    print(y_test)
    # hateVideoComments = df.loc[18]['comment']

    # print(hateVideoComments)
    levelOfHate = NBModel.getHateLevel(x_test)
    print(levelOfHate)
    pool.close()  # release worker processes
def main():
    print("Number of processors: ", mp.cpu_count())
    pool = mp.Pool(mp.cpu_count())  # for parallel processing
    df = pd.DataFrame(
        columns=['comment', 'otherMetadata', 'likeDislikeRatio', 'sentiment'])

    i = 0
    for x in range(751):
        with open("../output/" + str(x) + ".json", encoding='utf-8') as f:
            data = json.load(f)
            # global commentsText
            commentsText = ""
            # tags = data[0]["tags"]
            sentiment = data[0]['sentiment']
            if 'tags' in data[0].keys():
                tags = str(' '.join(data[0]["tags"]))
                # print("tags" + tags)
                otherMetaData = pr.process(data[0]["title"] + " " +
                                           data[0]["description"] + " " + tags)
            else:
                otherMetaData = pr.process(data[0]["title"] + " " +
                                           data[0]["description"])

            likes = int(data[0]["likeCount"])
            dislikes = int(data[0]["dislikeCount"])
            likeDislikeRatio = str(likes / max(dislikes, 1))  # guard: dislikeCount can be 0
            results = pool.map(preprocess, data[0]["comments"])
            for result in results:
                if result is not None:
                    commentsText += result[0]
            df.loc[i] = [commentsText] + [otherMetaData] + [likeDislikeRatio
                                                            ] + [sentiment]
            # print(str(i)+" : "+df['posToNegCommentRatio'].loc[i])

        # print(df['otherMetadata'].iloc[0])
        print(i)
        i += 1

    df['sentiment_one_hot'] = df['sentiment'].apply(lambda x: 0
                                                    if x == 'N' else 1)

    df['data'] = df['comment'] + ' ' + df['otherMetadata']

    traindf, testdf = train_test_split(df, test_size=0.2)

    countVec_comment = CountVectorizer()

    countVec_comment.fit(df['data'])

    negative_score_comment = train(traindf, countVec_comment)

    prediction = predict(testdf, negative_score_comment)

    print(classification_report(testdf['sentiment_one_hot'], prediction))

    pool.close()
# Example #3
def preprocess(item):
    comment = str(item["comment"])
    processed_comment = pr.process(comment)
    if (processed_comment != "None") and (processed_comment is not None) and (
            processed_comment != ""):
        transformed = comment_feature_mapper.transform([processed_comment])
        sentiment = int(comment_sentiModel.predict(transformed))

        PCount = 0
        RECount = 0
        SGCount = 0
        wordCount = 0

        for w in processed_comment.split():
            wordCount = wordCount + 1
            word = w
            if "/" in word:
                word = w[:w.index('/')]
            if word in political_vocabulary:
                PCount = PCount + 1
            elif word in relious_ethnic_vocabulary:
                RECount = RECount + 1
            elif word in sex_gender_vocabulary:
                SGCount = SGCount + 1

        # positive_count = positive_count / word_count
        # negative_count = negative_count / word_count

        # print(PCount, RECount, SGCount, wordCount)
        return processed_comment, sentiment, PCount, RECount, SGCount, wordCount
        # return processed_comment, sentiment
    else:
        return None, None, None, None, None, None
def preprocess(item):
    comment = str(item["comment"])
    processed_comment = pr.process(comment)
    # print(processed_comment)
    if (processed_comment != "None") and (processed_comment is not None) and (processed_comment != ""):
        return processed_comment
    else:
        return None
def preprocess(item):
    comment = str(item)
    processed_comment = pr.process(comment)
    # print(processed_comment)
    if (processed_comment != "None") and (processed_comment is not None):
        # commentsText += processed_comment
        return processed_comment
    else:
        return None
# Example #6
def preprocess(item):
    """Returns how many numbers lie within `maximum` and `minimum` in a given `row`"""
    comment = str(item)
    processed_comment = pr.process(comment)
    # print(processed_comment)
    if (processed_comment != "None") and (processed_comment is not None):
        # commentsText += processed_comment
        return processed_comment
    else:
        return None
def preprocess(item):
    comment = str(item["comment"])
    processed_comment = pr.process(comment)
    # print(processed_comment)
    if (processed_comment != "None") and (processed_comment is not None):
        transformed = comment_feature_mapper.transform([processed_comment])
        sentiment = int(comment_sentiModel.predict(transformed))
        return processed_comment, sentiment
    else:
        return None, None
# Example #8
def preprocess(item):
    """Returns how many numbers lie within `maximum` and `minimum` in a given `row`"""
    comment = str(item["comment"])
    processed_comment = pr.process(comment)
    # print(processed_comment)
    if (processed_comment != "None") and (processed_comment is not None) and (processed_comment.strip() != ''):

        positive_count = 0
        negative_count = 0
        word_count = 0
        comment_neg = processed_comment
        for i, w in enumerate(processed_comment.split()):
            word_count = word_count + 1
            word = w
            if "/" in word:
                word = w[:w.index('/')]
            if word in positive_vocabulary:
                positive_count = positive_count + 1
            elif word in negative_vocabulary:
                negative_count = negative_count + 1
            elif word in negation_words:
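                # prefix the word preceding the negation with "not_" so the
                # sentiment features can distinguish "good" from "not good"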
                # print("comment :" + comment)
                word = processed_comment.split()[i]
                # print("Negation word :" + word)
                previous_word = processed_comment.split()[i - 1]
                new_previous_word = "not_" + previous_word
                # print("Previous Word : " + previous_word)
                before_previous_word = processed_comment[:processed_comment.find(previous_word)]
                # print("Before Previous Word: " + before_previous_word)
                after_previous_word = processed_comment[processed_comment.find(previous_word) + len(previous_word):]
                # print("After previous Word:" + after_previous_word)
                comment_neg = before_previous_word + ' ' + new_previous_word + ' ' + after_previous_word

        positive_count = positive_count / word_count
        negative_count = negative_count / word_count

        # data = processed_comment, [positive_count], [negative_count]

        data = {'preprocessed_text': [comment_neg], 'positive_count': [positive_count],
                'Negative_count': [negative_count]}

        comment_df = pd.DataFrame(data)
        # print(comment_df)
        # comment_df = pd.DataFrame(data, columns=['preprocessed_text', 'positive_count', 'Negative_count'])

        transformed = comment_feature_mapper.transform(comment_df)
        sentiment = int(comment_sentiModel.predict(transformed))
        # print('sentiment: ')
        # print(sentiment)
        # commentsText += processed_comment
        return processed_comment, sentiment
    else:
        return None, None
def image_process():
    #req_data = request.get_json()
    #print(req_data)
    # video_url = req_data['url']
    vid = request.args.get("url")
    # vid= req_data['url']
    print(vid)
    # seed(42)

    keys = pd.read_csv(os.getcwd() + "\\keys.csv", encoding='utf-8')
    # url = video_url
    noOfKeys = keys.shape[0]
    print(noOfKeys)
    # print(noOfKeys)

    # video_id = urlparse(url)
    # q = parse_qs(video_id.query)
    # vid = q["v"][0]
    key = keys.iloc[randint(0, noOfKeys - 1), 0]
    print("_______________________________________________")
    print(key)
    # try:
    vc = VideoData(vid, key)
    vc.get_video_comments()
    # except Exception:
    # print("Missing Meta Data")
    # return 0
    urllib.request.urlretrieve(
        "https://img.youtube.com/vi/" + vid + "/hqdefault.jpg", "temp/img.jpg")
    thumbnail_text = imgprocess()
    # thumbnail_text = "Something"
    print("Number of processors: ", mp.cpu_count())
    pool = mp.Pool(6)  # for parallel processing
    df = pd.DataFrame(columns=[
        'comment', 'otherMetadata', 'likeDislikeRatio', 'posToNegCommentRatio',
        'pcount', 'recount', 'sgcount'
    ])

    with open("temp/data.json", encoding='utf-8') as f:
        data = json.load(f)
        pcount = 0
        recount = 0
        sgcount = 0
        wordcount = 0
        # global commentsText
        commentsText = ""
        # tags = data[0]["tags"]
        if 'tags' in data[0].keys():
            tags = str(' '.join(data[0]["tags"]))
            # print("tags" + tags)
            otherMetaData = pr.process(data[0]["title"] + " " +
                                       data[0]["description"] + " " + tags +
                                       " " + thumbnail_text)
        else:
            otherMetaData = pr.process(data[0]["title"] + " " +
                                       data[0]["description"] + " " +
                                       thumbnail_text)

        likes = int(data[0]["likeCount"])
        dislikes = int(data[0]["dislikeCount"])
        likeDislikeRatio = str(likes / max(dislikes, 1))  # guard: dislikeCount can be 0
        results = pool.map(preprocess, data[0]["comments"])
        pool.close()
        positiveCount = 1  # start at 1 so the ratio below is always defined
        negativeCount = 1
        for result in results:
            if result[0] is not None:
                commentsText = commentsText + " " + result[0]
                if result[1] == 1:
                    positiveCount = positiveCount + 1
                else:
                    negativeCount = negativeCount + 1

            if result[2] is not None:
                pcount = pcount + int(result[2])

            if result[3] is not None:
                recount = recount + int(result[3])

            if result[4] is not None:
                sgcount = sgcount + int(result[4])

            if result[5] is not None:
                wordcount = wordcount + int(result[5])

        wordcount = max(wordcount, 1)  # avoid division by zero on empty comment sets
        p_count = str(pcount * 100 / wordcount)
        re_count = str(recount * 100 / wordcount)
        sg_count = str(sgcount * 100 / wordcount)

        posToNegCommentRatio = str(positiveCount / negativeCount)
        df.loc[0] = [commentsText] + [otherMetaData] + [likeDislikeRatio] + [
            posToNegCommentRatio
        ] + [p_count] + [re_count] + [sg_count]

        # print(df['otherMetadata'].iloc[0])

        # Classify
        # Classification_Model = joblib.load("classification_Lr.pkl")
        # Classification_vectorizer  = joblib.load("vectorizer_mapper.pkl")
        #
        # tranformed = Classification_vectorizer.transform(df)
        # print(tranformed[0][2])
        # prediction = Classification_Model.predict(tranformed)
        # print(prediction)

        keras.backend.clear_session()

        HateDetection_feature_mapper = joblib.load(
            "models/HD_featureMapper_LR.pkl")
        HateDetection_Model = joblib.load("models/HateDetection_LR.pkl")

        transformed = HateDetection_feature_mapper.transform(df)
        hate = HateDetection_Model.predict(transformed)
        print("Hate Detection :" + str(hate))
        clf = HateDetection_Model
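        # recompute P(hate) by hand: sigmoid(w.x + b) = P(class 1, "not hate"),
        # so 1 - sigmoid is the hate level; equivalent to
        # 1 - clf.predict_proba(transformed)[:, 1]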
        z = np.dot(clf.coef_, transformed.T) + clf.intercept_
        hypo = 1 / (1 + np.exp(-z))
        hate_level = 1 - float(hypo)
        print("Level of hate =" + str(hate_level))
        print(hate_level)

        df['data'] = df['comment'] + " " + df['otherMetadata']

        keras.backend.clear_session()

        Domain_feature_mapper = joblib.load("models/Domain_feature_mapper.pkl")

        Domain_Classification_Model = tf.keras.models.load_model(
            'models/Domain_ann.pkl')

        Domaintransformed = Domain_feature_mapper.transform(df)
        Domain = Domain_Classification_Model.predict_classes(Domaintransformed)
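        # predict_classes() exists only in older Keras/TF; it was removed in
        # TF 2.6 in favour of np.argmax(model.predict(x), axis=-1)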

        Domain_label_encoder = joblib.load("models/Domain_label_encoder.pkl")

        domain_label = Domain_label_encoder.inverse_transform(Domain)

        print("Domain :" + str(domain_label))

        print(domain_label)
        print(hate_level)

        if domain_label[0] == 'P':
            domain = 'Political'
        elif domain_label[0] == 'SG':
            domain = 'Sex & Gender'
        elif domain_label[0] == 'RE':
            domain = 'Religious & Ethnic'
        else:
            domain = 'Other'

        if hate[0] == 1:
            sentiment = 'Not Hate'
            # sentiment = 'Low Hate'
        else:
            sentiment = 'Hate'
            # sentiment = 'High Hate'

        x = Decimal(hate_level)
        level_of_hate = float(round(x, 2))
        print(level_of_hate)

        return jsonify(sentiment=sentiment,
                       hateLevel=level_of_hate,
                       category=domain,
                       thumbnail_text=thumbnail_text)
def preprocess(item):
    """Returns how many numbers lie within `maximum` and `minimum` in a given `row`"""
    comment = str(item["comment"])
    processed_comment = pr.process(comment)
    print(processed_comment)
    if (processed_comment != "None") and (processed_comment is not None) and (
            processed_comment != ""):
        # transformed = comment_feature_mapper.transform([processed_comment])
        # sentiment = int(comment_sentiModel.predict(transformed))

        PCount = 0
        RECount = 0
        SGCount = 0
        wordCount = 0
        positive_count = 0
        negative_count = 0
        comment_neg = processed_comment
        for i, w in enumerate(processed_comment.split()):
            wordCount = wordCount + 1
            word = w
            if "/" in word:
                word = w[:w.index('/')]
            if word in positive_vocabulary:
                positive_count = positive_count + 1
            elif word in negative_vocabulary:
                negative_count = negative_count + 1
            elif word in political_vocabulary:
                PCount = PCount + 1
            elif word in relious_ethnic_vocabulary:
                RECount = RECount + 1
            elif word in sex_gender_vocabulary:
                SGCount = SGCount + 1
            elif word in negation_words:
                print("comment :" + comment)
                word = processed_comment.split()[i]
                print("Negation word :" + word)
                previous_word = processed_comment.split()[i - 1]
                new_previous_word = "not_" + previous_word
                print("Previous Word : " + previous_word)
                before_previous_word = processed_comment[:processed_comment.
                                                         find(previous_word)]
                # print("Before Previous Word: " + before_previous_word)
                after_previous_word = processed_comment[processed_comment.
                                                        find(previous_word) +
                                                        len(previous_word):]
                # print("After previous Word:" + after_previous_word)
                comment_neg = before_previous_word + ' ' + new_previous_word + ' ' + after_previous_word
        positive_count = positive_count / wordCount
        negative_count = negative_count / wordCount

        data = {
            'preprocessed_text': [comment_neg],
            'positive_count': [positive_count],
            'Negative_count': [negative_count]
        }

        comment_df = pd.DataFrame(data)

        transformed = comment_feature_mapper.transform(comment_df)
        sentiment = int(comment_sentiModel.predict(transformed))

        print(PCount, RECount, SGCount, wordCount)
        return processed_comment, sentiment, PCount, RECount, SGCount, wordCount
        # return processed_comment, sentiment
    else:
        return None, None, None, None, None, None
# Example #11
def main():
    print("Number of processors: ", mp.cpu_count())
    pool = mp.Pool(mp.cpu_count())  # for parallel processing
    df = pd.DataFrame(columns=['comment', 'otherMetadata', 'likeDislikeRatio', 'posToNegCommentRatio', 'sentiment'])

    i = 0
    for x in range(1000):
        with open("../output/" + str(x) + ".json", encoding='utf-8') as f:
            data = json.load(f)
            # global commentsText
            commentsText = ""
            # tags = data[0]["tags"]
            sentiment = data[0]['sentiment']
            thumbnail_text = df_thumbnail['Thumbnail'].iloc[i]
            if 'tags' in data[0].keys():
                tags = str(' '.join(data[0]["tags"]))
                # print("tags" + tags)
                otherMetaData = pr.process(data[0]["title"] + " " + data[0]["description"] + " " + tags + ' ' + thumbnail_text)
            else:
                otherMetaData = pr.process(data[0]["title"] + " " + data[0]["description"] + ' ' + thumbnail_text)

            likes = int(data[0]["likeCount"])
            dislikes = int(data[0]["dislikeCount"])
            likeDislikeRatio = str(likes / max(dislikes, 1))  # guard: dislikeCount can be 0
            results = pool.map(preprocess, data[0]["comments"])
            positiveCount = 1
            negativeCount = 1
            for result in results:
                if result[0] is not None:
                    commentsText += result[0]
                    if result[1] == 1:
                        positiveCount = positiveCount + 1
                    else:
                        negativeCount = negativeCount + 1
            posToNegCommentRatio = str(positiveCount / negativeCount)
            df.loc[i] = [commentsText] + [otherMetaData] + [likeDislikeRatio] + [posToNegCommentRatio] + [sentiment]
            print(str(i) + " : " + df['posToNegCommentRatio'].loc[i])

        # print(df['otherMetadata'].iloc[0])

        i += 1

        # Classify
        # Classification_Model = joblib.load("classification_Lr.pkl")
        # Classification_vectorizer  = joblib.load("vectorizer_mapper.pkl")
        #
        # tranformed = Classification_vectorizer.transform(df)
        # print(tranformed[0][2])
        # prediction = Classification_Model.predict(tranformed)
        # print(prediction)
    # print(df)
    pool.close()
    df['sentiment_one_hot'] = df['sentiment'].apply(lambda x: 0 if x == 'N' else 1)
    df['sentiment_one_level_hate'] = df['sentiment'].apply(lambda x: 1 if x == 'N' else 0)


    mapper = DataFrameMapper([
        (['posToNegCommentRatio'], StandardScaler()),
        ('otherMetadata', TfidfVectorizer(ngram_range=(1, 3), max_features=5000)),
        (['likeDislikeRatio'], StandardScaler()),
    ])


    mapper.fit(df)
    label = df['sentiment_one_hot']

    features = mapper.transform(df)

    # x, x_test, y, y_test = train_test_split(features, label, test_size=0.2, train_size=0.8, random_state=0)
    print("logistic regression")
    clf = LogisticRegression()
    clf.fit(features, label)

    # predicted = clf.predict(x_test)

    # print(classification_report(y_test, predicted))


    # x1, x_test1, y1, y_test1 = train_test_split(features, df['sentiment_one_level_hate'], test_size=0.2, train_size=0.8, random_state=0)
    print("logistic regression")
    clf_level = LogisticRegression()
    clf_level.fit(features, df['sentiment_one_level_hate'])

    # predicted1 = clf_level.predict(x_test1)

    # print(classification_report(y_test1, predicted1))


    # print("SVM")
    #
    # SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    # SVM.fit(x_test, y)
    # # predict the labels on validation dataset
    # predictions_SVM = SVM.predict(x_test)
    # print(classification_report(y_test, predictions_SVM))
    # print("Random Forest")
    # clf = RandomForestClassifier(n_estimators = 300, criterion = "entropy", random_state = 0)
    # clf.fit(x, y)
    # predicted = clf.predict(x_test)
    # print(classification_report(y_test, predicted))

    joblib.dump(clf, "HateDetection_LR.pkl")
    joblib.dump(clf_level, "Level_of Hate_LR.pkl")
    joblib.dump(mapper, "HD_featureMapper_LR.pkl")
# Example #12
def main1():
    # seed(42)

    keys = pd.read_csv(os.getcwd() + "\\keys.csv", encoding='utf-8')
    url = "https://www.youtube.com/watch?v=kzNC5163qHk"  # "https://www.youtube.com/watch?v=feY49cKUlB0"
    noOfKeys = keys.shape[0]

    # print(noOfKeys)

    video_id = urlparse(url)
    q = parse_qs(video_id.query)
    vid = q["v"][0]
    key = keys.iloc[randint(0, noOfKeys - 1), 0]  # randint is inclusive at both ends
    print("_______________________")
    print(key)

    vc = VideoData(vid, key)
    vc.get_video_comments()
    urllib.request.urlretrieve(
        "https://img.youtube.com/vi/" + vid + "/hqdefault.jpg", "temp/img.jpg")
    imgprocess()

    print("Number of processors: ", mp.cpu_count())
    pool = mp.Pool(1)  # for parallel processing
    df = pd.DataFrame(columns=[
        'comment', 'otherMetadata', 'likeDislikeRatio', 'posToNegCommentRatio',
        'pcount', 'recount', 'sgcount'
    ])

    with open("temp/data.json", encoding='utf-8') as f:
        data = json.load(f)
        pcount = 0
        recount = 0
        sgcount = 0
        wordcount = 0
        # global commentsText
        commentsText = ""
        # tags = data[0]["tags"]
        if 'tags' in data[0].keys():
            tags = str(' '.join(data[0]["tags"]))
            # print("tags" + tags)
            otherMetaData = pr.process(data[0]["title"] + " " +
                                       data[0]["description"] + " " + tags)
        else:
            otherMetaData = pr.process(data[0]["title"] + " " +
                                       data[0]["description"])

        likes = int(data[0]["likeCount"])
        dislikes = int(data[0]["dislikeCount"])
        likeDislikeRatio = str(likes / max(dislikes, 1))  # guard: dislikeCount can be 0
        results = pool.map(preprocess, data[0]["comments"])
        pool.close()

        # results.remove(None)
        # results = filter(None, results)
        # for item in data[0]["comments"]:
        #    result =[pool.map(preprocess(item))]
        # results = [pool.apply(howmany_within_range, args=(row, 4, 8)) for row in data]
        # comment = str(item["comment"])
        # processed_comment = pr.process(comment)
        # print(processed_comment)
        # if(processed_comment != "None") and (processed_comment is not None):
        #     commentsText += processed_comment
        # # commentSentimentModel(comment)

        # print(results)
        positiveCount = 1
        negativeCount = 1
        for result in results:
            if result[0] is not None:
                commentsText = commentsText + " " + result[0]
                if result[1] == 1:
                    positiveCount = positiveCount + 1
                else:
                    negativeCount = negativeCount + 1

            if result[2] is not None:
                pcount = pcount + int(result[2])

            if result[3] is not None:
                recount = recount + int(result[3])

            if result[4] is not None:
                sgcount = sgcount + int(result[4])

            if result[5] is not None:
                wordcount = wordcount + int(result[5])

        wordcount = max(wordcount, 1)  # guard against empty results
        p_count = str(pcount * 100 / wordcount)
        re_count = str(recount * 100 / wordcount)
        sg_count = str(sgcount * 100 / wordcount)

        posToNegCommentRatio = str(positiveCount / negativeCount)
        df.loc[0] = [commentsText] + [otherMetaData] + [likeDislikeRatio] + [
            posToNegCommentRatio
        ] + [p_count] + [re_count] + [sg_count]

        # print(df['otherMetadata'].iloc[0])

        # Classify
        # Classification_Model = joblib.load("classification_Lr.pkl")
        # Classification_vectorizer  = joblib.load("vectorizer_mapper.pkl")
        #
        # tranformed = Classification_vectorizer.transform(df)
        # print(tranformed[0][2])
        # prediction = Classification_Model.predict(tranformed)
        # print(prediction)
        print(df.loc[0])
        HateDetection_feature_mapper = joblib.load(
            "models/HD_featureMapper_LR.pkl")
        HateDetection_Model = joblib.load("models/HateDetection_LR.pkl")

        transformed = HateDetection_feature_mapper.transform(df)
        hate = HateDetection_Model.predict(transformed)
        print("Hate Detection :" + str(hate))
        clf = HateDetection_Model
        z = np.dot(clf.coef_, transformed.T) + clf.intercept_
        hypo = 1 / (1 + np.exp(-z))
        hate_level = hypo
        print("Level of hate =" + str(hypo))

        df['data'] = df['comment'] + " " + df['otherMetadata']

        keras.backend.clear_session()

        Domain_feature_mapper = joblib.load("models/Domain_feature_mapper.pkl")

        Domain_Classification_Model = tf.keras.models.load_model(
            'models/Domain_ann.pkl')

        Domaintransformed = Domain_feature_mapper.transform(df)
        Domain = Domain_Classification_Model.predict_classes(Domaintransformed)
        print(Domain_Classification_Model.predict(Domaintransformed))

        Domain_label_encoder = joblib.load("models/Domain_label_encoder.pkl")

        domain_label = Domain_label_encoder.inverse_transform(Domain)

        print("Domain :" + str(domain_label))

    print("Done")
def main():
    print("Number of processors: ", mp.cpu_count())
    pool = mp.Pool(mp.cpu_count())  # for parallel processing
    df = pd.DataFrame(columns=[
        'comment', 'otherMetadata', 'likeDislikeRatio', 'posToNegCommentRatio',
        'sentiment'
    ])

    i = 0
    for x in range(751):
        with open("../../output/" + str(x) + ".json", encoding='utf-8') as f:
            data = json.load(f)
            # global commentsText
            commentsText = ""
            # tags = data[0]["tags"]
            sentiment = data[0]['sentiment']
            if 'tags' in data[0].keys():
                tags = str(' '.join(data[0]["tags"]))
                # print("tags" + tags)
                otherMetaData = pr.process(data[0]["title"] + " " +
                                           data[0]["description"] + " " + tags)
            else:
                otherMetaData = pr.process(data[0]["title"] + " " +
                                           data[0]["description"])

            likes = int(data[0]["likeCount"])
            dislikes = int(data[0]["dislikeCount"])
            likeDislikeRatio = str(likes / max(dislikes, 1))  # guard: dislikeCount can be 0
            results = pool.map(preprocess, data[0]["comments"])
            positiveCount = 1
            negativeCount = 1
            for result in results:
                if result[0] is not None:
                    commentsText += result[0]
                    if result[1] == 1:
                        positiveCount = positiveCount + 1
                    else:
                        negativeCount = negativeCount + 1
            posToNegCommentRatio = str(positiveCount / negativeCount)
            df.loc[i] = [commentsText] + [otherMetaData] + [
                likeDislikeRatio
            ] + [posToNegCommentRatio] + [sentiment]
            print(str(i) + " : " + df['posToNegCommentRatio'].loc[i])

        # print(df['otherMetadata'].iloc[0])

        i += 1

        # Classify
        # Classification_Model = joblib.load("classification_Lr.pkl")
        # Classification_vectorizer  = joblib.load("vectorizer_mapper.pkl")
        #
        # tranformed = Classification_vectorizer.transform(df)
        # print(tranformed[0][2])
        # prediction = Classification_Model.predict(tranformed)
        # print(prediction)
    # print(df)

    df['sentiment_one_hot'] = df['sentiment'].apply(lambda x: 0
                                                    if x == 'N' else 1)

    mapper = DataFrameMapper([
        (['posToNegCommentRatio'], StandardScaler()),
        ('otherMetadata', TfidfVectorizer(ngram_range=(1, 3),
                                          max_features=5000)),
        (['likeDislikeRatio'], StandardScaler()),
    ])
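    # the mapper standardizes the numeric ratio columns and TF-IDF-encodes the
    # metadata text into a single feature matrix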

    mapper.fit(df)

    features = mapper.transform(df)

    label = df['sentiment_one_hot']

    x, x_test, y, y_test = train_test_split(features,
                                            label,
                                            test_size=0.2,
                                            train_size=0.8,
                                            random_state=0)

    clf = LogisticRegression()
    clf.fit(x, y)

    predicted = clf.predict(x_test)

    print(classification_report(y_test, predicted))

    pool.close()
# Example #14
def main():
    print("Number of processors: ", mp.cpu_count())
    pool = mp.Pool(mp.cpu_count())  # for parallel processing
    df = pd.DataFrame(columns=[
        'comment', 'otherMetadata', 'likeDislikeRatio', 'posToNegCommentRatio',
        'sentiment', 'category', 'pcount', 'recount', 'sgcount'
    ])
    # df = pd.DataFrame(
    #     columns=['comment', 'otherMetadata', 'likeDislikeRatio', 'posToNegCommentRatio', 'sentiment', 'category'])

    i = 0
    for x in range(1000):
        with open("../../output/" + str(x) + ".json", encoding='utf-8') as f:
            pcount = 0
            recount = 0
            sgcount = 0
            wordcount = 0
            data = json.load(f)
            # global commentsText
            commentsText = ""
            # tags = data[0]["tags"]
            sentiment = data[0]['sentiment']
            category = data[0]['category']
            print(category)
            thumbnail_text = df_thumbnail['Thumbnail'].iloc[i]
            if 'tags' in data[0].keys():
                tags = str(' '.join(data[0]["tags"]))
                # print("tags" + tags)
                otherMetaData = pr.process(data[0]["title"] + " " +
                                           data[0]["description"] + " " +
                                           tags + ' ' + thumbnail_text)
            else:
                otherMetaData = pr.process(data[0]["title"] + " " +
                                           data[0]["description"] + ' ' +
                                           thumbnail_text)

            likes = int(data[0]["likeCount"])
            dislikes = int(data[0]["dislikeCount"])
            likeDislikeRatio = str(likes / max(dislikes, 1))  # guard: dislikeCount can be 0
            results = pool.map(preprocess, data[0]["comments"])
            positiveCount = 1
            negativeCount = 1
            for result in results:
                if result[0] is not None:
                    commentsText = commentsText + " " + result[0]
                    if result[1] == 1:
                        positiveCount = positiveCount + 1
                    else:
                        negativeCount = negativeCount + 1

                if result[2] is not None:
                    pcount = pcount + int(result[2])

                if result[3] is not None:
                    recount = recount + int(result[3])

                if result[4] is not None:
                    sgcount = sgcount + int(result[4])

                if result[5] is not None:
                    wordcount = wordcount + int(result[5])

            wordcount = max(wordcount, 1)  # guard against empty results
            p_count = str(pcount * 100 / wordcount)
            re_count = str(recount * 100 / wordcount)
            sg_count = str(sgcount * 100 / wordcount)

            posToNegCommentRatio = str(positiveCount / negativeCount)
            df.loc[i] = [commentsText] + [
                otherMetaData
            ] + [likeDislikeRatio] + [posToNegCommentRatio] + [sentiment] + [
                category
            ] + [p_count] + [re_count] + [sg_count]
            print(str(i) + " : " + df['posToNegCommentRatio'].loc[i])

        i += 1

    df['sentiment_one_hot'] = df['sentiment'].apply(lambda x: 0
                                                    if x == 'N' else 1)

    df['data'] = df['comment'] + " " + df['otherMetadata']

    mapper = DataFrameMapper([
        ('data', TfidfVectorizer(ngram_range=(1, 3), max_features=5000)),
        (['pcount'], StandardScaler()),
        (['recount'], StandardScaler()),
        (['sgcount'], StandardScaler()),
    ])

    mapper.fit(df)
    data = mapper.transform(df)
    # joblib.dump(mapper, "Domain_feature_mapper.pkl")

    # Synthetic Minority Over-sampling Technique
    smote = SMOTE('minority')
    X_sm, Y_sm = smote.fit_sample(data, df['category'])
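    # note: newer imbalanced-learn releases spell this
    # SMOTE(sampling_strategy='minority') and rename fit_sample to fit_resample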

    encoder = LabelEncoder()
    encoder.fit(Y_sm)
    encoded_Y = encoder.transform(Y_sm)

    joblib.dump(encoder, "Domain_label_encoder.pkl")
    print(encoded_Y)

    dummy_y = np_utils.to_categorical(encoded_Y)
    print(dummy_y)
    X_train, X_test, y_train, y_test = train_test_split(X_sm,
                                                        dummy_y,
                                                        test_size=0.2)

    # X_train, X_test, y_train, y_test = train_test_split(data, df['category'], test_size=0.2)

    pool.close()

    model_DNN = Build_Model_DNN_Text(X_train.shape[1], 4)  # repo-local builder: (input_dim, n_classes)
    history_dropout = model_DNN.fit(X_train,
                                    y_train,
                                    validation_data=(X_test, y_test),
                                    epochs=20,
                                    batch_size=32)

    predicted = model_DNN.predict(X_test)
    y_pred_vector = np.argmax(predicted, axis=1)
    y_test_vector = np.argmax(y_test, axis=1)
    print(classification_report(y_test_vector, y_pred_vector))

    loss = history_dropout.history['loss']
    val_loss = history_dropout.history['val_loss']
    epochs = range(1, len(loss) + 1)
    plt.plot(epochs, loss, 'y', label='Training loss')
    plt.plot(epochs, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    acc = history_dropout.history['acc']
    val_acc = history_dropout.history['val_acc']
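    # older Keras logs 'acc'/'val_acc'; newer releases use 'accuracy'/'val_accuracy'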
    plt.plot(epochs, acc, 'y', label='Training acc')
    plt.plot(epochs, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

    model_DNN.save("Domain_ann.pkl")
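

# Entry point (assumed; the listing shows several alternative main() variants).
# The __main__ guard matters here because mp.Pool re-imports the module in
# worker processes on spawn-based platforms such as Windows.
if __name__ == "__main__":
    main()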