        #remove links starting with "http"
        tweet = re.sub(r'((http)([^\s]*)(\s|$))|((http)([^\s]*)$)', "", tweet)
        #remove links with no http (probably unnecessary)
        tweet = re.sub(r'(\s([^\s]*)\.([^\s]*)\/([^\s]*)\s)|(^([^\s]*)\.([^\s]*)\/([^\s]*)(\s|$))|(\s([^\s]*)\.([^\s]*)\/([^\s]*)$)', " ", tweet)
        #remove mentions
        tweet = re.sub(r'(\s(@)([^\s]*)\s)|((^@)([^\s]*)(\s|$))|(@([^\s]*)$)', " ", tweet)
        #hashtags are removed by countvectorizer

        filteredTweets.append(tweet)

    # bail out if no tweets survived the filtering
    if len(filteredTweets) == 0:
        print("Not enough tweets for prediction.")
        continue

    #now we can process the tweets using embeddings.transformTextForTesting
    try:
        tweetEmbeddings = embeddings.transformTextForTesting(wordDictionary, tweet_threshold, filteredTweets, "conc")
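        # one embedding vector is produced per usable tweet; the mean below
        # collapses them into a single profile vector before prediction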
        print("Embeddings computed.")
    except:
        #most tweets are ignored for brevity/no embedding correspondence
        print("Not enough tweets for prediction.")
        continue

    scores = {}
    #load the saved ML models
    for trait in ["O","C","E","A","N"]:
        model = joblib.load("Models/SVM_fasttext_conc_"+trait+".pkl")
        mean = np.mean(tweetEmbeddings, axis = 0)
        score = model.predict([mean])
        scores[trait] = float(str(score[0])[0:5])
        print("\tScore for",trait,"is:",str(score[0])[0:5])
Example #2
def calc_tweet_personality(sessionID, screen_name, profile_img):

    # load embedding dataset
    curr_path = os.path.dirname(os.path.abspath(__file__))

    dataset_path = curr_path + "/fastText/wiki-news-300d-1M.vec"
    wordDictionary = dsu.parseFastText(dataset_path)
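    # wordDictionary is assumed to map each token to its 300-dimensional
    # fastText vector, e.g. wordDictionary["hello"] -> array of shape (300,)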

    # load predictive models
    models = {}
    for trait in ["O", "C", "E", "A", "N"]:
        models[trait] = joblib.load(curr_path + "/models/model_" + trait +
                                    ".pkl")

    # read tweets
    awsPath = os.path.join(sessionID, screen_name)
    sessionDir = os.environ['SESSIONDIR']
    localPath = os.path.join(sessionDir + '/collection', sessionID)
    if not os.path.exists(localPath):
        try:
            os.makedirs(localPath)
        except:
            pass

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except:
        raise ValueError('Cannot find the timeline in the remote storage!')

    # process the tweets
    tweet_file_path = os.path.join(localPath, screen_name + '_tweets.txt')
    filteredTweets = []
    word_count = 0
    for tweet in open(tweet_file_path, "r", encoding="utf-8"):
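        # skip retweets and blank lines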
        if re.match(r'^(RT)', tweet) or tweet == '\n' \
                or tweet == '' or tweet == ' ':
            continue

        #remove links starting with "http"
        tweet = re.sub(r'((http)([^\s]*)(\s|$))|((http)([^\s]*)$)', "", tweet)
        #remove links with no http (probably unnecessary)
        tweet = re.sub(
            r'(\s([^\s]*)\.([^\s]*)\/([^\s]*)\s)|(^([^\s]*)\.([^\s]*)\/([^\s]*)(\s|$))|(\s([^\s]*)\.([^\s]*)\/([^\s]*)$)',
            " ", tweet)
        #remove mentions
        tweet = re.sub(r'(\s(@)([^\s]*)\s)|((^@)([^\s]*)(\s|$))|(@([^\s]*)$)',
                       " ", tweet)
        #hashtags are removed by countvectorizer
        filteredTweets.append(tweet)

        word_count += len(tweet.split())

    # bail out if no tweets survived the filtering
    if len(filteredTweets) == 0:
        raise ValueError("Not enough tweets for prediction.")

    #now we can process the tweets using embeddings.transformTextForTesting
    try:
        tweetEmbeddings = embeddings.transformTextForTesting(
            wordDictionary, 3, filteredTweets, "conc")
    except:
        # without embeddings there is nothing for the models to score
        raise ValueError("Not enough tweets for prediction.")

    # predict using saved models
    # range is 0 ~ 5
    scores = {}
    for trait in ["O", "C", "E", "A", "N"]:
        model = models[trait]
        preds = model.predict(tweetEmbeddings)
        scores[trait] = float(str(np.mean(np.array(preds)))[0:5])

    jung = ""
    if scores["E"] > 3:
        jung = "E"
    else:
        jung = "I"
    if scores["O"] > 3:
        jung = jung + "N"
    else:
        jung = jung + "S"
    if scores["A"] > 3:
        jung = jung + "F"
    else:
        jung = jung + "T"
    if scores["C"] > 3:
        jung = jung + "J"
    else:
        jung = jung + "P"

    scores["jung"] = jung

    # assemble the output
    result = {}
    result['screen_name'] = screen_name
    result['profile_img'] = profile_img
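    # trait scores (0-5 range) are rescaled below to 0-1 "percentile" values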
    result['personality'] = {
        "word_count": word_count,
        "processed_language": "en",
        'personality': [{
            'name': 'Openness',
            'percentile': scores['O'] / 5
        }, {
            'name': 'Conscientiousness',
            'percentile': scores['C'] / 5
        }, {
            'name': 'Extraversion',
            'percentile': scores['E'] / 5
        }, {
            'name': 'Agreeableness',
            'percentile': scores['A'] / 5
        }, {
            'name': 'Emotional range',
            'percentile': scores['N'] / 5
        }]
    }

    # save to json and upload to s3 bucket
    with open(os.path.join(localPath, screen_name + '_twitPersonality.json'),
              'w') as outfile:
        json.dump(result, outfile)
    s3.upload(localPath, awsPath, screen_name + '_twitPersonality.json')

    # delete localPath files
    try:
        os.remove(os.path.join(localPath, screen_name + '_tweets.txt'))
        os.remove(
            os.path.join(localPath, screen_name + '_twitPersonality.json'))
    except:
        # already deleted!
        pass

    print(s3.generate_downloads(awsPath,
                                screen_name + '_twitPersonality.json'))

    return result
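
# A minimal usage sketch (hypothetical values; assumes the SESSIONDIR environment
# variable is set and that <screen_name>_tweets.txt was previously uploaded to S3):
#
#   os.environ["SESSIONDIR"] = "/tmp/sessions"
#   result = calc_tweet_personality("session-123", "some_user",
#                                   "https://example.com/avatar.jpg")
#   print(result["personality"]["personality"])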