Example #1
import pickle

import jsonlines

def main(args):
    # Load the fitted tokenizer/vectorizer (args.tokenizer is an open binary file handle)
    tokenizer = pickle.load(args.tokenizer)
    # Load the trained sentiment model (args.model is an open binary file handle)
    model = pickle.load(args.model)

    with jsonlines.open(args.input) as reader:
        with jsonlines.open(args.output, "w") as writer:
            i = 0
            # Read up to args.limit tweets; jsonlines' reader.read() raises
            # EOFError if the input runs out first
            while i < args.limit:
                tweet = reader.read()
                # Preprocess the tweet text
                text = preprocess_tweet(tweet)
                # Turn it into an input vector
                vector = tokenizer.transform([text])
                vector = vector.toarray()
                # Feed it into the model
                sentiment = model.predict(vector)
                # Register the sentiment
                #tweet["sentiment"] = int(np.round(sentiment[0]))
                tweet["sentiment"] = float(sentiment[0])
                # Register the tweet into the writer
                writer.write(tweet)
                i += 1
                if i % 500000 == 0:
                    print(i)
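For context, a minimal sketch of the CLI wiring this main assumes: pickle.load needs open binary file handles, so argparse.FileType('rb') is one way to supply args.tokenizer and args.model. The flag names and the default limit below are assumptions, not part of the original example.

import argparse

# Hypothetical CLI for main(); flag names and defaults are assumptions
parser = argparse.ArgumentParser(description="Score tweets with a pickled model")
parser.add_argument("--tokenizer", type=argparse.FileType("rb"), required=True,
                    help="pickled vectorizer")
parser.add_argument("--model", type=argparse.FileType("rb"), required=True,
                    help="pickled classifier")
parser.add_argument("--input", required=True, help="input .jsonl of tweets")
parser.add_argument("--output", required=True, help="output .jsonl with sentiment")
parser.add_argument("--limit", type=int, default=1000000,
                    help="maximum number of tweets to score")

if __name__ == "__main__":
    main(parser.parse_args())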
Example #2
import gzip
import json

import pandas as pd

import preprocessing

def process_files(num, files, clust_num):
    rows = []
    # TODO: parallelize inside the function (see the sketch after this example)
    for filename in files:
        listoftweets = []
        with gzip.open(filename, "rt") as f:
            # Skip blank lines so json.loads does not choke on a trailing newline
            for line in f.read().split("\n"):
                if line.strip():
                    listoftweets.append(json.loads(line))
        notext = 0
        for jsonObj in listoftweets:
            #print(list(jsonObj.keys()))
            if "text" in jsonObj.keys():
                thetext = jsonObj[
                    "text"]  #take out text for space later, original text for debugging
                thedate = jsonObj["created_at"]
                preprocessed_text = preprocessing.preprocess_tweet(thetext)
                new_row = {
                    'created_at': thedate,
                    'preprocessed_text': preprocessed_text
                }
                if (preprocessed_text):
                    df = df.append(new_row, ignore_index=True)
            else:
                notext += 1
    # Build the DataFrame once at the end; row-by-row df.append() is quadratic
    # and was removed in pandas 2.0
    df = pd.DataFrame(rows, columns=['created_at', 'preprocessed_text'])
    df.to_csv("../reorganized_data/cluster" + str(clust_num) + "/output" +
              str(num) + ".csv")
    print("Completed task.")
Example #3
import pickle

import jsonlines
import numpy as np

def main(args):
    # Load the fitted tokenizer/vectorizer (args.tokenizer is an open binary file handle)
    tokenizer = pickle.load(args.tokenizer)
    # Load the trained sentiment model (args.model is an open binary file handle)
    model = pickle.load(args.model)
    
    with jsonlines.open(args.input) as reader:
        with jsonlines.open(args.output, "w") as writer:
            for tweet in reader:
                # Preprocess the tweet text
                text = preprocess_tweet(tweet)
                # Turn it into an input vector
                vector = tokenizer.transform([text])
                vector = vector.toarray()
                # Feed it into the model
                sentiment = model.predict(vector) 
                # Register the sentiment
                tweet["sentiment"] = int(np.argmax(sentiment[0]))
                # Register the tweet into the writer
                writer.write(tweet)
Example #4
import numpy as np

# positive_words, negative_words, intensity_words, negation_words,
# multi_exclamation_regex, and NUM_LINGUISTIC_FEATURES are module-level
# lexicons/constants defined elsewhere in this example's source file
def get_linguistic_features(sentence):
    '''
    Features:
    1: count of positive
    2: count of negative
    3: count of intensity words: very good, so cool
    4: count of elongated words: greeeeat, sooo good
    5: has a question mark
    6: has exclamation mark
    7: has exclamation repeated more than once!!!
    8: contains hashtag
    
    If you're interested in the usefulness of each of these,
    the following are the coefficients for a logistic regression
    model for the negative, neutral, and positive classes respectively
    [[ 0.35346987  4.92481448  1.80200111  2.88740596  1.34519168 -0.06794115
       1.20213894 -4.38018904]
     [-2.13637061 -3.40506267 -2.38370639 -1.82043459  0.25270369 -2.41349426
      -3.46977753  0.79135405]
     [ 2.92176862 -1.57157761  1.50862341  1.11264888 -2.41054806  3.26814909
       1.841855    3.13502001]]
    
    Plenty of interesting insights can be read off these, e.g.
    note how large the hashtag coefficient is and how strongly it
    leans toward classifying the tweet as positive.
    '''
    features = [0.0 for _ in range(NUM_LINGUISTIC_FEATURES)]
    
    preprocessed = preprocess_tweet(sentence)
    
    positives_count = 0
    negatives_count = 0
    intensity_count = 0
    elongated_count = 0
    for i, w in enumerate(preprocessed):
        # first make sure there's no negation before the word
        # if there's one, then we'll flip the sentiment of the word
        prev_is_negation = False
        if i > 0 and preprocessed[i-1] in negation_words:
            prev_is_negation = True
        
        if w in positive_words:
            if prev_is_negation:
                negatives_count += 1
            else:
                positives_count += 1
        elif w in negative_words:
            if prev_is_negation:
                positives_count += 1
            else:
                negatives_count += 1
        
        if w in intensity_words:
            intensity_count += 1
        
        if is_elongated(w):
            elongated_count += 1
    
    features[0] = positives_count
    features[1] = negatives_count
    features[2] = intensity_count
    features[3] = elongated_count
    
    if '?' in sentence: features[4] = 1
    if '!' in sentence:
        features[5] = 1
        if multi_exclamation_regex.search(sentence):
            features[6] = 1
    if '|||HASHTAG|||' in preprocessed:
        features[7] = 1
    
    return np.asarray(features, dtype=np.float32)
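A quick usage sketch (illustrative only: the example tweet is made up, and the actual counts depend on the lexicons loaded in the module):

feats = get_linguistic_features("This is sooo greeeat!!! #awesome")
# One float32 value per feature listed in the docstring:
# [positives, negatives, intensity, elongated, '?', '!', repeated '!', hashtag]
print(feats.shape)  # (8,)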
Example #5
language_translator_2 = LanguageTranslatorV3(
    version='2018-05-01',
    authenticator=authenticator_2
)

language_translator.set_service_url('https://api.eu-gb.language-translator.watson.cloud.ibm.com/instances/6604c96d-44f1-4854-babb-582d45f73101')
language_translator_2.set_service_url('https://api.eu-gb.language-translator.watson.cloud.ibm.com/instances/9ca1afb5-ffcf-45a8-a0b7-e02551a45cfa')


tweets = pd.read_csv('../data/tweets.csv')

data = []
col_names = ['id', 'text_en']
for index, tweet in tweets.iterrows():
  # Resume from where it broke last time if translation failed mid-run
  # (replace 0 with the last index that was printed)
  if index > 0:
    pre = preprocess_tweet(tweet.text)
    translation = ''
    # Switch translator every tweet
    if (index % 2) == 0:
      tl1 = language_translator.translate(
        text=pre,
        source='nl', target='en').get_result()
      translation = tl1['translations'][0]['translation']
    else:
      tl2 = language_translator_2.translate(
        text=pre,
        source='nl', target='en').get_result()
      translation = tl2['translations'][0]['translation']
    
    print(index)
    # Append translation to file
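    # Assumed continuation (the example is cut off here): collect the row so
    # everything can be written out once after the loop
    data.append([tweet['id'], translation])

# Assumed continuation: write the collected translations out; the file name is
# a guess, chosen to match what Example #6 reads back in
pd.DataFrame(data, columns=col_names).to_csv('../data/tweets_en.csv', index=False)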
Example #6
        raise Exception("No data directory was found at %s or %s"%(original, parent))
    else:
        os.chdir(os.getcwd()+'/data')

else:
    os.chdir(os.getcwd()+'/data')

import numpy as np

tweets_per_day = defaultdict(list)

tweets_en = pd.read_csv('tweets_en.csv')
tweet_sentiments = []
sentiment_analyser = SentimentAnalyser()

for index, tweet in tweets_en.iterrows():
  #process row
  tweets_per_day[tweet['date']].append(preprocess_tweet(tweet['text_en']))
  sentiment = sentiment_analyser.get_sentiment(tweet['text_en'])
  tweet_sentiments.append([tweet['id'],tweet['date'],tweet['text_en'],sentiment])

sent_df = pd.DataFrame(data=tweet_sentiments, columns=['id', 'date', 'text_en', 'sentiment'])
sent_df.to_csv('tweets_sentiment')
sentiments_per_day = {key:0 for key in tweets_per_day.keys()}
bad_tweets = []
for key in tweets_per_day.keys():
    tweets_of_day = tweets_per_day[key]
    sentiments = []
    for tweet in tweets_of_day:
        try:
            sentiments.append(sentiment_analyser.get_sentiment(tweet))
        except Exception:
            # Assumed completion: collect tweets whose sentiment analysis failed,
            # matching the bad_tweets list initialized above
            bad_tweets.append(tweet)
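    # Assumed continuation (the example is cut off here): average the day's
    # scores into the sentiments_per_day dict initialized above
    sentiments_per_day[key] = np.mean(sentiments) if sentiments else 0.0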