def parse_tweets(tweet):
    # nothing to do for an empty input
    if len(tweet) == 0:
        return
    keep_tweet = tweet
    tweet = tweet[0].split()
    cleaned_tweet = tweet_cleaner.clean_tweet(tweet)
    words = []
    final_words = []
    for word in cleaned_tweet:
        if len(word) > 2:
            checking_words = tweet_cleaner.check_words(word)
            if checking_words != 0:
                words.append(checking_words)
    for word in words:
        # strip each word twice to account for people's stray punctuation
        word = tweet_cleaner.strip_word(word)
        word = tweet_cleaner.strip_word(word)
        if len(word) > 2:
            checking_words = tweet_cleaner.check_words(word)
            if checking_words != 0:
                final_words.append(checking_words)
    words_in_game = []
    if len(final_words) > 0:
        for word in final_words:
            if len(word) > 0:
                # check Wordnik
                results = wordnik_api.check_all(word)
                if results is not None:
                    # if Wordnik never returned a result > 0, put the word into the game
                    words_in_game.append(word)
        if len(words_in_game) > 0:
            model.set_game_words_tweets(words_in_game, keep_tweet)
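
# Possible invocation, for illustration only: the sample tweet text is made
# up, and parse_tweets expects a one-element sequence whose first item is the
# raw tweet text (it indexes tweet[0] above).
parse_tweets(['just coined the word flooglebinder, is that even a thing?'])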

def process_json(screen_name):
    workbook = xlsxwriter.Workbook('%s_tweets.xlsx' % screen_name)
    worksheet = workbook.add_worksheet()
    # Start from the first cell. Rows and columns are zero indexed.
    row = 0
    col = 0
    worksheet.write(row, 0, 'id')
    worksheet.write(row, 1, 'created_at')
    worksheet.write(row, 2, 'full_text')
    worksheet.write(row, 3, 'clean_text')
    row += 1

    with open('{}.json'.format(screen_name), encoding='utf-8') as json_reader:
        lines = json_reader.readlines()
        for line in lines:
            json_tweet = json.loads(line)
            if 'retweeted_status' in json_tweet:
                text = json_tweet['retweeted_status']['full_text']
            else:
                text = json_tweet['full_text']
            clean_text = tweet_cleaner.clean_tweet(text)
            clean_text = tweet_cleaner.normalize_arabic(clean_text)
            clean_text = tweet_cleaner.remove_repeating_char(clean_text)
            clean_text = tweet_cleaner.keep_only_arabic(clean_text.split())
            worksheet.write(row, col, json_tweet['id_str'])
            worksheet.write(row, col + 1, json_tweet['created_at'])
            worksheet.write(row, col + 2, text)
            worksheet.write(row, col + 3, clean_text)
            row += 1
        workbook.close()

def process_json(filters, json_file, xls_file):
    import preprocessor
    workbook = xlsxwriter.Workbook(xls_file)
    worksheet = workbook.add_worksheet()

    # Start from the first cell. Rows and columns are zero indexed.

    row = 0
    col = 0
    worksheet.write(row, 0, 'id')
    worksheet.write(row, 1, 'created_at')
    worksheet.write(row, 2, 'text')
    worksheet.write(row, 3, 'clean_text')
    row += 1

    lines = json_file.readlines()
    for line in lines:
        json_tweet = json.loads(line)
        if 'retweeted_status' in json_tweet:
            text = json_tweet['retweeted_status']['text']
        else:
            text = json_tweet['text']
        clean_text = tweet_cleaner.clean_tweet(text)
        clean_text = tweet_cleaner.normalize_arabic(clean_text)
        clean_text = tweet_cleaner.remove_repeating_char(clean_text)
        clean_text = tweet_cleaner.keep_only_arabic(clean_text.split())
        is_filtered = filter_tweet(filters, clean_text)
        if not is_filtered:  # write the tweet only if it was not filtered out
            worksheet.write(row, col, json_tweet['id_str'])
            worksheet.write(row, col + 1, json_tweet['created_at'])
            worksheet.write(row, col + 2, text)
            worksheet.write(row, col + 3, clean_text)
            row += 1
    workbook.close()
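
# Possible invocation of the variant above, for illustration only: the filter
# terms and file names are made up, and json_file must be an open file object
# because the function calls readlines() on it.
with open('tweets.json', encoding='utf-8') as json_file:
    process_json(filters=['spam'], json_file=json_file,
                 xls_file='tweets_clean.xlsx')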
Example #5
def do_embedding():
    data_pd = pd.read_csv(
        DATA_FILE_ORIGINAL_TWEETS,
        sep=',',
        skipinitialspace=True,
    )

    data_set = pd.DataFrame(data_pd)
    # print(data_set.describe())
    # print(data_pd.__len__())
    # print(data_set.head())

    col = data_pd['tweet_text']

    # prepare an empty list
    tweet_list = list()

    tweets = data_set['tweet_text'].values.tolist()
    for tweet in tweets[0:params.NBR_TWEETS]:
        print(tweet)

        words = tweet_cleaner.clean_tweet(tweet)

        tweet_list.append(words)
    print(len(tweet_list))

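    # Train a Word2Vec model on the cleaned tweets. Note: this call assumes
    # gensim < 4.0; in gensim >= 4.0 the 'size' argument was renamed to
    # 'vector_size'.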
    model = gensim.models.Word2Vec(sentences=tweet_list,
                                   size=params.EMBEDDING_DIM,
                                   workers=4,
                                   min_count=params.EMBEDDING_MIN_SIZE)
    model.save(STATE_FILE_ORIGINAL_TWEETS)
Example #6
def extract_tweets_from_json(json_reader, text_writer):
    json_tweets = json_reader.readlines()
    print('tweets in json file: {} tweets'.format(len(json_tweets)))
    tweets_list = list()
    extracted_tweets_count = 0
    if args.include_id:
        text_writer.write("id\ttweet\n")
    for json_tweet in json_tweets:
        try:
            if json_tweet:
                # load it as Python dict
                tweet = json.loads(json_tweet)
                tid = tweet['id']
                text = tweet['text']
                text = tweet_cleaner.clean_tweet(text)
                if args.normalize:
                    text = tweet_cleaner.normalize_arabic(text)
                if args.remove_repeated_letters:
                    text = tweet_cleaner.remove_repeating_char(text)
                if args.keep_only_arabic:
                    text = tweet_cleaner.keep_only_arabic(text.split())
                if len(text.split()) > 2:
                    if args.exclude_redundant:
                        if text not in tweets_list:
                            tweets_list.append(text)
                            if args.include_id:
                                text_writer.write(
                                    str(tid) + "\t" + text + "\n")
                                # print('id:{}'.format(str(tid)))
                                # print('text:', text)
                                # print('tweet:', tweet['text'])
                                # input("press any key...")
                            else:
                                text_writer.write(text + "\n")
                            extracted_tweets_count += 1
                    else:
                        if args.include_id:
                            text_writer.write(str(tid) + "\t" + text + "\n")
                        else:
                            text_writer.write(text + "\n")
                        extracted_tweets_count += 1
        except (json.decoder.JSONDecodeError, UnicodeDecodeError):
            # skip lines that cannot be decoded
            pass
    print('extracted tweets: {} tweets'.format(extracted_tweets_count))
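
# Possible invocation, for illustration only: the file names are made up, and
# 'args' is expected to come from the script's own argparse setup (not shown).
with open('tweets.json', encoding='utf-8') as json_reader, \
        open('tweets.txt', 'w', encoding='utf-8') as text_writer:
    extract_tweets_from_json(json_reader, text_writer)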
Example #7
def tweets_2_vectors():
    # This is the vector of documents / tweets
    document_vector_list = []

    # load the model back
    model = gensim.models.Word2Vec.load("state_files/w2v.model")

    data_pd = pd.read_csv(
        'data/tweets.csv',
        sep=',',
        skipinitialspace=True,
    )

    data_set = pd.DataFrame(data_pd)
    col = data_pd['tweet_text']

    # prepare an empty list
    tweet_list = list()

    tweets = data_set['tweet_text'].values.tolist()

    for tweet in tweets[0:40000]:
        words = tweet_cleaner.clean_tweet(tweet)

        count = 0
        tweet_vector = []
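        # Sum the word vectors of all in-vocabulary words to build one
        # document vector per tweet. The 'w in model' / 'model[w]' lookups
        # assume gensim < 4.0; newer gensim versions use model.wv instead.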
        for w in words:
            if w in model:
                if count == 0:
                    tweet_vector = model[w]
                else:
                    tweet_vector = tweet_vector + model[w]
                count = count + 1

        if len(tweet_vector) > 0:
            document_vector_list.append(tweet_vector)

    document_vector_array = np.array(document_vector_list)

    pickle.dump(document_vector_array,
                open("state_files/tweets_vector.dat", "wb"))
Example #8
    def tweets_2_vectors(
            self,
            active,
            state_file_tweets_vector='state_files/tweets_vector.dat'):
        self.state_file_tweets_vector = state_file_tweets_vector
        if active:
            # This is the vector of documents / tweets
            tweets_vector_list = []

            # load the model back
            model = gensim.models.Word2Vec.load(self.model_file_w2v)

            data_pd = pd.read_csv(
                self.data_file_tweets,
                sep=',',
                skipinitialspace=True,
            )
            data_set = pd.DataFrame(data_pd)
            tweets = data_set['tweet_text'].values.tolist()

            for tweet in tweets[0:self.nbr_of_tweets]:
                words = tweet_cleaner.clean_tweet(tweet)

                count = 0
                tweet_vector = []
                for w in words:
                    if w in model:
                        if count == 0:
                            tweet_vector = model[w]
                        else:
                            tweet_vector = tweet_vector + model[w]
                        count = count + 1

                if len(tweet_vector) > 0:
                    tweets_vector_list.append(tweet_vector)

            document_vector_array = np.array(tweets_vector_list)

            pickle.dump(document_vector_array,
                        open(self.state_file_tweets_vector, "wb"))
Example #9
    def word_embedding(self,
                       active,
                       embedding_dim=100,
                       embedding_min_size=30,
                       model_file_w2v='state_files/w2v.model'):
        self.embedding_dim = embedding_dim
        self.embedding_min_size = embedding_min_size
        self.model_file_w2v = model_file_w2v

        if active:
            data_pd = pd.read_csv(
                self.data_file_tweets,
                sep=',',
                skipinitialspace=True,
            )

            data_set = pd.DataFrame(data_pd)
            # print(data_set.describe())
            # print(data_pd.__len__())
            # print(data_set.head())

            # prepare an empty list
            tweet_list = list()

            tweets = data_set['tweet_text'].values.tolist()
            for tweet in tweets[0:self.nbr_of_tweets]:
                print(tweet)

                words = tweet_cleaner.clean_tweet(tweet)

                tweet_list.append(words)
            print(len(tweet_list))

            model = gensim.models.Word2Vec(sentences=tweet_list,
                                           size=self.embedding_dim,
                                           workers=4,
                                           min_count=self.embedding_min_size)
            model.save(self.model_file_w2v)
Example #10
def get_all_tweets(screen_name):
    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # authorize twitter, initialize tweepy
    if api_keys.consumer_key == '' or api_keys.consumer_secret == '' \
            or api_keys.access_token == '' or api_keys.access_secret == '':
        print("API key not found. Please check api_keys.py file")
        sys.exit(-1)
    auth = tweepy.OAuthHandler(api_keys.consumer_key, api_keys.consumer_secret)
    auth.set_access_token(api_keys.access_token, api_keys.access_secret)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # stop early if the account has no tweets at all
    if len(alltweets) == 0:
        print('no tweets found for {}'.format(screen_name))
        return

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before {}".format(oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name,
                                       count=200,
                                       max_id=oldest)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

    print("...{} tweets downloaded so far".format((len(alltweets))))
    #print('all tweets\n', alltweets)
    #print('first tweet:', alltweets[0])
    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[
        tweet.id_str,
        tweet.created_at.strftime('%m/%d/%Y'),
        tweet.text.encode("utf-8").decode('utf-8')
    ] for tweet in alltweets]
    outtweetsDict = [{
        'id': tweet.id_str,
        'created_at': tweet.created_at.strftime('%m/%d/%Y'),
        'text': tweet.text.encode("utf-8").decode('utf-8')
    } for tweet in alltweets]
    #print('first outtweets:', outtweets[0])

    # write the csv
    with open('%s_tweets.csv' % screen_name, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["id", "created_at", "text"]
        writer = csv.DictWriter(csvfile,
                                fieldnames=fieldnames,
                                dialect='excel')
        writer.writeheader()
        writer.writerows(outtweetsDict)

    workbook = xlsxwriter.Workbook('%s_tweets.xlsx' % screen_name)
    worksheet = workbook.add_worksheet()
    # Start from the first cell. Rows and columns are zero indexed.
    row = 0
    col = 0
    worksheet.write(row, 0, 'id')
    worksheet.write(row, 1, 'created_at')
    worksheet.write(row, 2, 'original_text')
    worksheet.write(row, 3, 'clean_text')
    row += 1
    for tid, tdate, text in outtweets:
        clean_text = tweet_cleaner.clean_tweet(text)
        clean_text = tweet_cleaner.normalize_arabic(clean_text)
        clean_text = tweet_cleaner.remove_repeating_char(clean_text)
        clean_text = tweet_cleaner.keep_only_arabic(clean_text.split())
        worksheet.write(row, col, tid)
        worksheet.write(row, col + 1, tdate)
        worksheet.write(row, col + 2, text)
        worksheet.write(row, col + 3, clean_text)
        row += 1
    workbook.close()
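
# Possible invocation, for illustration only: the screen name is made up, and
# valid credentials must be set in api_keys.py first.
get_all_tweets('some_screen_name')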
Example #11
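# Note: this last snippet is shown without its enclosing function; it assumes
# that a trained Word2Vec 'model' and an empty 'document_vector_list' already
# exist in the surrounding scope.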
data_pd = pd.read_csv(
    'data/tweets.csv',
    sep=',',
    skipinitialspace=True,
)

data_set = pd.DataFrame(data_pd)
col = data_pd['tweet_text']

# prepare an empty list
tweet_list = list()

tweets = data_set['tweet_text'].values.tolist()

for tweet in tweets[0:40000]:
    words = tweet_cleaner.clean_tweet(tweet)

    count = 0
    tweet_vector = []
    for w in words:
        if w in model:
            if count == 0:
                tweet_vector = model[w]
            else:
                tweet_vector = tweet_vector + model[w]
            count = count + 1

    if len(tweet_vector) > 0:
        document_vector_list.append(tweet_vector)

document_vector_array = np.array(document_vector_list)