This repository has been archived by the owner on Mar 31, 2018. It is now read-only.
forked from kvdg/StockMarketForecast
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Preprocessor.py
151 lines (134 loc) · 4.81 KB
/
Preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
'''
Created on 8 May 2013
@author: Brecht Deconinck
'''
import Methods.IO as IO
from Classes import Tweet
from stemming.porter2 import stem
from datetime import datetime, timedelta
from decimal import *
def main():
    '''Script entry point: classify the DJIA tweets with an offset of 3.'''
    # Alternative run for a single company ticker:
    # classifyTweetsCompany("AAPL")
    classifyTweetsDJIA(_offset=3)
def classifyTweets(tweetFile, history, tag, sSaveFile, offset=3):
    '''Label the tweets carrying ``#tag`` and write them to ``sSaveFile``.

    Every line of ``tweetFile`` is parsed into a ``Tweet``. A tweet is kept
    only when it contains the hashtag ``"#" + tag`` and ``history`` has an
    entry for the tweet's date shifted by ``offset`` days; that entry becomes
    the tweet's label. ``removeStopWords`` is applied to every kept tweet,
    the number of kept tweets is printed, and the kept tweets are written
    through ``IO.writeTweets`` with the fields ``label`` and
    ``trimmedMessage``.

    Parameters:
        tweetFile: open file object holding the scraped tweets. It is always
            closed by this function, even when parsing fails.
        history: mapping from ``datetime.date`` to label, e.g. the result of
            ``priceHistory``.
        tag: hashtag text without the leading ``#``.
        sSaveFile: path handed to ``IO.writeTweets``.
        offset: number of days added to the tweet date before the lookup.
    '''
    tweets = []
    try:
        stopWords = getStopWords()
        hashTag = "#" + tag
        shift = timedelta(days=offset)
        for line in IO.readData_by_line(tweetFile):
            tweet = Tweet.Tweet()
            tweet.setTweet(line)
            if not tweet.containsTag(hashTag):
                continue
            day = (tweet.date + shift).date()
            # NOTE(review): the original indentation was lost; this assumes
            # labelling, stop-word removal and collection all happen only for
            # tweets whose shifted date is present in history.
            if day in history:
                tweet.label = history[day]
                tweet.removeStopWords(stopWords)
                tweets.append(tweet)
    finally:
        # Release the handle on every path, not only after a clean run.
        tweetFile.close()
    print(len(tweets))
    IO.writeTweets(sSaveFile, tweets, ['label', 'trimmedMessage'])
def classifyTweetsDJIA(_offset=3):
    '''Classify the scraped ``#DJIA`` tweets against the DJIA price history.

    Reads price rows from ``data/DJIA.tsv`` (date in column 0 parsed with
    ``"%b %d, %Y"``, value in column 1), turns them into per-day labels with
    ``priceHistory`` and passes them, together with the tweets in
    ``data/scrapeDJIA.txt``, to ``classifyTweets``, which writes
    ``data/ClassifiedDJIA.txt``.

    Parameters:
        _offset: forwarded to ``classifyTweets`` as ``offset``.
    '''
    # The ``with`` block guarantees the tweet file is released even when
    # loading the price data raises before classifyTweets gets to close it.
    # Closing an already closed file is allowed, so the close performed
    # inside classifyTweets does not conflict with this one.
    with open("data/scrapeDJIA.txt") as tweetFile:
        priceData = IO.readData("data/DJIA.tsv")
        priceHist = priceHistory(priceData, "%b %d, %Y", 1)
        classifyTweets(tweetFile, priceHist, "DJIA", "data/ClassifiedDJIA.txt", offset=_offset)
# # TODO: Read last line of classified tweets
# data = IO.readData("data/DJIA.tsv")
# history = priceHistory(data, "%b %d, %Y", 1)
# tweetFile = open("data/scrapeDJIA.txt")
# tweets = []
# for line in IO.readData_by_line(tweetFile):
# tweet = Tweet.Tweet()
# tweet.setTweet(line)
# tweet.label = history[tweet.date.date()]
# tweets.append(tweet)
# IO.writeTweets("data/ClassifiedDJIA.txt", tweets, ['label', 'trimmedMessage'])
def classifyTweetsCompany(tag, _offset=3):
    '''Classify the scraped tweets of one company against its price history.

    Reads price rows from ``data/<tag>.csv`` (``','`` is passed as the second
    argument of ``IO.readData``, presumably the delimiter), skips the first
    row, and builds per-day labels with ``priceHistory`` from the date in
    column 0 (``"%Y-%m-%d"``) and the value in column 2. The labels and the
    tweets in ``data/scrapeCompanies.txt`` are passed to ``classifyTweets``,
    which writes ``data/Classified<tag>.txt``.

    Parameters:
        tag: ticker used both as the hashtag (without ``#``) and in the
            input/output file names.
        _offset: forwarded to ``classifyTweets`` as ``offset``.
    '''
    # The ``with`` block guarantees the tweet file is released even when
    # loading the price data raises before classifyTweets gets to close it.
    # Closing an already closed file is allowed, so the close performed
    # inside classifyTweets does not conflict with this one.
    with open("data/scrapeCompanies.txt") as tweetFile:
        priceData = IO.readData("data/" + tag + ".csv", ',')
        priceIter = iter(priceData)
        # Drop the first row before parsing (presumably the CSV header).
        next(priceIter)
        priceHist = priceHistory(priceIter, "%Y-%m-%d", 2)
        classifyTweets(tweetFile, priceHist, tag, "data/Classified" + tag + ".txt", offset=_offset)
# # TODO: Read last line of classified tweets
# data = IO.readData("data/" + tag + ".csv", ',')
# iterData = iter(data)
# next(iterData)
#
# stopWords = getStopWords()
#
# history = priceHistory(iterData, "%Y-%m-%d", 2)
# tweetFile = open("data/scrapeCompanies.txt")
# tweets = []
# for line in IO.readData_by_line(tweetFile):
# tweet = Tweet.Tweet()
# tweet.setTweet(line)
# if(tweet.containsTag("#" + tag)):
# stamp = tweet.date + timedelta(days=4)
# if stamp.date() in history:
# tweet.label = history[stamp.date()]
# tweet.removeStopWords(stopWords)
# tweets.append(tweet)
#
# tweetFile.close()
# IO.writeTweets("data/Classified" + tag + ".txt", tweets, ['label', 'trimmedMessage'])
def priceHistory(data, sDateFormat, indexValue):
    '''Turn dated price rows into a per-day up/down label.

    Walks backwards one calendar day at a time from the newest date in
    ``data`` down to, but not including, the oldest one. A day that has a
    price is labelled ``1`` when its price is greater than or equal to the
    closest earlier price, otherwise ``0``. A day without a price reuses the
    label of the nearest later day that has one. The oldest day has nothing
    to compare against and therefore gets no entry.

    Rows may be supplied in any order; the walk always runs from the newest
    to the oldest parsed date.

    Parameters:
        data: iterable of rows (consumed once); ``row[0]`` is the date string
            and ``row[indexValue]`` a value accepted by ``Decimal``.
        sDateFormat: ``datetime.strptime`` format of ``row[0]``.
        indexValue: index of the price column inside a row.

    Returns:
        dict mapping ``datetime.date`` to ``1`` or ``0``; empty when ``data``
        holds fewer than two distinct dates.
    '''
    prices = {}
    for line in data:
        prices[datetime.strptime(line[0], sDateFormat)] = Decimal(line[indexValue])

    priceChanges = {}
    if not prices:
        return priceChanges

    oneDay = timedelta(days=1)
    date = max(prices)
    lastDate = min(prices)
    priceChange = 0
    while date > lastDate:
        if date in prices:
            # Search backwards for the closest earlier day with a price. This
            # terminates because lastDate has a price and is older than date.
            previous = date - oneDay
            while previous not in prices:
                previous = previous - oneDay
            if prices[date] - prices[previous] >= 0:
                priceChange = 1
            else:
                priceChange = 0
        # NOTE(review): the original indentation was lost; this assumes the
        # label is stored for every day, so days without a price inherit the
        # label of the following priced day.
        priceChanges[date.date()] = priceChange
        date = date - oneDay
    return priceChanges
def setTweetsEmotion():
    '''Print an emotion label for every tweet in ``data/scrapeCompanies.txt``.

    The label is decided by the first entry of ``getEmotions()`` whose word
    (column 0) occurs in the tweet message: ``1`` when column 1 of that entry
    is the string ``"1"``, otherwise ``-1``. A tweet matching no entry gets
    ``0``. Each tweet is printed as its label followed by its message. The
    label is only set on the temporary ``Tweet`` object; nothing is written
    to disk.
    '''
    arrEmo = getEmotions()
    # ``with`` closes the scrape file, which was previously left open.
    with open("data/scrapeCompanies.txt") as data:
        for line in IO.readData_by_line(data):
            tweet = Tweet.Tweet()
            tweet.setTweet(line)
            value = 0
            for emo in arrEmo:
                if emo[0] in tweet.message:
                    # The first matching word decides the label.
                    value = 1 if emo[1] == "1" else -1
                    break
            tweet.label = value
            # Same text as the Python 2 statement
            # ``print tweet.label, " ", tweet.message`` (three spaces between
            # the parts), written so it is valid on Python 2 and 3.
            print("%s   %s" % (tweet.label, tweet.message))
def getEmotions():
    '''Load ``data/Emotions.txt`` and stem the first field of every row.

    Returns the rows obtained from ``IO.readData``, with ``row[0]`` replaced
    in place by ``stem(row[0])``.
    '''
    emotions = IO.readData("data/Emotions.txt")
    for entry in emotions:
        entry[0] = stem(entry[0])
    return emotions
def getStopWords():
    '''Return the content of ``data/StopWords.txt`` as read by ``IO.readData``.'''
    return IO.readData("data/StopWords.txt")
# Run the classification only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()