#!/usr/bin/env python
from scrapy import log
from scrapy import signals
from subprocess import call
import os
from time import gmtime, strftime
from twisted.internet import reactor, defer
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings
# Spiders
from Bookies.spiders.sport888_spider import sport888Spider
from Bookies.spiders.Apollobet_spider import ApollobetSpider
from Bookies.spiders.Apostasonline import ApostasonlineSpider
from Bookies.spiders.bet188_spider import Bet188Spider
from Bookies.spiders.Bet3000_spider import Bet3000Spider
from Bookies.spiders.Betathome import BetathomeSpider
from Bookies.spiders.Betfred_spider import BetfredSpider
from Bookies.spiders.Betinternet_spider import BetinternetSpider
from Bookies.spiders.Betsafe_spider import BetsafeSpider
from Bookies.spiders.Betsson_spider import BetssonSpider
from Bookies.spiders.Betvictor_spider import BetvictorSpider
from Bookies.spiders.Betway_spider import BetwaySpider
from Bookies.spiders.BGbet_spider import BGbetSpider
from Bookies.spiders.Buzzodds_spider import BuzzoddsSpider
from Bookies.spiders.Bwin_spider import BwinSpider
from Bookies.spiders.Coral_spider import CoralSpider
from Bookies.spiders.Dhoze_spider import DhozeSpider
from Bookies.spiders.Doxxbet_spider import DoxxbetSpider
from Bookies.spiders.Fortunawin_spider import FortunawinSpider
from Bookies.spiders.Gentingbet_spider import GentingbetSpider
from Bookies.spiders.Interwetten_spider import InterwettenSpider
from Bookies.spiders.Ladbrokes_spider import LadbrokesSpider
from Bookies.spiders.Marathonbet_spider import MarathonbetSpider
from Bookies.spiders.Nordicbet_spider import NordicbetSpider
from Bookies.spiders.Oddsring import OddsringSpider
from Bookies.spiders.OneVice_spider import OneviceSpider
from Bookies.spiders.Paddypower_spider import PaddypowerSpider
from Bookies.spiders.Setantabet_spider import SetantabetSpider
from Bookies.spiders.Skybet_spider import SkybetSpider
from Bookies.spiders.Sportingbet_spider import SportingbetSpider
from Bookies.spiders.Sportium_spider import SportiumSpider
from Bookies.spiders.Stanjames_spider import StanjamesSpider
from Bookies.spiders.Titanbet_spider import TitanbetSpider
from Bookies.spiders.Tonybet import TonybetSpider
from Bookies.spiders.Totesport_spider import TotesportSpider
from Bookies.spiders.Whitebet import WhitebetSpider
from Bookies.spiders.Williamhill_spider import WilliamhillSpider
# Map spider names to spider classes (instantiated later in processBatch)
spiderDict = {'sport888': sport888Spider,
'Apollobet': ApollobetSpider,
'Apostasonline': ApostasonlineSpider,
'Bet188': Bet188Spider,
'Bet3000': Bet3000Spider,
'Betathome': BetathomeSpider,
'Betfred': BetfredSpider,
'Betinternet': BetinternetSpider,
'Betsafe': BetsafeSpider,
'Betsson': BetssonSpider,
'Betvictor': BetvictorSpider,
'Betway': BetwaySpider,
'BGbet': BGbetSpider,
'Buzzodds': BuzzoddsSpider,
'Bwin': BwinSpider,
'Coral': CoralSpider,
'Dhoze': DhozeSpider,
'Doxxbet': DoxxbetSpider,
'Fortunawin': FortunawinSpider,
'Gentingbet': GentingbetSpider,
'Interwetten': InterwettenSpider,
'Ladbrokes': LadbrokesSpider,
'Marathonbet': MarathonbetSpider,
'Nordicbet': NordicbetSpider,
'Oddsring': OddsringSpider,
'Onevice': OneviceSpider,
'Paddypower': PaddypowerSpider,
'Setantabet': SetantabetSpider,
'Skybet': SkybetSpider,
'Sportingbet': SportingbetSpider,
'Sportium': SportiumSpider,
'Stanjames': StanjamesSpider,
'Titanbet': TitanbetSpider,
'Tonybet': TonybetSpider,
'Totesport': TotesportSpider,
'Whitebet': WhitebetSpider,
'Williamhill': WilliamhillSpider,
}
# ======================LOGGING (python logging not scrapy logging)
import logging
LOG_LEVEL = logging.DEBUG
LOGFORMAT = " %(log_color)s%(levelname)-8s%(reset)s | %(log_color)s%(message)s%(reset)s"
from colorlog import ColoredFormatter
logging.root.setLevel(LOG_LEVEL)
formatter = ColoredFormatter(LOGFORMAT)
stream = logging.StreamHandler()
stream.setLevel(LOG_LEVEL)
stream.setFormatter(formatter)
log2 = logging.getLogger('pythonConfig')
log2.setLevel(LOG_LEVEL)
log2.addHandler(stream)
# ======================MONGO
# Need mongo so we can delete old db for given bookie
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client.oddsbot_scrapy # connect to our db
events = db.events # then events collection
xevents = db.xevents # then xevents collection
# =============POSTGRES (store arbs in postgres)
# Need pg so we can delete old arbs for given bookies
import psycopg2
try:
    conn = psycopg2.connect("dbname=arbs user=oddsbot password='oddsbot' host=localhost")
except psycopg2.OperationalError as e:
    log2.error(e)
    raise SystemExit(1)
# Scrape exchanges
def xrunner():
    '''
    Run the data gathering for exchanges.
    '''
    # Change cwd to the scripts dir
    orig_dir = os.getcwd()
    xpath = os.path.join(orig_dir, 'Exchanges')
    os.chdir(xpath)
    log2.info('Removing previous xevents in db')
    xevents.drop()
    # The exchange scripts take care of clearing their own mongo entries
    # Betfair
    r1 = call(["./BFscrapev3.py", ])
    # Betdaq
    r2 = call(["./DAQxml.py", ])
    # Smarkets
    r3 = call(["./Smarketsxml.py", ])
    # Back to the original dir
    os.chdir(orig_dir)
    return (r1, r2, r3)
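# Note: the chdir dance above could be avoided entirely, since subprocess.call
# accepts a cwd argument, e.g. (sketch of an alternative, not wired in):
#   r1 = call(["./BFscrapev3.py"], cwd=xpath)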
# Logging is a real annoyance here: although the crawler arg (crawler=crawler)
# can bring back the `log_count` stats for a given crawler, we can only start
# the log once without problems.
# Is there an `exception_count` key instead that could be used with the stats
# mailer? Or another way to get something like this functionality back?
from scrapy.conf import settings
log.start(logfile=settings['LOG_FILE'])
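# One possible answer to the question above (an untested sketch, not wired in
# anywhere): count exceptions directly via the spider_error signal, which
# fires whenever a spider callback raises, instead of relying on log_count.
errorCounts = {}

def countError(failure, response, spider):
    # Handler signature per the scrapy docs: failure, response, spider
    errorCounts[spider.name] = errorCounts.get(spider.name, 0) + 1
# To use it, connect it inside setup_crawler:
#   crawler.signals.connect(countError, signal=signals.spider_error)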
def setup_crawler(spider, stop=False):
    '''
    Takes a spider instance; returns a deferred that fires when it closes.
    '''
    # A deferred means other functions can wait on this finishing:
    # it fires when the callback is triggered by spider close.
    # See the twisted docs.
    d = defer.Deferred()

    def foo(*a, **kw):
        # The result passed to any callbacks on the deferred
        # (we don't use it, so True could just as well be False, None, w/e)
        d.callback(True)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Keep a ref to foo, otherwise it gets GC'd (garbage collected)
    crawler._tempref = foo
    # foo is the handler for the closed signal from this spider.
    # N.B. dispatch passes the spider and a reason (e.g. 'finished') to foo.
    crawler.signals.connect(foo, signal=signals.spider_closed)
    crawler.crawl(spider)
    # N.B. log is the scrapy log; log2 is the python colour logger.
    # The crawler arg is necessary for the log_count/{ERROR, DEBUG, INFO, ...}
    # stats, which you will want for the stats mailer extension, but starting
    # the log each time causes a big torrent of ESMTP errors:
    # log.start(crawler=crawler)
    crawler.start()
    return d
def processBatch(spiderNames):
    dlist = []
    # Set up the spiders for this batch
    for spiderName in spiderNames:
        # log2.info('Setting up crawler for bookie %s' % spiderName)
        d = setup_crawler(spiderDict[spiderName]())
        dlist.append(d)
    # A DeferredList waits until every element has finished before continuing.
    return defer.DeferredList(dlist)
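# Note: by default a DeferredList waits for all results even if some spiders
# fail. If one crashed spider should abort the whole batch instead, twisted
# supports that too (sketch): defer.DeferredList(dlist, fireOnOneErrback=True)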
# 'Betathome'
# 'Betvictor'
spiderNames = [# ['sport888', 'Apollobet', 'Apostasonline', 'Betinternet',
# 'Betsafe', 'Bwin', 'Dhoze', ],
# ['BGbet', 'Buzzodds', 'Coral', 'Doxxbet', 'Interwetten',
# 'Marathonbet', 'Titanbet', 'Whitebet', ],
# ['Bet3000', 'Betsson', 'Fortunawin', 'Gentingbet', 'Nordicbet', 'Oddsring',
# 'Paddypower', 'Skybet', 'Sportingbet', 'Sportium', 'Tonybet', 'Totesport', ],
['Bet188', 'Betfred', 'Betway', 'Ladbrokes', 'Onevice', 'Setantabet',
'Stanjames', 'Williamhill', ],
]
# defer.inlineCallbacks uses deferreds behind the scenes, allowing you to use
# yield syntax to wait on a deferred.
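# Minimal illustration of the pattern (a sketch only, not called anywhere):
# @defer.inlineCallbacks
# def example():
#     value = yield someDeferred      # resumes here when someDeferred fires
#     defer.returnValue(value)        # how an inlineCallbacks func returns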
@defer.inlineCallbacks
def startSpiders():
    for group in spiderNames:
        log2.info('Time is: %s' % strftime("%H:%M:%S", gmtime()))
        log2.info('\n\nStarting group: %s\n\n' % (group, ))
        log2.info('Removing previous events in db for %s' % group)
        # N.B. filter on this group only, not the whole spiderNames list of lists
        events.remove({'bookieName': {'$in': group}})
        yield processBatch(group)
        log2.info('Time is: %s' % strftime("%H:%M:%S", gmtime()))
        log.msg('\n\nEnded group: %s\n\n' % (group, ))
        # Do something else now that the batch is done.
        # ...
        # Scrape exchanges
        results = xrunner()
        log2.info('Exchange scrape exit codes: %s' % (results, ))
        log2.info('Hunt for some arbs...')
        # Finally, search for some juicy arbs.
        # First delete the arbs in the db for these bookies.
        with conn:
            cur = conn.cursor()
            try:
                # Pass the bookie names as a query parameter; psycopg2 adapts
                # a python tuple to a SQL value list, which also handles the
                # single-element case that str(tuple(group)) would break on.
                cur.execute('DELETE FROM arbs_tab WHERE bookie_name IN %s',
                            (tuple(group), ))
            except psycopg2.ProgrammingError:
                # Table doesn't exist yet
                conn.rollback()
        res = call(['./findarbs.py', '--books'] + group)
    reactor.stop()
reactor.callWhenRunning(startSpiders)
reactor.run()