-
Notifications
You must be signed in to change notification settings - Fork 0
/
miner.py
120 lines (95 loc) · 3.73 KB
/
miner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import logging
from requests.exceptions import Timeout
import thread
from datetime import datetime
from threading import Thread
from time import sleep
from random import randint
from helpers.EntryParser import EntryParser
from helpers.SubredditParser import SubredditParser
from helpers.db import setup_db
# Send every record (DEBUG and above) to miner.log, prefixed with a
# 12-hour-clock timestamp.
logging.basicConfig(
    filename='miner.log',
    level=logging.DEBUG,
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
)
# Total number of failed requests (listing fetches and article fetches
# combined) that a single mine() call tolerates before its thread gives up.
_MAX_RETRIES = 8


def _back_off_or_exit(label, mined_from, count, error, retries):
    """Log a failed request and pause, or end the thread if out of retries.

    ``retries`` is the running failure count *including* the current one.
    While it is below ``_MAX_RETRIES`` the failure is logged under ``label``
    and the caller is delayed 10-20 seconds before it retries. Otherwise
    SystemExit is raised, which ends the calling thread (this is what
    ``thread.exit()`` does as well).
    """
    if retries >= _MAX_RETRIES:
        logging.error('Giving up: %s %s %s', mined_from, count, error)
        raise SystemExit
    logging.error('%s: %s %s %s', label, mined_from, count, error)
    sleep(randint(10, 20))


def mine(db, mined_from=None, entry_count=200):
    """Collect entries from one subreddit and store the new ones in ``db``.

    Entries are requested in batches of roughly ``entry_count / 10``; each
    batch continues after the id of the last entry seen. For every entry
    not yet present in ``db['entries']`` the linked article is fetched,
    attached under the ``'article'`` key, and the entry is inserted.

    Mining stops when any of these happens:
      * ``entry_count`` new entries have been stored,
      * a batch comes back empty,
      * a batch repeats an entry id already seen during this call,
      * ``_MAX_RETRIES`` requests have failed in total (SystemExit is
        raised, ending the thread).

    Args:
        db: storage handle; ``db['entries']`` must provide
            ``find_one(reddit_id=...)`` and ``insert(entry)``.
        mined_from: value handed to ``SubredditParser``; also used in log
            messages (presumably the subreddit name - confirm with caller).
        entry_count: number of newly stored entries to aim for.
    """
    subreddit_parser = SubredditParser(mined_from)
    entry_parser = EntryParser()
    ids = set()
    last_id = ""
    # Floor division keeps the batch size an integer on every Python
    # version; never request a batch of zero entries.
    step_size = max(1, entry_count // 10)
    count = 0
    accepted = 0
    retries = 0  # shared by listing and article fetches, never reset

    while accepted < entry_count:
        entries = None
        while entries is None:
            try:
                entries = subreddit_parser.parse_entries(step_size, last_id)
            except Exception as error:
                retries += 1
                _back_off_or_exit('Timeout', mined_from, count, error, retries)

        if not entries:
            # Nothing left to page through; without this guard the outer
            # loop would poll forever and the thread would never finish.
            logging.info('Exhausted: no more entries in %s', mined_from)
            break

        batch_end = count + len(entries) - 1
        unchanged = False
        skipped = False
        for i, entry in enumerate(entries):
            if entry['reddit_id'] in ids:
                # The listing started repeating itself: stop mining.
                unchanged = True
                logging.info('Unchanged: entries %d-%d in %s',
                             count + i, batch_end, mined_from)
                break

            last_id = entry['reddit_id']
            ids.add(last_id)

            if db['entries'].find_one(reddit_id=last_id) is not None:
                # Already stored; abandon the rest of this batch and
                # continue paging after this entry.
                skipped = True
                logging.info('Skipped: %d-%d in %s',
                             count + i, batch_end, mined_from)
                break

            entry['article'] = None
            while True:
                try:
                    entry['article'] = entry_parser.get_content(entry['link'])
                    db['entries'].insert(entry)
                    accepted += 1
                    # Leave after one successful insert even when the
                    # article came back as None, so the same entry is
                    # never inserted over and over.
                    break
                except Exception as error:
                    retries += 1
                    _back_off_or_exit('Error', mined_from, count, error, retries)
                    if entry['article'] is not None:
                        # The article was fetched but storing it failed;
                        # drop this entry and move on to the next one.
                        break
            sleep(randint(1, 3))

        if unchanged:
            break
        if not skipped:
            logging.info('Finished: entries %d-%d in %s',
                         count, batch_end, mined_from)
        count += len(entries)
        sleep(randint(5, 10))
def main():
    """Mine every subreddit listed in ``subreddits.txt``, four at a time.

    The file is read as one subreddit per line; blank lines (including the
    one produced by a trailing newline) are ignored. Subreddits are
    processed in batches: one ``mine`` thread is started per subreddit,
    with a 3-5 second pause after each start, and the whole batch is
    joined before the next batch begins.
    """
    batch_size = 4
    db = setup_db()
    with open('subreddits.txt') as f:
        subreddits = [line.strip() for line in f if line.strip()]

    for start in range(0, len(subreddits), batch_size):
        workers = []
        for name in subreddits[start:start + batch_size]:
            worker = Thread(target=mine, args=(db,),
                            kwargs={"mined_from": name})
            worker.start()
            workers.append((name, worker))
            # Stagger thread start-up instead of launching all at once.
            sleep(randint(3, 5))

        for name, worker in workers:
            worker.join()
            # Log only after join() so the message is true when written.
            logging.info('Finished thread: %s', name)
# Start mining only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()