-
Notifications
You must be signed in to change notification settings - Fork 0
/
generic_crawler.py
82 lines (66 loc) · 2.46 KB
/
generic_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# encoding: utf-8
"""
generic_crawler.py
Created by Brian Eoff on 2011-02-21.
Copyright (c) 2011 __MyCompanyName__. All rights reserved.
"""
import sys
import os
import codecs
import time
import logging
import traceback
import simplejson as json
from datetime import datetime
from basestream import Stream
from ConfigParser import SafeConfigParser
from stream_functions import OutputFileHandler
from stream_functions import on_receive, date_to_fname_string
if __name__ == '__main__':
    """
    Generic streaming crawler.  Reads credentials, the stream URL and an
    optional keyword list from one section of an INI config file, connects
    to the stream, and rotates the output file every ``time_length``
    seconds, forever.  It does not need to interact with Twitter beyond
    being connected to the stream.

    Usage: generic_crawler.py <config_file> <config_section>
    """
    # Fail early with a readable message instead of a bare IndexError.
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: generic_crawler.py <config_file> <config_section>\n')
        sys.exit(1)
    config_file = sys.argv[1]
    crawler_config = sys.argv[2]

    parser = SafeConfigParser()
    parser.read(config_file)

    # Directory where rotated tweet files are written; create it on first run.
    dataDirectory = parser.get(crawler_config, 'directory')
    if not os.path.exists(dataDirectory):
        os.mkdir(dataDirectory)

    time_per_file = parser.getint(crawler_config, 'time_length')  # seconds per output file
    username = parser.get(crawler_config, 'username')
    password = parser.get(crawler_config, 'password')
    crawler_type = parser.get(crawler_config, 'type')
    crawler_id = parser.get(crawler_config, 'id')
    logging.basicConfig(filename=crawler_id + '.log', level=logging.ERROR)
    stream_url = parser.get(crawler_config, 'stream_url')

    # Optional JSON file of the form {"terms": [...]} holding filter keywords.
    trackTerms = []
    if parser.has_option(crawler_config, 'terms_file'):
        terms_file = open(parser.get(crawler_config, 'terms_file'), 'r')
        try:
            trackTerms = json.loads(terms_file.read())['terms']
        finally:
            # Close even if the file contains malformed JSON.
            terms_file.close()

    while True:
        # One loop iteration == one output file covering time_per_file seconds.
        currentDate = datetime.now()
        dateStr = date_to_fname_string(currentDate)
        output = codecs.open(os.path.join(dataDirectory, dateStr + '-Tweets.txt'),
                             encoding='utf-8', mode='w+')
        ofh = OutputFileHandler()
        ofh.set(output)
        stream = None  # guard: Stream() itself may raise before assignment
        try:
            try:
                stream = Stream(stream_url, username, password, on_receive,
                                initial_params=trackTerms, filter_type=crawler_type)
                stream.start()
                time.sleep(time_per_file)
            except Exception as err:
                logging.error('%s:%s', datetime.now(), err)
        finally:
            # Single cleanup path for both success and failure; best-effort,
            # since the stream/file may already be in a broken state.
            try:
                if stream is not None:
                    stream.stop()
                ofh.close()
            except Exception as err:
                logging.error('%s:%s', datetime.now(), err)
                logging.warning('Unable to close stream/file.')