forked from robopt/Capstone
/
crawler.py
164 lines (129 loc) · 4.34 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Crawler
# Class / Template for all crawlers to inherit
# Edward Mead
from crawlerconfig import CrawlerConfig
from result import Result
import socket
import jsonpickle
class Crawler:
    """Base class / template that all crawlers inherit from.

    An instance keeps the crawl configuration handed to the constructor and a
    ``Result`` created from that configuration.  Most methods are thin
    accessors that forward to one of those two objects.  Concrete crawlers
    are expected to override ``do_Crawl``.
    """

    # Human-readable crawler name; this is what ``__str__`` returns.
    name = "Default Crawler"

    def __init__(self, config):
        """Store ``config`` and create the ``Result`` built from it."""
        self.config = config
        self.result = Result(config)

    def __str__(self):
        """Return the crawler's ``name``."""
        return self.name

    def do_Crawl(self, config=None):
        """Initiate a crawl; the base class only signals "not implemented".

        Args:
            config: Crawl config to use for this crawl.  When ``None``, the
                config supplied at construction time is used instead.

        Raises:
            NotImplementedError: Always.  This class must be inherited and
                the method overridden.
        """
        if config is None:
            config = self.config
        message = ("Not implemented exception. Do not use this class "
                   "directly. It should be inherited.")
        # The message is still printed (as before) and is now also carried
        # by a real exception instead of a bare ``raise`` that had nothing
        # to re-raise.
        print(message)
        raise NotImplementedError(message)

    # ----- Crawl configuration accessors -----

    def get_crawl_config(self):
        """Return the crawl config object."""
        return self.config

    def get_crawl_location(self):
        """Return the config's ``location``."""
        return self.config.location

    def get_crawl_protocol(self):
        """Return the config's ``protocol``."""
        return self.config.protocol

    def get_crawl_speed(self):
        """Return the config's ``speed``."""
        return self.config.speed

    def get_crawl_maxDepth(self):
        """Return the config's ``maxDepth``."""
        return self.config.maxDepth

    def get_crawl_name(self):
        """Return the config's ``name``."""
        return self.config.name

    def get_crawl_depth(self):
        """Return the config's ``depth``."""
        return self.config.depth

    def inc_crawl_depth(self):
        """Increase the config's ``depth`` by one."""
        self.config.depth += 1

    def set_crawl_speed(self, speed):
        """Set the config's ``speed``."""
        self.config.speed = speed

    def set_crawl_maxDepth(self, maxDepth):
        """Set the config's ``maxDepth``."""
        self.config.maxDepth = maxDepth

    def set_crawl_name(self, name):
        """Set the config's ``name``."""
        self.config.name = name

    def set_crawl_depth(self, depth):
        """Set the config's ``depth``."""
        self.config.depth = depth

    def get_crawl_options(self):
        """Return the config's ``options`` mapping."""
        return self.config.options

    def add_crawl_option(self, option, value):
        """Add or overwrite one crawl option.

        An option is a key/value pair such as ``<"Option", True/False>``.
        """
        self.config.options[option] = value

    # ----- Result accessors (all forward to ``self.result``) -----

    def get_result(self):
        """Return the result object."""
        return self.result

    def get_result_dataHash(self):
        """Return the current hash of the result data."""
        return self.result.get_dataHash()

    def get_result_source(self):
        """Return the result's source."""
        return self.result.get_source()

    def set_result_source(self, source):
        """Set the result's source."""
        self.result.set_source(source)

    def get_result_data(self):
        """Return the result's data."""
        return self.result.get_data()

    def set_result_data(self, data):
        """Replace the result's data."""
        self.result.set_data(data)

    def add_result_data(self, data):
        """Add ``data`` to the result's data."""
        self.result.add_data(data)

    def get_result_timeStart(self):
        """Return the result's start time."""
        return self.result.get_timeStart()

    def set_result_timeStart(self, timeStart):
        """Set the result's start time."""
        self.result.set_timeStart(timeStart)

    def get_result_timeEnd(self):
        """Return the result's end time."""
        return self.result.get_timeEnd()

    def set_result_timeEnd(self, timeEnd):
        """Set the result's end time."""
        self.result.set_timeEnd(timeEnd)

    def get_result_crawlerConfig(self):
        """Return the crawler configuration stored on the result."""
        return self.result.get_crawlerConfig()

    def set_result_crawlerConfig(self, crawlerConfig):
        """Set the crawler configuration stored on the result."""
        self.result.set_crawlerConfig(crawlerConfig)

    def get_result_referrer(self):
        """Return the result's referrer."""
        return self.result.get_referrer()

    def set_result_referrer(self, referrer):
        """Set the result's referrer."""
        self.result.set_referrer(referrer)

    def send_result(self, result, hostname="parser", port=443):
        """Serialize ``result`` and send it to the parser over TCP.

        The result is encoded with ``jsonpickle`` and written, UTF-8 encoded,
        to a fresh TCP connection.  Failures are reported with ``print`` and
        are not raised to the caller.

        Args:
            result: Object to serialize and send.
            hostname: Destination host; defaults to ``"parser"``.
            port: Destination TCP port; defaults to ``443``.

        Returns:
            None.
        """
        # @todo test when there is a destination to send data to
        # @todo later goal implement ssl?
        # Encode before opening the socket so an encode failure cannot leave
        # an unused socket behind.
        try:
            ser = jsonpickle.encode(result)
        except Exception:
            print("Encode failed..." + str(result))
            return

        conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            conn.connect((hostname, port))
            conn.sendall(ser.encode('utf-8'))
            print("Sent all data.")
        except Exception as e:
            print("Error sending data/connecting. Error: " + str(e))
        finally:
            # Always release the socket, whether or not the send succeeded.
            conn.close()