forked from webfp/tor-browser-crawler
main.py
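"""Crawl a list of URLs over Tor Browser in multiple batches.

Command-line entry point of the crawler: parses the arguments, reads the URL
list, picks the torrc matching the chosen experiment type and runs the crawl
with the configured number of batches and instances per page.
"""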
import argparse
import traceback
from logging import INFO, DEBUG
from os import stat
from os.path import isfile
from sys import maxsize, argv
import common as cm
import utils as ut
from log import wl_log
from tbcrawler.crawler import Crawler


def main():
# Parse arguments
args = parse_arguments()
# Read URLs
url_list = read_list_urls(args)
# Get torrc matching experiment type
torrc_dict = cm.TORRC_BY_TYPE[args.experiment]
# Instantiate crawler
    crawler = Crawler(url_list, torrc_dict,
                      output=args.output,
                      experiment=args.experiment,
                      xvfb=args.xvfb,
                      capture_screen=args.capture_screen)
# Run the crawl
try:
        crawler.crawl(args.batches, args.instances,
                      start_line=args.start - 1)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
    except Exception:
        wl_log.error("Exception: \n%s" % traceback.format_exc())
    finally:
        crawler.stop_crawl()


def read_list_urls(args):
    """Return the list of URLs read from the file given with --url-list."""
    assert isfile(args.url_list)  # check that the URL list file exists
    assert stat(args.url_list).st_size > 0  # check that the file is not empty
    url_list = []
    try:
        with open(args.url_list) as f:
            file_contents = f.read()
            url_list = file_contents.splitlines()
            url_list = url_list[args.start - 1:args.stop]
    except Exception as e:
        ut.die("ERROR: while parsing URL list: {} \n{}".format(e, traceback.format_exc()))
    return url_list
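# Example: with --start 3 and --stop 5, the slice in read_list_urls keeps
# lines 3, 4 and 5 of the URL file, i.e. --start and --stop are treated as
# 1-based, inclusive line numbers.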


def parse_arguments():
    # Parse arguments
    parser = argparse.ArgumentParser(description='Crawl a list of URLs in multiple batches.')
    # List of urls to be crawled
    parser.add_argument('-u', '--url-list', required=True,
                        help='Path to the file that contains the list of URLs to crawl.')
parser.add_argument('-o', '--output',
help='Directory to dump the results (default=./results).',
default=cm.RESULTS_DIR)
parser.add_argument('-b', '--tbb-path',
help="Path to the Tor Browser Bundle directory.",
default=cm.TBB_PATH)
parser.add_argument("-e", "--experiment", choices=cm.EXP_TYPES,
help="Specifies the crawling methodology.",
default=cm.EXP_TYPE_WANG_AND_GOLDBERG)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Increase output verbosity.',
                        default=False)
# For understanding batch and instance parameters please refer
# to Wang and Goldberg's WPES'13 paper, Section 4.1.4
parser.add_argument('--batches', type=int,
help='Number of batches in the crawl (default: %s)' % cm.NUM_BATCHES,
default=cm.NUM_BATCHES)
parser.add_argument('--instances', type=int,
help='Number of instances to crawl for each web page (default: %s)' % cm.NUM_INSTANCES,
default=cm.NUM_INSTANCES)
# Crawler features
parser.add_argument('-x', '--xvfb', action='store_true',
help='Use XVFB (for headless testing)',
default=False)
parser.add_argument('-c', '--capture-screen', action='store_true',
help='Capture page screenshots',
default=False)
# Limit crawl
    parser.add_argument('--start', type=int,
                        help='Start crawling URLs from this line number (default: 1).',
                        default=1)
    parser.add_argument('--stop', type=int,
                        help='Stop crawling URLs after this line number (default: EOF).',
                        default=maxsize)
# Parse arguments
args = parser.parse_args()
# Set verbose level
wl_log.setLevel(DEBUG if args.verbose else INFO)
del args.verbose
wl_log.debug("Command line parameters: %s" % argv)
return args


if __name__ == '__main__':
    main()
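
# Example invocation (illustrative placeholder values; the valid choices for
# -e/--experiment are defined by cm.EXP_TYPES):
#
#   python main.py -u /path/to/url_list.txt -o ./results \
#       --batches 5 --instances 4 -x -c -v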