forked from mgax/hambar109
/
harvest.py
128 lines (103 loc) · 3.45 KB
/
harvest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import functools
import logging
import os
import re
import sys
import tempfile

import flask
import requests
from celery import Celery
from celery.signals import setup_logging
from path import path
# Root URL of the MOF (Monitorul Oficial) mirror this script harvests.
MOF_URL = 'http://kurtyan.org/MOF/'
# Celery app; broker/backend settings come from the project's celeryconfig module.
celery = Celery()
celery.config_from_object('celeryconfig')
@setup_logging.connect
def configure_worker(sender=None, **extra):
    """Celery signal handler: use the project's shared logging setup
    instead of Celery's default worker logging configuration."""
    from utils import set_up_logging
    set_up_logging()
# Module-level logger for harvest progress and error reporting.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
# Captures the href target of each <a href="..."> in a directory-listing page.
_links_pattern = re.compile(r'<a href="([^"]+)"')
def ignore(link):
    """Return True for links that should not be harvested:
    absolute paths and known non-content directory entries."""
    return link.startswith('/') or link in ('/', '_htaccess.txt', '.DS_Store')
def links(html):
    """Yield every non-ignored href target found in *html*, in order.

    Equivalent to repeatedly searching from the end of the previous
    match; finditer does exactly that for this non-empty pattern.
    """
    for match in _links_pattern.finditer(html):
        target = match.group(1)
        if not ignore(target):
            yield target
def build_fs_path(file_path):
    """Map a path relative to MOF_URL onto the local file repository.

    Requires an active Flask application context (reads the
    PUBDOCS_FILE_REPO config value, a path-like repo root).
    """
    repo_root = flask.current_app.config['PUBDOCS_FILE_REPO']
    return repo_root / file_path
def register_commands(manager):
@manager.command
def download_index():
""" Generate index of MOF downloadable files. """
resp = requests.get(MOF_URL)
for link in links(resp.text):
print>>sys.stderr, '>', link
resp2 = requests.get(MOF_URL + link)
for link2 in links(resp2.text):
print>>sys.stderr, '>>>', link2
resp3 = requests.get(MOF_URL + link + link2)
for link3 in links(resp3.text):
print>>sys.stderr, '>>>>>', link3
print MOF_URL + link + link2 + link3
print>>sys.stderr, len(MOF_URL)
@manager.command
def download(file_path):
""" Download a file right now. """
download_mof(file_path)
@manager.command
def schedule_downloads(limit='100'):
""" Schedule files for download by workers. """
links = path(os.environ['PUBDOCS_LINKS']).text().strip().split()
scheduled = 0
skipped = 0
for url in links:
assert url.startswith(MOF_URL)
file_path = url[len(MOF_URL):]
fs_path = build_fs_path(file_path)
if fs_path.isfile():
skipped += 1
continue
download_mof.delay(file_path)
scheduled += 1
if scheduled >= int(limit):
break
log.info('Scheduled %d downloads, skipped %d', scheduled, skipped)
def appcontext(func):
    """Decorator: run *func* inside a fresh Flask application context.

    functools.wraps is essential here: Celery derives a task's name from
    the decorated function's __name__, so without it every task stacked
    on this decorator would be registered as 'wrapper'.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Imported lazily to avoid a circular import at module load time.
        import manage
        app = manage.create_app()
        with app.app_context():
            return func(*args, **kwargs)
    return wrapper
@celery.task
@appcontext
def download_mof(file_path, overwrite=False):
    """Download one MOF file into the local repository.

    file_path -- path relative to MOF_URL and to the file repo root.
    overwrite -- when True, re-download even if the file already exists
                 (previously accepted but ignored).
    """
    url = MOF_URL + file_path
    fs_path = build_fs_path(file_path)
    if fs_path.isfile() and not overwrite:
        log.info("File %r already exists, skipping", str(fs_path))
        return
    fs_path.parent.makedirs_p()
    # NOTE(review): `prefetch=False` is the pre-1.0 requests spelling of
    # streaming downloads (`stream=True` in modern requests) — confirm
    # the pinned requests version before upgrading.
    resp = requests.get(url, prefetch=False)
    if resp.status_code != 200:
        log.error("Request error at %r: %r", url, resp)
        # Bail out: previously the error body fell through and was saved
        # to the repo as if it were the real file.
        return
    # Stream into a temp file in the destination directory, then rename,
    # so a partial download is never visible under its final name.
    tmp = tempfile.NamedTemporaryFile(dir=fs_path.parent,
                                      delete=False,
                                      prefix=fs_path.name + '-',
                                      suffix='.tmp')
    with tmp:
        for block in resp.iter_content(65536):
            tmp.write(block)
    path(tmp.name).rename(fs_path)
    log.info("Downloaded %r (%d)", file_path, fs_path.stat().st_size)