forked from singhj/locality-sensitive-hashing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
peer_belt_driver.py
146 lines (118 loc) · 5.16 KB
/
peer_belt_driver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import webapp2, re, zipfile
import jinja2
from google.appengine.api import users
from bs4 import BeautifulSoup
import logging
from repositories.gae.blobstore import get_all_blob_info
from repositories.gae.blob_dataset import BlobDataset
from repositories.gae.blobstore import create_upload_url, get_blob_key
from pipelines.gae.lsh_pipelines import LshBlobPipeline
from pipelines.gae.map_reduce_pipeline_factory import MapReducePipelineFactory
from lsh_map_reduce.lsh_map_base import LshMapBase
from lsh_map_reduce.lsh_reduce_base import LshReduceBase
from utils.zip_utils import all_matching_files
# Runs of non-word characters; parse_text replaces each run with one space.
# Raw string: '\W' is not a valid escape in an ordinary string literal.
symbols = re.compile(r'\W+')
# One record per line. Group 1 is the id that precedes ":html"; group 2 is
# everything after the opening quote of the "text" value up to the last "}"
# (so it still carries the value's closing quote). DOTALL lets "." cross
# newlines embedded in the text.
text_file_pattern = re.compile('^{"id":"([^"]*):html","text":"(.*)}', flags=re.DOTALL)
def parse_text(text):
    """Reduce an HTML fragment to lower-case words separated by single spaces.

    Literal backslash-n sequences in ``text`` are turned into spaces before
    parsing, ``<script>`` and ``<style>`` elements are dropped, and every run
    of non-word characters in the remaining text becomes one space.
    """
    soup = BeautifulSoup(text.replace('\\n', ' '))
    # Detach script/style elements so their contents never reach get_text().
    for tag in soup(['script', 'style']):
        tag.extract()
    visible = soup.get_text(separator=' ', strip=True)
    # split()/join collapses whatever whitespace the substitution left behind.
    words = symbols.sub(' ', visible.lower()).split()
    return ' '.join(words)
class MainHandler(webapp2.RequestHandler):
    """List the stored BlobDataset entities and start LSH pipelines over them."""

    template_env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"),
                                      autoescape=True)

    def get(self):
        """Render ``blobs2.html`` with every BlobDataset and a fresh upload URL.

        Anonymous visitors are redirected to the login page instead of being
        rendered a page.
        """
        logging.info('Peer Belt Driver get method called!')
        user = users.get_current_user()
        if user is None:
            # get_current_user() returns None when nobody is signed in; calling
            # nickname() on it would raise AttributeError.
            self.redirect(users.create_login_url(self.request.uri))
            return
        username = user.nickname()
        items = list(BlobDataset.query())
        for item in items:
            # Give the template a URL-safe form of each entity key.
            item.ds_key = item.key.urlsafe()
        upload_url = create_upload_url("upload_blob2")
        template = self.template_env.get_template("blobs2.html")
        self.response.out.write(template.render({
            "username": username,
            "items": items,
            "length": len(items),
            "upload_url": upload_url,
        }))

    def post(self):
        """Start an LshBlobPipeline for the posted blob and redirect to its status page.

        Reads the ``filename``, ``blobkey`` and ``ds_key`` request parameters.
        """
        filename = self.request.get("filename")
        blob_key = self.request.get("blobkey")
        ds_key = self.request.get("ds_key")
        map_reduce_pipeline_dict = self.get_pipeline(blob_key)
        logging.info('filename %s key %s', filename, blob_key)
        logging.info(map_reduce_pipeline_dict)
        pipeline = LshBlobPipeline(filename, blob_key, ds_key, map_reduce_pipeline_dict)
        pipeline.start()
        self.redirect(pipeline.base_path + "/status?root=" + pipeline.pipeline_id)

    def get_pipeline(self, blob_key):
        """Return the map-reduce pipeline description for ``blob_key``.

        The mapper and reducer are named by dotted path and resolve to this
        module's ``map`` and ``reduce`` functions. The result is whatever
        ``MapReducePipelineFactory(...).create2()`` returns (presumably a dict,
        going by how ``post`` names it -- confirm against the factory).
        """
        return MapReducePipelineFactory(
            "locality_sensitive_hashing",
            "peer_belt_driver.map",
            "peer_belt_driver.reduce",
            'mapreduce.input_readers.BlobstoreZipLineInputReader',
            "mapreduce.output_writers.BlobstoreOutputWriter",
            mapper_params={
                "blob_keys": blob_key,
            },
            reducer_params={
                "mime_type": "text/plain",
            },
            shards=16).create2()
class PeerLshMap(LshMapBase):
    """PeerBelt-specific hooks that turn one input line into cleaned text."""

    @classmethod
    def pre_process(cls, data):
        """Extract the dataset, record id and cleaned text from one input record.

        ``data`` is indexed as ``((blob_key, ...), line)``.

        Returns:
            ``(dataset, _id, text)``, or ``None`` when ``line`` does not match
            ``text_file_pattern``.
        """
        blob_key = data[0][0]
        line = data[1]
        found_pattern = text_file_pattern.search(line)
        if not found_pattern:
            return None
        _id, raw_text = found_pattern.group(1), found_pattern.group(2)
        dataset = BlobDataset.query(BlobDataset.blob_key == get_blob_key(blob_key)).get()
        text = parse_text(raw_text)
        return (dataset, _id, text)

    @classmethod
    def parsed_text(cls, text):
        """Return ``text`` cleaned by the module-level ``parse_text`` helper.

        Delegates rather than repeating the same BeautifulSoup steps, so the
        two entry points cannot drift apart.
        """
        return parse_text(text)
class PeerLshReduce(LshReduceBase):
    """PeerBelt-specific reducer that passes each group through unchanged."""

    @classmethod
    def reduce(cls, key, values):
        """Yield the single pair ``(key, values)`` exactly as received."""
        pair = (key, values)
        yield pair
# Module-level wrappers that call the PeerBelt-specific map and reduce classes.
# MainHandler.get_pipeline refers to them by dotted path ("peer_belt_driver.map",
# "peer_belt_driver.reduce"), so the names must stay as they are even though
# they shadow the builtins inside this module.
def map(data):
    """Run PeerLshMap.map on one input record and hand its result back.

    The result is returned rather than dropped so that anything the mapper
    produces reaches the caller. LshMapBase.map is not visible from this file;
    if it returns None this is equivalent to the previous behaviour.
    """
    return PeerLshMap.map(data)


def reduce(key, values):
    """Run PeerLshReduce.reduce on one group and hand its generator back.

    PeerLshReduce.reduce is a generator function, so calling it without
    returning the generator meant its body never executed.
    NOTE(review): now that output is propagated, confirm the configured output
    writer accepts the (key, values) tuples that PeerLshReduce yields.
    """
    return PeerLshReduce.reduce(key, values)
class ViewHandler(webapp2.RequestHandler):
    """Show the stored text of one record from an uploaded zip dataset."""

    def get(self, dataset_name, file_id):
        """Write the text of record ``file_id`` from the blob named ``dataset_name``.

        Scans the blobs from ``get_all_blob_info()`` for the first one whose
        filename equals ``dataset_name``, then scans every member of that zip
        for a record whose id equals ``file_id``. Writes a short HTML message
        when either the blob or the record is missing.
        """
        for blob_info, blob_reader in get_all_blob_info():
            if blob_info.filename != dataset_name:
                continue
            # The context manager closes the archive once the scan is over,
            # whether or not a record matched.
            with zipfile.ZipFile(blob_reader) as zip_reader:
                for member in zip_reader.namelist():
                    for _lno, _mno, _id, text in all_matching_files(
                            zip_reader, member, text_file_pattern):
                        if file_id == _id:
                            # Literal backslash-n sequences become spaces. The
                            # record is written unescaped.
                            # NOTE(review): presumably the stored text is HTML
                            # meant to be rendered -- confirm it is trusted.
                            self.response.out.write(text.replace('\\n', ' '))
                            return
            self._write_not_found('ID %s not found' % file_id)
            return
        self._write_not_found('Blob %s not found' % dataset_name)

    def _write_not_found(self, message):
        """Write ``message`` inside a minimal HTML page, HTML-escaped.

        The message embeds values taken from the request URL, so it must be
        escaped before being placed in markup.
        """
        try:  # Python 3
            from html import escape
        except ImportError:  # Python 2
            from cgi import escape
        self.response.out.write(
            '<html><body><p>%s</p></body></html>' % escape(message))
from handlers.gae.upload_handler import UploadHandler
from handlers.gae.server_handler import ServeHandler
# Route table: (URL regex, handler class) pairs. The two optional capture
# groups of the /view2 route line up with ViewHandler.get's dataset_name and
# file_id parameters.
urls = [
    ('/blobs2', MainHandler),
    ('/upload_blob2', UploadHandler),
    ('/serve_blob2/([^/]+)?', ServeHandler),
    ('/view2/([^/]+)?/([^/]+)?', ViewHandler),
]