def create_links_map(self, links_index, links_type):
    """Stream every document in *links_index*/*links_type* through a Mapper
    and persist the resulting mappings to MAPPINGS_PATH.

    Uses the Elasticsearch scroll API (10-minute keep-alive, 10k docs per
    page) so the whole index never has to fit in one response.

    :param links_index: name of the Elasticsearch index to scan
    :param links_type: document type within that index
    """
    mapper = Mapper()
    # Open the scroll context with a match-all query.
    scroll = self.es.search(index=links_index, doc_type=links_type,
                            scroll='10m', size=10000,
                            body={"query": {"match_all": {}}})
    scroll_size = scroll['hits']['total']
    size = 0
    # Keep paging until a scroll page comes back empty.
    while scroll_size > 0:
        hits_list = scroll['hits']['hits']
        for hit in hits_list:
            # Map both endpoints of every stored link.
            mapper.map(hit['_source']['SRC_LINK'])
            mapper.map(hit['_source']['DST_LINK'])
        # An empty page means the scroll is exhausted.
        scroll_size = len(hits_list)
        size += scroll_size
        # BUG FIX: the original used the Python 2 print statement
        # (`print "..."`), which is a SyntaxError under the Python 3 this
        # file otherwise requires (async/await in Worker).
        print("scrolled %s \n" % size)
        # Fetch the next page of results.
        scroll = self.es.scroll(scroll_id=scroll['_scroll_id'], scroll='10m')
    mapper.write(MAPPINGS_PATH)
def main():
    """Demo driver: run a few sample phrases through the mapper and reduce
    the last mapping."""
    mapper = Mapper()
    reducer = Reducer()
    # All three map() calls are executed in the original order so any
    # mapper-side state matches the original; only the third result is
    # handed to the reducer, exactly as before.
    first_map = mapper.map("Esta à Frase, fRASe tomas frase esta unica única")
    second_map = mapper.map("única Este cena\n. frase única")
    third_map = mapper.map("à frase à")
    reduced = reducer.reduce([third_map])
    print("Reduced: ", reduced)
class Worker():
    """Map/reduce worker.

    Connects to a coordinator over TCP, registers itself, then serves
    map/reduce requests in a loop until the connection closes. Messages
    are JSON payloads framed by a fixed-width, zero-padded decimal length
    header of MAX_N_BYTES characters.
    """

    def __init__(self, worker_id, host, port):
        self.host = host
        self.port = port
        self.worker_id = worker_id
        self.mapper = Mapper()
        self.reducer = Reducer()
        self.logger = logging.getLogger('worker ' + str(self.worker_id))
        self.logger.debug('Worker connecting to %s:%d', self.host, self.port)

    def parse_msg(self, msg):
        """Frame *msg*: prepend its length as a zero-padded decimal string
        of MAX_N_BYTES characters so the peer can read a fixed-size header
        before the payload."""
        msg_len = len(msg)
        return '0' * (MAX_N_BYTES - len(str(int(msg_len)))) + str(msg_len) + msg

    def proccess_msg(self, msg):
        """Dispatch one decoded request dict.

        Returns the reply dict for 'map_request' / 'reduce_request',
        None for 'done' and for unknown tasks (which are only logged).
        """
        if msg['task'] == 'map_request':
            result = self.mapper.map(msg['value'])
            return {'task': 'map_reply', 'value': result}
        elif msg['task'] == 'reduce_request':
            result = self.reducer.reduce(msg['value'])
            return {'task': 'reduce_reply', 'value': result}
        elif msg['task'] == 'done':
            pass
        else:
            self.logger.debug('THIS IS NOT FOR ME: %s', msg['task'])

    def register(self):
        """Build the registration handshake message."""
        return {'task': 'register', 'id': self.worker_id}

    async def tcp_echo_client(self, host, port, loop):
        """Connect to the coordinator, register, then serve requests.

        Loop: read the fixed-width length header, read exactly that many
        payload bytes, process the request and send back the framed reply.
        On connection loss, sleep 3 s (giving a backup coordinator time to
        start) and return so the caller can reconnect.

        BUG FIX: the original used bare ``reader.read(n)`` calls, which may
        return FEWER bytes than requested — a short read of the header or a
        payload chunk corrupts the framing, and an EOF inside the chunk
        loop spins forever. ``readexactly`` guarantees complete reads and
        raises IncompleteReadError on EOF. Also dropped the ``loop=``
        keyword to asyncio.open_connection (deprecated in 3.8, removed in
        3.10); the *loop* parameter is kept for caller compatibility.
        """
        self.logger.debug('Openning connection')
        reader, writer = await asyncio.open_connection(host, port)

        # Register with the coordinator on first connect.
        to_send = self.register()
        parsed_msg = self.parse_msg(json.dumps(to_send))
        self.logger.info('Sending to: %s' % host)
        writer.write(parsed_msg.encode())
        await writer.drain()

        while True:
            # Read the fixed-size length header in full.
            try:
                header = await reader.readexactly(MAX_N_BYTES)
            except (ConnectionResetError, asyncio.IncompleteReadError):
                # Give the backup coordinator time to start.
                await asyncio.sleep(3)
                break

            total_size = int(header.decode())

            # Read exactly total_size payload bytes.
            try:
                payload = await reader.readexactly(total_size)
            except (ConnectionResetError, asyncio.IncompleteReadError):
                await asyncio.sleep(3)
                break

            final_str = payload.decode()
            self.logger.info('Received from: %s ' % host)

            to_send = self.proccess_msg(json.loads(final_str))
            if to_send is not None:
                parsed_msg = self.parse_msg(json.dumps(to_send))
                self.logger.info('Sending to: %s' % host)
                writer.write(parsed_msg.encode())
                await writer.drain()

        self.logger.info('Close the socket')
        writer.close()