def add_edge(self, src_sid, dst_sid, label):
    """Queue a ``label`` edge between two SIDs of the current AD.

    Both SIDs are translated to ``EdgeLookup`` ids scoped to ``self.ad_id``.
    A SID without a lookup row gets one created with the object type
    ``'unknown'``; that insert is committed and refreshed right away so its
    id can be read back. The resulting ``Edge`` is only added to the
    session -- this method does not commit it.
    """
    def resolve(sid):
        # Map a SID to its lookup id, inserting a placeholder row if needed.
        row = (
            self.session.query(EdgeLookup.id)
            .filter_by(oid=sid)
            .filter(EdgeLookup.ad_id == self.ad_id)
            .first()
        )
        if row is not None:
            return row[0]

        # this should not happen
        placeholder = EdgeLookup(self.ad_id, sid, 'unknown')
        self.session.add(placeholder)
        self.session.commit()
        self.session.refresh(placeholder)
        return placeholder.id

    self.total_edges += 1
    # Source is resolved before destination, matching the lookup order
    # callers may rely on when placeholder rows get created.
    src_id = resolve(src_sid)
    dst_id = resolve(dst_sid)
    self.session.add(Edge(self.ad_id, src_id, dst_id, label))
def add_edge(self, src_sid, dst_sid, label, with_boost = False):
    """Queue a ``label`` edge between two SIDs in the current graph.

    The SIDs are converted to ids through ``self.get_id_for_sid`` (source
    first, then destination), forwarding ``with_boost`` unchanged. The new
    ``Edge`` is added to the session, and the session is committed each
    time the running edge counter reaches a multiple of 10000.
    """
    self.total_edges += 1

    src_id, dst_id = (
        self.get_id_for_sid(sid, with_boost=with_boost)
        for sid in (src_sid, dst_sid)
    )
    self.session.add(
        Edge(self.ad_id, self.graph_id, src_id, dst_id, label)
    )

    # Commit in batches rather than once per edge.
    if not self.total_edges % 10000:
        self.session.commit()
async def calc_sds_mp(self):
    """Calculate the security-descriptor (SD) edges and load them into the DB.

    The work is done in two stages:

    1. All ``JackDawSD`` rows belonging to ``self.ad_id`` are read in
       windows of ``self.buffer_size`` and passed, batch by batch, to
       ``self.calc_sds_batch`` together with a temporary file (the batch
       routine is expected to write the computed edges to that file).
    2. The temporary file is rewound and read line by line; each line is
       split on ``,`` into ``src_id, dst_id, label`` plus one ignored
       field and stored as an ``Edge``. The session is committed every
       ``self.buffer_size * 100`` lines and once more at the end.

    When ``self.progress_queue`` is set, STARTED / PROGRESS / FINISHED
    messages are queued for both stages (``SDCALC`` and ``SDCALCUPLOAD``).
    When ``self.show_progress`` is true, tqdm bars are shown as well.

    Returns:
        ``(True, None)`` on success, or ``(False, exc)`` when any exception
        was raised (the exception is logged, not propagated).
    """
    def new_msg(progress_type, msg_type):
        # Build a progress message carrying the fields shared by all of them.
        msg = GathererProgress()
        msg.type = progress_type
        msg.msg_type = msg_type
        msg.adid = self.ad_id
        msg.domain_name = self.domain_name
        return msg

    def seconds_since_last_update():
        # Return the time elapsed since the previous progress report and
        # restart the clock.
        now = datetime.datetime.utcnow()
        elapsed = (now - self.progress_last_updated).total_seconds()
        self.progress_last_updated = now
        return elapsed

    await self.log_msg('Calculating SD edges')
    logger.debug('starting calc_sds_mp')
    try:
        total = self.session.query(func.count(JackDawSD.id)).filter(
            JackDawSD.ad_id == self.ad_id).scalar()
        logger.debug('calc_sds_mp total SDs %s', str(total))
        q = self.session.query(JackDawSD).filter_by(ad_id=self.ad_id)

        if self.progress_queue is not None:
            await self.progress_queue.put(
                new_msg(GathererProgressType.SDCALC, MSGTYPE.STARTED))

        sdcalc_pbar = None
        if self.show_progress is True:
            sdcalc_pbar = tqdm(desc='Writing SD edges to file', total=total)

        # The context manager guarantees the temporary file is closed (and
        # therefore deleted) on every exit path, including errors.
        with tempfile.TemporaryFile('w+', newline='') as testfile:
            buffer = []
            if self.mp_pool is None:
                self.mp_pool = mp.Pool()

            logger.debug('calc_sds_mp starting calc')
            tf = 0  # number of SD rows read so far
            last_stat_cnt = 0  # rows already covered by progress reports
            try:
                for adsd in windowed_query(q, JackDawSD.id, self.buffer_size):
                    tf += 1
                    # Round-trip through a dict; presumably to obtain a copy
                    # that is not bound to the session -- TODO confirm.
                    buffer.append(JackDawSD.from_dict(adsd.to_dict()))
                    if len(buffer) == self.buffer_size:
                        self.calc_sds_batch(buffer, testfile)
                        buffer = []
                        if sdcalc_pbar is not None:
                            sdcalc_pbar.update(self.buffer_size)

                    if self.progress_queue is not None and tf % self.progress_step_size == 0:
                        last_stat_cnt += self.progress_step_size
                        td = seconds_since_last_update()
                        msg = new_msg(GathererProgressType.SDCALC, MSGTYPE.PROGRESS)
                        msg.total = total
                        msg.total_finished = tf
                        if td > 0:
                            msg.speed = str(self.progress_step_size // td)
                        msg.step_size = self.progress_step_size
                        await self.progress_queue.put(msg)
                        await asyncio.sleep(0)

                # Process whatever is left over after the last full batch.
                if len(buffer) > 0:
                    self.calc_sds_batch(buffer, testfile)
                    if self.progress_queue is not None:
                        td = seconds_since_last_update()
                        msg = new_msg(GathererProgressType.SDCALC, MSGTYPE.PROGRESS)
                        msg.total = total
                        msg.total_finished = tf
                        if td > 0:
                            msg.speed = str(len(buffer) // td)
                        msg.step_size = tf - last_stat_cnt
                        await self.progress_queue.put(msg)
                        await asyncio.sleep(0)
                    buffer = []

                if self.progress_queue is not None:
                    await self.progress_queue.put(
                        new_msg(GathererProgressType.SDCALC, MSGTYPE.FINISHED))

                if self.show_progress is True and sdcalc_pbar is not None:
                    sdcalc_pbar.refresh()
                    sdcalc_pbar.disable = True

            except Exception:
                logger.exception('SD calc exception!')
                raise
            finally:
                # Only close the pool when this object owns it.
                if self.foreign_pool is False:
                    self.mp_pool.close()

            # Total used for every upload-stage report. Previously the tqdm
            # bar and the final progress message used a counter that was
            # never incremented (always 0) while the in-loop messages used
            # self.sd_edges_written.
            # NOTE(review): sd_edges_written is presumably maintained by
            # calc_sds_batch -- confirm. Falls back to 0 if it is absent.
            upload_total = getattr(self, 'sd_edges_written', 0)

            if self.progress_queue is not None:
                await self.progress_queue.put(
                    new_msg(GathererProgressType.SDCALCUPLOAD, MSGTYPE.STARTED))

            logger.debug('Writing SD edge file contents to DB')
            await self.log_msg('Writing SD edge file contents to DB')

            sdcalcupload_pbar = None
            if self.show_progress is True:
                sdcalcupload_pbar = tqdm(
                    desc='Writing SD edge file contents to DB',
                    total=upload_total)

            testfile.seek(0, 0)
            last_stat_cnt = 0
            i = 0  # number of lines uploaded so far
            for line in testfile:
                i += 1
                # Each line holds four comma-separated fields; the last one
                # is not used here.
                src_id, dst_id, label, _ = line.strip().split(',')
                self.session.add(
                    Edge(self.ad_id, self.graph_id, src_id, dst_id, label))
                if i % (self.buffer_size * 100) == 0:
                    self.session.commit()

                if sdcalcupload_pbar is not None:
                    sdcalcupload_pbar.update()

                if self.progress_queue is not None and i % self.progress_step_size == 0:
                    last_stat_cnt += self.progress_step_size
                    td = seconds_since_last_update()
                    msg = new_msg(GathererProgressType.SDCALCUPLOAD, MSGTYPE.PROGRESS)
                    msg.total = upload_total
                    msg.total_finished = i
                    if td > 0:
                        msg.speed = str(self.progress_step_size // td)
                    msg.step_size = self.progress_step_size
                    await self.progress_queue.put(msg)
                    await asyncio.sleep(0)

            self.session.commit()

            if self.progress_queue is not None:
                # Report the lines uploaded since the last in-loop message.
                td = seconds_since_last_update()
                msg = new_msg(GathererProgressType.SDCALCUPLOAD, MSGTYPE.PROGRESS)
                msg.total = upload_total
                msg.total_finished = i
                if td > 0:
                    msg.speed = str((i - last_stat_cnt) // td)
                msg.step_size = i - last_stat_cnt
                await self.progress_queue.put(msg)
                await asyncio.sleep(0)

                await self.progress_queue.put(
                    new_msg(GathererProgressType.SDCALCUPLOAD, MSGTYPE.FINISHED))

            if self.show_progress is True and sdcalcupload_pbar is not None:
                sdcalcupload_pbar.refresh()
                sdcalcupload_pbar.disable = True

            return True, None
    except Exception as e:
        logger.exception('sdcalc!')
        return False, e
async def store_file_data(self):
    """Upload the membership records from the gzip token file as 'member' edges.

    ``self.token_file`` is closed first, then ``self.token_file_path`` is
    reopened as a gzip stream. Every line is parsed with
    ``JackDawTokenGroup.from_json``, both SIDs are resolved through
    ``self.sid_to_id_lookup`` and an ``Edge`` labelled ``'member'`` is added
    to the session. The session is committed every 10000 records and once
    more after the file has been consumed.

    When ``self.progress_queue`` is set, ``MEMBERSUPLOAD`` messages
    (STARTED, PROGRESS, FINISHED, or ERROR on failure) are queued. When
    ``self.show_progress`` is true a tqdm bar is stored in
    ``self.upload_pbar`` and advanced per record.

    The token file is removed afterwards regardless of the outcome.

    Returns:
        ``(True, None)`` on success, or ``(None, exc)`` when an exception
        was raised (the exception is logged, not propagated).
    """
    def new_msg(msg_type):
        # Build a MEMBERSUPLOAD message carrying the shared fields.
        msg = GathererProgress()
        msg.type = GathererProgressType.MEMBERSUPLOAD
        msg.msg_type = msg_type
        msg.adid = self.ad_id
        msg.domain_name = self.domain_name
        return msg

    try:
        if self.progress_queue is not None:
            await self.progress_queue.put(new_msg(MSGTYPE.STARTED))

        if self.show_progress is True:
            self.upload_pbar = tqdm(
                desc='Uploading memberships to DB',
                total=self.member_finish_ctr)

        # The writer handle must be closed before the file is read back.
        self.token_file.close()
        cnt = 0  # records uploaded so far
        last_stat_cnt = 0  # records already covered by progress reports
        with gzip.GzipFile(self.token_file_path, 'r') as f:
            for line in f:
                sd = JackDawTokenGroup.from_json(line.strip())
                src_id = self.sid_to_id_lookup(sd.sid, sd.ad_id, sd.object_type)
                # NOTE(review): the member lookup reuses sd.object_type, the
                # same value as the source lookup -- confirm this is intended.
                dst_id = self.sid_to_id_lookup(sd.member_sid, sd.ad_id, sd.object_type)
                edge = Edge(sd.ad_id, self.graph_id, src_id, dst_id, 'member')
                self.session.add(edge)
                # Yield to the event loop after every record.
                await asyncio.sleep(0)
                cnt += 1
                if cnt % 10000 == 0:
                    self.session.commit()

                if self.show_progress is True:
                    self.upload_pbar.update()

                if self.progress_queue is not None and cnt % self.progress_step_size == 0:
                    last_stat_cnt += self.progress_step_size
                    now = datetime.datetime.utcnow()
                    td = (now - self.progress_last_updated).total_seconds()
                    self.progress_last_updated = now
                    msg = new_msg(MSGTYPE.PROGRESS)
                    msg.total = self.member_finish_ctr
                    msg.total_finished = cnt
                    if td > 0:
                        msg.speed = str(self.progress_step_size // td)
                    msg.step_size = self.progress_step_size
                    await self.progress_queue.put(msg)

        if self.progress_queue is not None:
            # Closing report: its step size tops the reported steps up to
            # self.member_finish_ctr.
            now = datetime.datetime.utcnow()
            td = (now - self.progress_last_updated).total_seconds()
            self.progress_last_updated = now
            msg = new_msg(MSGTYPE.PROGRESS)
            msg.total = self.member_finish_ctr
            msg.total_finished = cnt
            if td > 0:
                msg.speed = str(
                    (self.member_finish_ctr - last_stat_cnt) // td)
            msg.step_size = self.member_finish_ctr - last_stat_cnt
            await self.progress_queue.put(msg)

        self.session.commit()

        if self.progress_queue is not None:
            await self.progress_queue.put(new_msg(MSGTYPE.FINISHED))

        return True, None

    except Exception as e:
        logger.exception(
            'Error while uploading memberships from file to DB')
        if self.progress_queue is not None:
            msg = new_msg(MSGTYPE.ERROR)
            msg.error = e
            await self.progress_queue.put(msg)
        return None, e

    finally:
        if self.token_file_path is not None:
            # Best-effort cleanup: a failed delete must not mask the result,
            # but only filesystem errors are ignored so that cancellation
            # and interrupts still propagate.
            try:
                os.remove(self.token_file_path)
            except OSError as err:
                logger.debug(
                    'Could not remove token file %s: %s',
                    self.token_file_path, err)