def build_vocabularies(self, rows: RDD):
    """
    Count how often every value and path occurs and give each one an index.

    Each row is parsed with ``Vocabulary2Id._unstringify_path_context``; the
    first three components of the parsed result are each counted once per
    row. Keys that are strings are treated as values, keys that are tuples
    are treated as paths.

    :param rows: RDD of rows accepted by \
                 ``Vocabulary2Id._unstringify_path_context``. \
                 NOTE(review): an earlier description gave the row structure \
                 as ((key, doc), val) with key = path context string, \
                 doc = file name, val = occurrences of key in doc; the code \
                 passes each row whole to the helper and never reads val - \
                 confirm against the caller.
    :return: tuple ``(value2index, path2index, value2freq, path2freq)``; the \
             indices follow the order in which Spark returns the collected \
             items.
    """
    def _flatten_row(row: Row):
        # Emit one (item, 1) pair per component so reduceByKey can sum them.
        k = Vocabulary2Id._unstringify_path_context(row)
        return [(k[0], 1), (k[1], 1), (k[2], 1)]

    # Persisted because the aggregated counts are scanned twice below.
    counts = rows \
        .flatMap(_flatten_row) \
        .reduceByKey(operator.add) \
        .persist()
    try:
        values = counts.filter(lambda kv: isinstance(kv[0], str)).collect()
        paths = counts.filter(lambda kv: isinstance(kv[0], tuple)).collect()
    finally:
        # Release the cached partitions even when a collect() fails.
        counts.unpersist()

    value2index = {value: index for index, (value, _) in enumerate(values)}
    path2index = {path: index for index, (path, _) in enumerate(paths)}
    value2freq = {value: freq for value, freq in values}
    path2freq = {path: freq for path, freq in paths}
    return value2index, path2index, value2freq, path2freq
def analyze(rddDns: RDD) -> Dict[str, Result]:
    """
    Run the unigram analysis on the domains seen by every flagged source IP.

    Steps:
      1. Keep the packets selected by ``compose(operator.not_, premiseCheck_)``
         (presumably those failing ``premiseCheck`` - confirm ``compose``
         ordering) and gather their distinct source IPs as strings.
      2. For each such IP, collect the distinct parsed domain names of all
         packets in ``rddDns`` whose ``dip`` or ``sip`` matches it.
      3. Feed each IP's domain array to ``unigramAnalysis``.

    The time spent in each step is logged at info level.

    NOTE(review): the values of the returned dict are ``repr(...)`` strings of
    the ``unigramAnalysis`` output, not the objects themselves, despite the
    ``Result`` annotation.
    NOTE(review): ``rddDns`` is unpersisted before returning although nothing
    here caches it - presumably the caller did; confirm.

    :param rddDns: RDD of DNS packet objects exposing ``sip``, ``dip`` and \
                   ``getName()``.
    :return: dict mapping ``str(ip)`` to ``repr(unigramAnalysis(domains))``.
    """
    log = getLogger()
    premiseCheck_ = functools.partial(
        premiseCheck,
        Global.ALLOWED_NAME_LEN,
        Global.RESTRICTED_SYMS,
        Global.MAX_BODY_SIZE,
        Global.MIN_TTL,
    )

    # Step 1: distinct source IPs of the selected packets, one list per partition.
    timer = Timer()
    selector = compose(operator.not_, premiseCheck_)
    flaggedIps = rddDns.filter(selector).map(lambda dns: str(dns.sip)).distinct()
    ipPartGen = flaggedIps.glom().toLocalIterator()
    log.info(f'Time spent on premis analysis = {timer.elapsed()}')

    # Step 2: domains per IP.
    # TODO: this launches one Spark job per IP and should be refactored.
    timer = Timer()
    ipdoms = {}
    for ipPart in ipPartGen:
        for ip in set(ipPart):
            if ip in ipdoms:
                continue
            log.debug(ip)
            packets = rddDns.filter(lambda dns: ip in [dns.dip, dns.sip])
            domains = packets.map(lambda dns: parseDomain(str(dns.getName())))
            ipdoms[ip] = np.array(domains.distinct().collect())
            log.debug(ipdoms[ip])
    log.info(f'Time spent on searching packets for chosen IPs = {timer.elapsed()}')

    # Step 3: unigram analysis of every IP's domain set.
    timer = Timer()
    result = {
        str(ip): repr(unigramAnalysis(doms))
        for ip, doms in ipdoms.items()
    }
    log.info(f'Time spent on unigram distribution analysis = {timer.elapsed()}')

    rddDns.unpersist()
    return result