def make_json(line):
    # Collect the non-NaN client and server header names from the record and
    # join them into a single comma-separated string.
    client_headers = line.get('client_header_names')
    if is_nan(client_headers):
        client_headers = list()
    server_headers = line.get('server_header_names')
    if is_nan(server_headers):
        server_headers = list()
    headers = list()
    headers.extend(filter(lambda header: not is_nan(header), client_headers))
    headers.extend(filter(lambda header: not is_nan(header), server_headers))
    return ','.join(filter(lambda header: len(header), headers))
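# `is_nan` is used throughout these helpers but is not defined in this fragment.
# A minimal sketch of what it is assumed to do is given below; the actual
# implementation in the source module may differ.
import math

def is_nan(value):
    """Return True for None or a float NaN, False for everything else (sketch)."""
    if value is None:
        return True
    try:
        return math.isnan(value)
    except TypeError:
        return False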
def make_url(line):
    host = line.get('host')
    if is_nan(host):
        host = str()
    uri = line.get('uri')
    if is_nan(uri):
        uri = str()
    url = urllib.parse.urljoin(host, uri)
    port = int(line['id.resp_p'])
    if port == 80:
        base = 'http://%s' % line['id.resp_h']
    else:
        base = 'http://%s:%s' % (line['id.resp_h'], line['id.resp_p'])
    return urllib.parse.urljoin(base, url)
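# Hedged usage sketch for make_url (not from the original source): the field
# names follow Zeek/Bro http.log conventions and the values are made up.
_example_http_record = {'host': 'example.com', 'uri': '/index.html',
                        'id.resp_h': '93.184.216.34', 'id.resp_p': 8080}
# make_url(_example_http_record) -> 'http://93.184.216.34:8080/index.html'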
def tokenize_from_tsv(tokenizer_name: str, input_path: str, output_path: str,
                      y_index: int = 0, x_index: int = 1,
                      y_header: str = "label", x_header: str = "text") -> None:
    """Tokenize the rows of the input_path file and save them to the output_path file.

    Args:
        tokenizer_name: name of the tokenizer passed to get_tokenizer.
        input_path: tab-separated input file with a header row.
        output_path: tab-separated output file to be written.
        y_index: column index of the label in the input file.
        x_index: column index of the text in the input file.
        y_header: label column header written to the output file.
        x_header: text column header written to the output file.
    """
    tokenizer = get_tokenizer(tokenizer_name)
    df = pd.read_csv(input_path, header=0, sep="\t")
    total = len(df)
    print(">> Start Tokenizing This File Like Below...")
    print(df.head(-10))
    with open(output_path, "w", encoding="utf-8") as f1:
        f1.write(y_header + "\t" + x_header + "\n")
        row_iterator = df.iterrows()
        for index, row in tqdm(row_iterator, total=total):
            sentence = row[x_index]
            label = row[y_index]
            if is_nan(sentence) or is_nan(label):
                continue
            replaced = label.replace(" ", "_")
            sentence = sentence.replace("\n", "").strip()
            tokens = tokenizer(sentence)
            if is_nan(tokens) or tokens == "":
                continue
            tokenized_sent = " ".join(_post_processing(tokens))
            f1.write(replaced + "\t" + tokenized_sent + "\n")
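# Hedged usage sketch for tokenize_from_tsv (the tokenizer name and paths are
# hypothetical, not taken from the source):
#
#   tokenize_from_tsv("mecab", "ratings_train.tsv", "ratings_train.tok.tsv",
#                     y_index=0, x_index=1)
#
# Each output row then looks like "<label>\t<token token ...>", with spaces in
# the label replaced by underscores.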
def communicate(log_root):
    log_file = os.path.join(log_root, 'files.log')
    if not os.path.isfile(log_file):
        return
    LOG_FILE = parse(log_file)
    for line in LOG_FILE.context.itertuples():
        if is_nan(getattr(line, 'extracted', None)):
            continue
        local_name = line.extracted
        dump_path = os.path.join(DUMP_PATH, local_name)
        if not os.path.exists(dump_path):
            warnings.warn(f'No such file or directory: {local_name!r}', ExtractWarning)
    return
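# `ExtractWarning` is referenced above but not defined in this fragment.
# A minimal sketch under that assumption; the real class may carry more detail.
class ExtractWarning(Warning):
    """Warning issued when an extracted file listed in files.log is missing."""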
def generate_log(log_name):
    global DATE
    date = time.strftime('%Y-%m-%d')
    if date != DATE:
        archive(DATE)
        DATE = date
    INFO = os.path.join(LOGS_PATH, 'info', f'{DATE}.log')

    log_stem = log_name
    log_root = os.path.join(LOGS_PATH, log_name)
    log_uuid = re.match(r'.*?-(?P<uuid>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})',
                        log_stem, re.IGNORECASE).group('uuid')

    log_file = os.path.join(log_root, 'files.log')
    if not os.path.isfile(log_file):
        return

    LOG_FILE = parse(log_file)
    LOG_CONN = parse(os.path.join(log_root, 'conn.log'))
    for line in LOG_FILE.context.itertuples():
        if is_nan(getattr(line, 'extracted', None)):
            continue

        # transmitting/receiving host pairs recorded for the extracted file
        hosts = [dict(tx=ipaddress.ip_address(tx), rx=ipaddress.ip_address(rx))
                 for (tx, rx) in zip(line.tx_hosts, line.rx_hosts)]

        # resolve each connection UID against conn.log to recover the endpoints
        conns = list()
        is_orig = line.is_orig
        for conn_uid in line.conn_uids:
            record = next(LOG_CONN.context[lambda df: df.uid == conn_uid].iterrows())[1]  # pylint: disable=cell-var-from-loop
            if is_orig:
                conn = dict(src_h=ipaddress.ip_address(record['id.orig_h']),
                            src_p=int(record['id.orig_p']),
                            dst_h=ipaddress.ip_address(record['id.resp_h']),
                            dst_p=int(record['id.resp_p']))
            else:
                conn = dict(src_h=ipaddress.ip_address(record['id.resp_h']),
                            src_p=int(record['id.resp_p']),
                            dst_h=ipaddress.ip_address(record['id.orig_h']),
                            dst_p=int(record['id.orig_p']))
            conns.append(conn)

        local_name = line.extracted
        mime_type = None
        dump_path = os.path.join(DUMP_PATH, local_name)
        if os.path.exists(dump_path):
            with contextlib.suppress(Exception):
                mime_type = magic.detect_from_filename(dump_path).mime_type
            # if mime_type is None or MIME_REGEX.match(mime_type) is None:
            #     if MIME_MODE:
            #         local_name = rename_dump(local_name, line.mime_type)
            # else:
            #     if MIME_MODE or (mime_type != line.mime_type):  # pylint: disable=else-if-used
            #         local_name = rename_dump(local_name, mime_type)
        else:
            dump_path = None

        info = dict(timestamp=line.ts if LOG_FILE.format == 'json' else line.ts.timestamp(),
                    log_uuid=log_uuid,
                    log_path=log_root,
                    log_name=log_stem,
                    dump_path=dump_path,
                    local_name=local_name,
                    source_name=getattr(line, 'filename', None),
                    hosts=hosts,
                    conns=conns,
                    bro_mime_type=line.mime_type,
                    real_mime_type=mime_type,
                    hash=dict(md5=getattr(line, 'md5', None),
                              sha1=getattr(line, 'sha1', None),
                              sha256=getattr(line, 'sha256', None)))
        print_file(json.dumps(info, cls=IPAddressJSONEncoder), file=INFO)
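# `IPAddressJSONEncoder` is referenced above but not defined in this fragment.
# A minimal sketch of what it is assumed to do: serialise ipaddress objects as
# plain strings so json.dumps() accepts the `info` record. The actual encoder
# in the source may handle more types.
import ipaddress
import json

class IPAddressJSONEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, (ipaddress.IPv4Address, ipaddress.IPv6Address)):
            return str(o)
        return super().default(o)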
def write(self, col, row, data, cell_format):
    if not (data is None or is_inf(data) or is_nan(data)):  # nan, inf, None check
        self.sheet.write(col, row, data, cell_format)
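# `is_inf` mirrors `is_nan` but is likewise undefined in this fragment; a
# minimal sketch under that assumption:
import math

def is_inf(value):
    """Return True for float infinities, False otherwise (sketch)."""
    try:
        return math.isinf(value)
    except TypeError:
        return False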
def make_b64(data):
    if is_nan(data):
        return None
    return base64.b64encode(data.encode()).decode()
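# Hedged usage sketch for make_b64 (illustrative values only; the NaN case
# depends on the assumed behaviour of is_nan):
#
#   make_b64('hello')       -> 'aGVsbG8='
#   make_b64(float('nan'))  -> None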
def load_business_data(data_path):
    patterns = {}
    question_full = {}
    df = pd.read_excel(data_path, header=0, sheet_name=u'Working Sheet đợt 1')
    for row in df.iterrows():
        try:
            data = row[1]
            question = unicodedata.normalize('NFKC', data[2])
            response = unicodedata.normalize('NFKC', data[15])
            revise = data[8]
            if utils.is_nan(revise):
                if not utils.is_nan(data[3]):
                    principal_npvp = unicodedata.normalize('NFKC', data[3]).strip()
                    principal_npvp = map(lambda x: x.strip(), principal_npvp.split(u','))
                    principal_npvp = u' '.join(principal_npvp)
                else:
                    principal_npvp = u''
                if not utils.is_nan(data[4]):
                    npvp = unicodedata.normalize('NFKC', data[4]).strip()
                    npvp = map(lambda x: x.strip(), npvp.split(u','))
                    npvp = u' '.join(npvp)
                else:
                    npvp = u''
                if not utils.is_nan(data[5]):
                    verb = unicodedata.normalize('NFKC', data[5]).strip()
                    verb = map(lambda x: x.strip(), verb.split(u','))
                    verb = u' '.join(verb)
                else:
                    verb = u''
                if not utils.is_nan(data[6]):
                    wh_question = unicodedata.normalize('NFKC', data[6]).strip()
                    wh_question = map(lambda x: x.strip(), wh_question.split(u','))
                    wh_question = u' '.join(wh_question)
                else:
                    wh_question = u''
            else:
                if not utils.is_nan(data[9]):
                    principal_npvp = unicodedata.normalize('NFKC', data[9]).strip()
                    principal_npvp = map(lambda x: x.strip(), principal_npvp.split(u','))
                    principal_npvp = u' '.join(principal_npvp)
                else:
                    principal_npvp = u''
                if not utils.is_nan(data[10]):
                    npvp = unicodedata.normalize('NFKC', data[10]).strip()
                    npvp = map(lambda x: x.strip(), npvp.split(u','))
                    npvp = u' '.join(npvp)
                else:
                    npvp = u''
                if not utils.is_nan(data[11]):
                    verb = unicodedata.normalize('NFKC', data[11]).strip()
                    verb = map(lambda x: x.strip(), verb.split(u','))
                    verb = u' '.join(verb)
                else:
                    verb = u''
                if not utils.is_nan(data[12]):
                    wh_question = unicodedata.normalize('NFKC', data[12]).strip()
                    wh_question = map(lambda x: x.strip(), wh_question.split(u','))
                    wh_question = u' '.join(wh_question)
                else:
                    wh_question = u''
        except Exception:
            continue
        question = preprocessing(question, tokenize=False)
        s = normalize_space.sub(u' ', u' '.join([principal_npvp, npvp, verb, wh_question]))
        # keep only the question words that also appear in the keyword cells
        words = []
        for w in question.lower().split():
            if w not in s:
                continue
            words.append(w)
        pattern = u' '.join(words)
        pattern = preprocessing(pattern, tokenize=False)
        pattern = utils.normalize_abb(pattern)
        # bigram_pattern = utils.get_bigram_content(pattern)
        # pattern = u' '.join([pattern, bigram_pattern])
        # patterns.update({pattern: response})
        # question_full.update({pattern: question})
        pattern = utils.normalize_abb(pattern)
        if cmnd in pattern:
            pattern = utils.emphasize_token(cmnd, pattern, n=3)
        if cccd in pattern:
            pattern = utils.emphasize_token(cccd, pattern, n=3)
        if pattern == u'':
            continue
        patterns.update({pattern: response})
        question_full.update({pattern: question})
    return patterns, question_full
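# Hedged usage sketch for load_business_data (the file name is hypothetical;
# the sheet layout is inferred from the column indices used above):
#
#   patterns, question_full = load_business_data('business_data.xlsx')
#
# `patterns` maps each normalised keyword pattern to its canned response, and
# `question_full` maps the same pattern back to the original question text.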