Example 1
def make_json(line):
    client_headers = line.get('client_header_names')
    if is_nan(client_headers):
        client_headers = list()
    server_headers = line.get('server_header_names')
    if is_nan(server_headers):
        server_headers = list()

    headers = list()
    headers.extend(filter(lambda header: not is_nan(header), client_headers))
    headers.extend(filter(lambda header: not is_nan(header), server_headers))
    return ','.join(filter(lambda header: len(header), headers))
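All of these snippets call an is_nan helper that is not shown here. Below is a minimal sketch of such a helper, assuming values may be None, floats (possibly NaN), strings, or lists; the real helper may instead rely on pandas.isna:

import math

def is_nan(value):
    # Treat None and float NaN as missing; everything else (strings, lists, ...) is kept.
    if value is None:
        return True
    return isinstance(value, float) and math.isnan(value)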
Example 2
def make_url(line):
    host = line.get('host')
    if is_nan(host):
        host = str()
    uri = line.get('uri')
    if is_nan(uri):
        uri = str()
    url = urllib.parse.urljoin(host, uri)

    port = int(line['id.resp_p'])
    if port == 80:
        base = 'http://%s' % line['id.resp_h']
    else:
        base = 'http://%s:%s' % (line['id.resp_h'], line['id.resp_p'])
    return urllib.parse.urljoin(base, url)
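A hypothetical invocation of make_url on a hand-built line dict (field values are made up for illustration, not taken from a real log):

import urllib.parse

line = {
    'host': 'example.com',
    'uri': '/search?q=1',
    'id.resp_h': '93.184.216.34',
    'id.resp_p': '80',
}
# The schemeless host is discarded by the first urljoin, so the responder
# address wins; port 80 keeps the base URL without an explicit port.
print(make_url(line))  # http://93.184.216.34/search?q=1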
Example 3
def tokenize_from_tsv(tokenizer_name: str,
                      input_path: str,
                      output_path: str,
                      y_index: int = 0,
                      x_index: int = 1,
                      y_header: str = "label",
                      x_header: str = "text") -> None:
    """
    Tokenize the text column of the TSV at input_path and write the result to output_path.

    Args:
        tokenizer_name: name of the tokenizer loaded via get_tokenizer.
        input_path: path to the input TSV file (expects a header row).
        output_path: path where the tokenized TSV is written.
        y_index: column index of the label in each row.
        x_index: column index of the text in each row.
        y_header: label column header written to the output file.
        x_header: text column header written to the output file.
    """

    tokenizer = get_tokenizer(tokenizer_name)
    df = pd.read_csv(input_path, header=0, sep="\t")
    total = len(df)
    print(">> Strart Tokenizing This File Like Below...")
    print(df.head(-10))

    with open(output_path, "w", encoding="utf-8") as f1:
        f1.write(y_header + "\t" + x_header + "\n")
        row_iterator = df.iterrows()

        for index, row in tqdm(row_iterator, total=total):
            sentence = row[x_index]
            label = row[y_index]

            if is_nan(sentence) or is_nan(label):
                continue
            replaced = label.replace(" ", "_")
            sentence = sentence.replace("\n", "").strip()

            tokens = tokenizer(sentence)
            tokenized_sent = " ".join(_post_processing(tokens))
            if is_nan(tokens) or tokens == "":
                continue

            f1.write(replaced + "\t" + tokenized_sent + "\n")
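A possible call, assuming the input TSV has a header row with a label column followed by a text column; the file names and the tokenizer name are placeholders, since the set of names accepted by get_tokenizer is not shown:

tokenize_from_tsv(
    tokenizer_name="mecab",       # placeholder; must be a name get_tokenizer recognizes
    input_path="train.tsv",       # TSV with header: label<TAB>text
    output_path="train.tok.tsv",  # written as label<TAB>space-joined tokens
    y_index=0,
    x_index=1,
)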
Example 4
def communicate(log_root):
    log_file = os.path.join(log_root, 'files.log')
    if not os.path.isfile(log_file):
        return

    LOG_FILE = parse(log_file)
    for line in LOG_FILE.context.itertuples():
        if is_nan(getattr(line, 'extracted', None)):
            continue

        local_name = line.extracted
        dump_path = os.path.join(DUMP_PATH, local_name)
        if not os.path.exists(dump_path):
            warnings.warn(f'No such file or directory: {local_name!r}',
                          ExtractWarning)
            return
Example 5
def generate_log(log_name):
    global DATE
    date = time.strftime('%Y-%m-%d')
    if date != DATE:
        archive(DATE)
        DATE = date
    INFO = os.path.join(LOGS_PATH, 'info', f'{DATE}.log')

    log_stem = log_name
    log_root = os.path.join(LOGS_PATH, log_name)
    log_uuid = re.match(
        r'.*?-(?P<uuid>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})',
        log_stem, re.IGNORECASE).group('uuid')

    log_file = os.path.join(log_root, 'files.log')
    if not os.path.isfile(log_file):
        return

    LOG_FILE = parse(log_file)
    LOG_CONN = parse(os.path.join(log_root, 'conn.log'))
    for line in LOG_FILE.context.itertuples():
        if is_nan(getattr(line, 'extracted', None)):
            continue
        hosts = [
            dict(tx=ipaddress.ip_address(tx), rx=ipaddress.ip_address(rx))
            for (tx, rx) in zip(line.tx_hosts, line.rx_hosts)
        ]

        conns = list()
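        # is_orig marks whether the extracted file was sent by the connection
        # originator; it decides which endpoint is recorded as source below.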
        is_orig = line.is_orig
        for conn_uid in line.conn_uids:
            record = next(
                LOG_CONN.context[lambda df: df.uid == conn_uid].iterrows())[1]  # pylint: disable=cell-var-from-loop
            if is_orig:
                conn = dict(
                    src_h=ipaddress.ip_address(record['id.orig_h']),
                    src_p=int(record['id.orig_p']),
                    dst_h=ipaddress.ip_address(record['id.resp_h']),
                    dst_p=int(record['id.resp_p']),
                )
            else:
                conn = dict(
                    src_h=ipaddress.ip_address(record['id.resp_h']),
                    src_p=int(record['id.resp_p']),
                    dst_h=ipaddress.ip_address(record['id.orig_h']),
                    dst_p=int(record['id.orig_p']),
                )
            conns.append(conn)

        local_name = line.extracted
        mime_type = None
        dump_path = os.path.join(DUMP_PATH, local_name)
        if os.path.exists(dump_path):
            with contextlib.suppress(Exception):
                mime_type = magic.detect_from_filename(dump_path).mime_type
            # if mime_type is None or MIME_REGEX.match(mime_type) is None:
            #     if MIME_MODE:
            #         local_name = rename_dump(local_name, line.mime_type)
            # else:
            #     if MIME_MODE or (mime_type != line.mime_type):  # pylint: disable=else-if-used
            #         local_name = rename_dump(local_name, mime_type)
        else:
            dump_path = None

        info = dict(timestamp=line.ts
                    if LOG_FILE.format == 'json' else line.ts.timestamp(),
                    log_uuid=log_uuid,
                    log_path=log_root,
                    log_name=log_stem,
                    dump_path=dump_path,
                    local_name=local_name,
                    source_name=getattr(line, 'filename', None),
                    hosts=hosts,
                    conns=conns,
                    bro_mime_type=line.mime_type,
                    real_mime_type=mime_type,
                    hash=dict(
                        md5=getattr(line, 'md5', None),
                        sha1=getattr(line, 'sha1', None),
                        sha256=getattr(line, 'sha256', None),
                    ))
        print_file(json.dumps(info, cls=IPAddressJSONEncoder), file=INFO)
Example 6
def write(self, col, row, data, cell_format):
    if not (data is None or is_inf(data) or is_nan(data)):  # nan, inf, None check
        self.sheet.write(col, row, data, cell_format)
Example 7
def make_b64(data):
    if is_nan(data):
        return None
    return base64.b64encode(data.encode()).decode()
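For illustration, a quick round trip (assuming the is_nan helper sketched after Example 1):

print(make_b64('hello'))         # aGVsbG8=
print(make_b64(float('nan')))    # None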
Example 8
def load_business_data(data_path):
    patterns = {}
    question_full = {}
    df = pd.read_excel(data_path, header=0, sheet_name=u'Working Sheet đợt 1')
    for row in df.iterrows():
        try:
            data = row[1]
            question = unicodedata.normalize('NFKC', data[2])
            response = unicodedata.normalize('NFKC', data[15])
            revise = data[8]
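            # Column 8 acts as a revision marker: when it is empty the original
            # annotation in columns 3-6 is used, otherwise the revised columns 9-12.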

            if utils.is_nan(revise):
                if not utils.is_nan(data[3]):
                    principal_npvp = unicodedata.normalize('NFKC', data[3]).strip()
                    principal_npvp = map(lambda x: x.strip(), principal_npvp.split(u','))
                    principal_npvp = u' '.join(principal_npvp)
                else:
                    principal_npvp = u''

                if not utils.is_nan(data[4]):
                    npvp = unicodedata.normalize('NFKC', data[4]).strip()
                    npvp = map(lambda x: x.strip(), npvp.split(u','))
                    npvp = u' '.join(npvp)
                else:
                    npvp = u''

                if not utils.is_nan(data[5]):
                    verb = unicodedata.normalize('NFKC', data[5]).strip()
                    verb = map(lambda x: x.strip(), verb.split(u','))
                    verb = u' '.join(verb)
                else:
                    verb = u''

                if not utils.is_nan(data[6]):
                    wh_question = unicodedata.normalize('NFKC', data[6]).strip()
                    wh_question = map(lambda x: x.strip(), wh_question.split(u','))
                    wh_question = u' '.join(wh_question)
                else:
                    wh_question = u''
            else:
                if not utils.is_nan(data[9]):
                    principal_npvp = unicodedata.normalize('NFKC', data[9]).strip()
                    principal_npvp = map(lambda x: x.strip(), principal_npvp.split(u','))
                    principal_npvp = u' '.join(principal_npvp)
                else:
                    principal_npvp = u''

                if not utils.is_nan(data[10]):
                    npvp = unicodedata.normalize('NFKC', data[10]).strip()
                    npvp = map(lambda x: x.strip(), npvp.split(u','))
                    npvp = u' '.join(npvp)
                else:
                    npvp = u''

                if not utils.is_nan(data[11]):
                    verb = unicodedata.normalize('NFKC', data[11]).strip()
                    verb = map(lambda x: x.strip(), verb.split(u','))
                    verb = u' '.join(verb)
                else:
                    verb = u''

                if not utils.is_nan(data[12]):
                    wh_question = unicodedata.normalize('NFKC', data[12]).strip()
                    wh_question = map(lambda x: x.strip(), wh_question.split(u','))
                    wh_question = u' '.join(wh_question)
                else:
                    wh_question = u''
        except Exception:  # skip rows with missing or malformed columns
            continue

        question = preprocessing(question, tokenize=False)
        s = normalize_space.sub(u' ', u' '.join([principal_npvp, npvp, verb, wh_question]))
        words = []
        for w in question.lower().split():
            if w not in s:
                continue
            words.append(w)

        pattern = u' '.join(words)
        pattern = preprocessing(pattern, tokenize=False)
        pattern = utils.normalize_abb(pattern)
        # bigram_pattern = utils.get_bigram_content(pattern)
        # pattern = u' '.join([pattern, bigram_pattern])

        # patterns.update({pattern : response})
        # question_full.update({pattern : question})

        if cmnd in pattern:
            pattern = utils.emphasize_token(cmnd, pattern, n=3)
        if cccd in pattern:
            pattern = utils.emphasize_token(cccd, pattern, n=3)

        if pattern == u'':
            continue

        patterns.update({pattern: response})
        question_full.update({pattern: question})

    return patterns, question_full