def gen_docs(jsons: [{}], char_wb: bool = False, add_taint: bool = False) -> [Learner.LabelledDocs]:
    """
    Turn flow records into labelled documents, one per flow, in input order.

    For every flow the URL is reduced to its words via
    ``Analyzer.filter_url_words``; the resulting text, both labels and the
    numeric feature values are wrapped in a ``Learner.LabelledDocs``.

    :param jsons: The flow records. Each one is read for the keys ``'url'``,
        ``'taint'``, ``'label'``, ``'real_label'`` and every name listed in
        ``Analyzer.numeric_features``.
    :param char_wb: Passed through unchanged to ``Learner.LabelledDocs``.
    :param add_taint: When true, append ``'t_' + flow['taint']`` to the
        document text as an extra space-separated token.
    :return: The list of ``Learner.LabelledDocs`` built from ``jsons``.
    """
    feature_names = Analyzer.numeric_features
    labelled = []
    multi_taint_flows = 0

    for record in jsons:
        text = Analyzer.filter_url_words(record['url'])

        # An underscore in the taint field is what the summary log below
        # reports as "more than 1 taints" (presumably '_' joins several
        # taint names -- verify against the producer of these records).
        if '_' in record['taint']:
            multi_taint_flows += 1

        if add_taint:
            text = text + ' t_' + record['taint']

        # Labels arrive as strings; only the exact string '1' counts as positive.
        train_label = int(record['label'] == '1')
        true_label = int(record['real_label'] == '1')

        if train_label != true_label:
            logger.info(
                "Flow's real label does not match the training label for %s, real_label = %d label = %d",
                record['url'], true_label, train_label)

        numeric_values = [record[key] for key in feature_names]
        doc = Learner.LabelledDocs(text, train_label, numeric_values, true_label, char_wb=char_wb)
        labelled.append(doc)

    logger.info('The number of flows who have more than 1 taints: %d', multi_taint_flows)
    return labelled