def write_sample(sample_dict, outfile): ''' Reads data from a sample, or all of the test set, extracts features, and writes the features out in .csv format at path.PROCESSED. Args: sample_dict - None, or a dict like {filename: label} for every file in the sample. If None, runs the test set. Use a dict with the full training set to get training features. outfile - just the base, with no path or extension Writes: features in .csv format ''' start = datetime.now() outpath = os.path.join(paths.PROCESSED, outfile + '.csv') if sample_dict is not None: sample = zip_io.generate_sample(sample_dict) else: sample = zip_io.generate_test() fieldnames = ['file', 'sponsored', 'tag_ct', 'head_tag_ct', 'body_tag_ct', 'head_script', 'body_script', 'head_style', 'body_style', 'head_link', 'body_link'] fieldnames.extend(BARE_TAGS) tag_attr_val_names = ['_'.join(s.split()) for s in TAG_ATTR_VAL] fieldnames.extend(tag_attr_val_names) tag_attr_names = ['_'.join(s.split()) for s in TAG_ATTR] fieldnames.extend(tag_attr_names) fieldnames.extend(TEXT_NAMES) fieldnames.extend(SCRIPT_FEATURES) script_url_names = ['script_url_' + url for url in URL_FEATURES] fieldnames.extend(script_url_names) script_path_names = ['script_path_' + p for p in PATH_FEATURES] fieldnames.extend(script_path_names) with open(outpath, 'w') as f_out: writer = DictWriter(f_out, fieldnames=fieldnames) writer.writeheader() for page_tuple in sample: row = {} row['file'] = page_tuple[0] row['sponsored'] = page_tuple[1] page = page_tuple[2] row['tag_ct'] = len(page.select('*')) row['head_tag_ct'] = len(page.select('head *')) row['body_tag_ct'] = len(page.select('body *')) row['head_script'] = len(page.select('head script')) row['body_script'] = len(page.select('body script')) row['head_style'] = len(page.select('head style')) row['body_style'] = len(page.select('body style')) row['head_link'] = len(page.select('head link')) row['body_link'] = len(page.select('body link')) add_bare_tags(row, page) add_tag_attr_vals(row, page) 
add_tag_attr(row, page) text_features(row, page) script_features(row, page) writer.writerow(row) finish = datetime.now() print 'Elapsed time: %d sec.' % (finish - start).seconds
def train_features(outfile):
    '''
    Extract features for the full training set and save them in .csv
    format suitable for loading as a Pandas data frame.

    Args:
        outfile - features are written at paths.PROCESSED/<outfile>_train.csv

    Writes:
        features in .csv format
    '''
    labels = artifacts.get_artifact('train_dict')
    pages = zip_io.generate_sample(labels)
    write_features(pages, outfile + '_train')
def sample_features(sample_name, outfile):
    '''
    Reads data from a sample of the training set and writes the features
    out in LibSVM format.

    NOTE(review): this definition is immediately shadowed by a later,
    identically-named def in this module, so it is dead code as written.
    Its docstring claims LibSVM output, but it delegates to the same
    write_features() helper the .csv variants use -- confirm which format
    write_features actually emits before relying on this.

    Args:
        sample - a bare name of a sample file without path or extension
        outfile - features are written at paths.PROCESSED/<outfile>.libsvm

    Writes:
        features in LibSVM format
    '''
    sample_dict = artifacts.get_artifact(sample_name)
    sample = zip_io.generate_sample(sample_dict)
    write_features(sample, outfile)
def sample_features(sample_name, outfile):
    '''
    Reads a sample of the training set, extracts features from it, and
    writes the features out in .csv format suitable for loading as a
    Pandas data frame.

    Args:
        sample - a bare name of a sample file without path or extension
        outfile - features are written at paths.PROCESSED/<outfile>.csv

    Writes:
        features in .csv format
    '''
    sample_labels = artifacts.get_artifact(sample_name)
    write_features(zip_io.generate_sample(sample_labels), outfile)
def write_sample(sample_dict, outfile): ''' Reads data from a sample, or all of the test set, extracts features, and writes the features out in .csv format at path.PROCESSED. Args: sample_dict - None, or a dict like {filename: label} for every file in the sample. If None, runs the test set. Use a dict with the full training set to get training features. outfile - just the base, with no path or extension Writes: features in .csv format ''' start = datetime.now() outpath = os.path.join(paths.PROCESSED, outfile + '.csv') if sample_dict is not None: sample = zip_io.generate_sample(sample_dict) else: sample = zip_io.generate_test() fieldnames = [ 'file', 'sponsored', 'tag_ct', 'head_tag_ct', 'body_tag_ct', 'head_script', 'body_script', 'head_style', 'body_style', 'head_link', 'body_link' ] fieldnames.extend(BARE_TAGS) tag_attr_val_names = ['_'.join(s.split()) for s in TAG_ATTR_VAL] fieldnames.extend(tag_attr_val_names) tag_attr_names = ['_'.join(s.split()) for s in TAG_ATTR] fieldnames.extend(tag_attr_names) fieldnames.extend(TEXT_NAMES) fieldnames.extend(SCRIPT_FEATURES) script_url_names = ['script_url_' + url for url in URL_FEATURES] fieldnames.extend(script_url_names) script_path_names = ['script_path_' + p for p in PATH_FEATURES] fieldnames.extend(script_path_names) with open(outpath, 'w') as f_out: writer = DictWriter(f_out, fieldnames=fieldnames) writer.writeheader() for page_tuple in sample: row = {} row['file'] = page_tuple[0] row['sponsored'] = page_tuple[1] page = page_tuple[2] row['tag_ct'] = len(page.select('*')) row['head_tag_ct'] = len(page.select('head *')) row['body_tag_ct'] = len(page.select('body *')) row['head_script'] = len(page.select('head script')) row['body_script'] = len(page.select('body script')) row['head_style'] = len(page.select('head style')) row['body_style'] = len(page.select('body style')) row['head_link'] = len(page.select('head link')) row['body_link'] = len(page.select('body link')) add_bare_tags(row, page) add_tag_attr_vals(row, 
page) add_tag_attr(row, page) text_features(row, page) script_features(row, page) writer.writerow(row) finish = datetime.now() print 'Elapsed time: %d sec.' % (finish - start).seconds
def get_counts(sample_base):
    """
    Collect counts of tags, tag bigrams, attributes, tag-attribute pairs,
    tag-attribute-value tuples, urls, paths, and tokens from script and
    style tags for every file in the sample. Keys are counted at most once
    per page, so each counter holds document frequencies.

    Args:
        sample_base - a bare sample name e.g sample20_20, which would read
            artifact/sample20_20.pkl

    Returns:
        a dict of Counter like {'type of thing': {'thing': count of thing}}
    """
    sample_dict = artifacts.get_artifact(sample_base)
    sample = zip_io.generate_sample(sample_dict)

    tags = Counter()
    bigrams = Counter()
    attrs = Counter()
    tag_attrs = Counter()
    tag_attr_vals = Counter()
    urls = Counter()
    # NOTE: this local shadows the module-level `paths` import inside
    # this function; the name is kept for the returned dict's sake.
    paths = Counter()
    script = Counter()
    style = Counter()
    ctrs = [tags, bigrams, attrs, tag_attrs, tag_attr_vals,
            urls, paths, script, style]

    # Compile once instead of re-running re.findall's cache lookup per tag.
    # Tokens are >=2 word chars bounded by non-word chars (tokens flush at
    # the very start/end of the text are not matched).
    token_re = re.compile(r'\W(\w\w+)\W')

    for (k, page_tuple) in enumerate(sample):
        page = page_tuple[2]
        # Per-page sets so each key increments its counter at most once
        # per page.
        page_tags = set()
        page_bigrams = set()
        page_attrs = set()
        page_tag_attrs = set()
        page_tag_attr_vals = set()
        page_script = set()
        page_style = set()
        for tag in page.find_all(True):
            page_tags.add(tag.name)
            for child in tag.find_all(True, recursive=False):
                page_bigrams.add(tag.name + "_" + child.name)
            for a in tag.attrs:
                page_attrs.add(a)
                key = tag.name + "_" + a
                page_tag_attrs.add(key)
                page_tag_attr_vals.add(key + "_" + unicode(tag.attrs[a]))
            if tag.name == "script":
                page_script.update(token_re.findall(tag.get_text()))
            if tag.name == "style":
                page_style.update(token_re.findall(tag.get_text()))
        # Gather every src/href URL, then split into hosts and paths.
        all_urls = [tag["src"] for tag in page.select("[src]")]
        all_urls.extend(tag["href"] for tag in page.select("[href]"))
        page_urls = set()
        page_paths = set()
        for u in all_urls:
            try:
                parsed = urlparse(u)  # parse once; was parsed twice per URL
            except ValueError:
                continue  # malformed URL: skip both pieces consistently
            page_urls.add(parsed.netloc)
            page_paths.add(parsed.path)
        for ctr, page_set in [(tags, page_tags), (bigrams, page_bigrams),
                              (attrs, page_attrs),
                              (tag_attrs, page_tag_attrs),
                              (tag_attr_vals, page_tag_attr_vals),
                              (urls, page_urls), (paths, page_paths),
                              (script, page_script), (style, page_style)]:
            for key in page_set:
                ctr[key] += 1
        # Every 1000 pages, drop singleton keys to bound memory. list(...)
        # makes the deletion-while-iterating safe on both Python 2 and 3.
        if (k + 1) % 1000 == 0:
            for ctr in ctrs:
                for key in list(ctr.keys()):
                    if ctr[key] == 1:
                        del ctr[key]
    out = {
        "tags": tags,
        "bigrams": bigrams,
        "attrs": attrs,
        "tag_attrs": tag_attrs,
        "tag_attr_vals": tag_attr_vals,
        "urls": urls,
        "paths": paths,
        "script": script,
        "style": style,
    }
    return out
def get_counts(sample_base):
    '''
    Tally, over every file in a sample, how many pages contain each tag,
    tag bigram, attribute, tag-attribute pair, tag-attribute-value tuple,
    url, path, and script/style token.

    Args:
        sample_base - a bare sample name e.g sample20_20, which would read
            artifact/sample20_20.pkl

    Returns:
        a dict of Counter like {'type of thing': {'thing': count of thing}}
    '''
    sample_dict = artifacts.get_artifact(sample_base)
    sample = zip_io.generate_sample(sample_dict)

    names = ['tags', 'bigrams', 'attrs', 'tag_attrs', 'tag_attr_vals',
             'urls', 'paths', 'script', 'style']
    counters = dict((name, Counter()) for name in names)

    for page_num, page_tuple in enumerate(sample):
        doc = page_tuple[2]
        # Per-page sets: each key counts at most once per page.
        seen = dict((name, set()) for name in names)
        for el in doc.find_all(True):
            seen['tags'].add(el.name)
            for kid in el.find_all(True, recursive=False):
                seen['bigrams'].add(el.name + '_' + kid.name)
            for attr_name in el.attrs:
                seen['attrs'].add(attr_name)
                pair = el.name + '_' + attr_name
                seen['tag_attrs'].add(pair)
                seen['tag_attr_vals'].add(
                    pair + '_' + unicode(el.attrs[attr_name]))
            if el.name == 'script':
                for tok in re.findall('\W(\w\w+)\W', el.get_text()):
                    seen['script'].add(tok)
            if el.name == 'style':
                for tok in re.findall('\W(\w\w+)\W', el.get_text()):
                    seen['style'].add(tok)
        # Split every src/href URL into its host and path pieces.
        candidates = [el['src'] for el in doc.select('[src]')]
        candidates.extend(el['href'] for el in doc.select('[href]'))
        netlocs = []
        url_paths = []
        for u in candidates:
            try:
                netlocs.append(urlparse(u).netloc)
                url_paths.append(urlparse(u).path)
            except ValueError:
                pass
        seen['urls'] = set(netlocs)
        seen['paths'] = set(url_paths)
        for name in names:
            ctr = counters[name]
            for key in seen[name]:
                ctr[key] += 1
        # Every 1000 pages, prune keys seen on only one page so the
        # counters do not grow without bound.
        if (page_num + 1) % 1000 == 0:
            for ctr in counters.values():
                for key in ctr.keys():
                    if ctr[key] == 1:
                        del ctr[key]
    return counters