def generate_train():
    '''
    Generator over the complete training data set.  See one_archive.

    Generates:
        tuples like (filename, label, soup, file size,
        zip file compressed size)
    '''
    labels = artifacts.get_artifact('train_dict')
    # The training data is split across five zip archives; walk them in order.
    for batch in range(5):
        for record in one_archive(batch, labels):
            yield record
def train_features(outfile):
    '''
    Extracts features from the full training set and writes them out in
    .csv format suitable for loading as a Pandas data frame.

    Args:
        outfile - features are written at paths.PROCESSED/<outfile>_train.csv

    Writes:
        features in .csv format
    '''
    labels = artifacts.get_artifact('train_dict')
    pages = zip_io.generate_sample(labels)
    write_features(pages, outfile + '_train')
def sample_features(sample_name, outfile):
    '''
    Extracts features from a sample of the training set and writes them
    out in LibSVM format.

    Args:
        sample - a bare name of a sample file without path or extension
        outfile - features are written at paths.PROCESSED/<outfile>.libsvm

    Writes:
        features in LibSVM format
    '''
    picks = artifacts.get_artifact(sample_name)
    pages = zip_io.generate_sample(picks)
    write_features(pages, outfile)
def train_features(outfile):
    '''
    Extracts features from the full training set and writes them out in
    LibSVM format, one output batch per training archive.

    Args:
        outfile - features are written at paths.PROCESSED/<outfile>.libsvm

    Writes:
        features in LibSVM format
    '''
    labels = artifacts.get_artifact('train_dict')
    # The training data lives in five zip archives; emit one batch per archive.
    for batch in range(5):
        pages = zip_io.one_archive(batch, labels)
        write_features(pages, '%s.%d' % (outfile, batch))
def sample_features(sample_name, outfile):
    '''
    Extracts features from a sample of the training set and writes them
    out in .csv format suitable for loading as a Pandas data frame.

    Args:
        sample - a bare name of a sample file without path or extension
        outfile - features are written at paths.PROCESSED/<outfile>.csv

    Writes:
        features in .csv format
    '''
    picks = artifacts.get_artifact(sample_name)
    pages = zip_io.generate_sample(picks)
    write_features(pages, outfile)
def load_counts():
    '''
    Loads the dict of Counters produced by counts.get_counts, shaped like
    {'type of thing': {'thing': count of thing}}.

    Returns:
        a namedtuple of sets of the items of each type that had a document
        frequency above threshold in the sample
    '''
    counts = artifacts.get_artifact('counts')
    Counters = namedtuple('Counters', counts.keys())
    # Each feature family gets its own document-frequency cutoff.
    for name in counts:
        ctr = counts[name]
        if name in ('tags', 'urls'):
            threshold = 400
        elif name == 'script':
            threshold = 10000
        else:
            threshold = 4000
        counts[name] = {item for item in ctr if ctr[item] > threshold}
    return Counters(**counts)
urls = [u for u in urls if len(u) > 0] row['script_urls'] = len(urls) row['script_distinct_urls'] = len(set(urls)) for url in URL_FEATURES: key = 'script_url_' + url row[key] = sum([url in s for s in urls]) paths = [urlparse(s).path for s in srcs] for path_part in PATH_FEATURES: key = 'script_path_' + path_part row[key] = sum([path_part in s for s in paths]) if __name__ == '__main__': parser = argparse.ArgumentParser( description= 'Write sample of training data as .csv file at paths.ARTIFACTS') parser.add_argument( 'outfile', type=str, help='Data matrix written at paths/PROCESSED/<outfile>.csv') parser.add_argument('--sample', type=str, help='filename of sample dict at paths/ARTIFACTS') args = parser.parse_args() if args.sample is not None: sample_dict = artifacts.get_artifact(args.sample) write_sample(sample_dict, args.outfile) else: write_sample(None, args.outfile)
row['script_max_braces'] = safemax(braces) srcs = [tag['src'] for tag in page.select('script[src]')] urls = [urlparse(s).netloc for s in srcs] urls = [u for u in urls if len(u) > 0] row['script_urls'] = len(urls) row['script_distinct_urls'] = len(set(urls)) for url in URL_FEATURES: key = 'script_url_' + url row[key] = sum([url in s for s in urls]) paths = [urlparse(s).path for s in srcs] for path_part in PATH_FEATURES: key = 'script_path_' + path_part row[key] = sum([path_part in s for s in paths]) if __name__ == '__main__': parser = argparse.ArgumentParser(description = 'Write sample of training data as .csv file at paths.ARTIFACTS') parser.add_argument('outfile', type=str, help = 'Data matrix written at paths/PROCESSED/<outfile>.csv') parser.add_argument('--sample', type=str, help = 'filename of sample dict at paths/ARTIFACTS') args = parser.parse_args() if args.sample is not None: sample_dict = artifacts.get_artifact(args.sample) write_sample(sample_dict, args.outfile) else: write_sample(None, args.outfile)
def get_counts(sample_base):
    """
    Collect counts of tags, tag bigrams, attributes, tag-attribute pairs,
    tag-attribute-value tuples, urls, paths, and tokens from script and
    style tags for every file in the sample.

    Args:
        sample_base - a bare sample name e.g sample20_20, which would read
            artifact/sample20_20.pkl

    Returns:
        a dict of Counter like {'type of thing': {'thing': count of thing}}
    """
    sample_dict = artifacts.get_artifact(sample_base)
    sample = zip_io.generate_sample(sample_dict)
    tags = Counter()
    bigrams = Counter()
    attrs = Counter()
    tag_attrs = Counter()
    tag_attr_vals = Counter()
    urls = Counter()
    paths = Counter()
    script = Counter()
    style = Counter()
    ctrs = [tags, bigrams, attrs, tag_attrs, tag_attr_vals,
            urls, paths, script, style]
    for doc_num, page_tuple in enumerate(sample):
        page = page_tuple[2]
        # Per-document sets: document frequency is what gets counted, so a
        # distinct item contributes at most 1 per page.
        page_tags = set()
        page_bigrams = set()
        page_attrs = set()
        page_tag_attrs = set()
        page_tag_attr_vals = set()
        page_script = set()
        page_style = set()
        for tag in page.find_all(True):
            page_tags.add(tag.name)
            for child in tag.find_all(True, recursive=False):
                page_bigrams.add(tag.name + "_" + child.name)
            for attr in tag.attrs:
                page_attrs.add(attr)
                pair = tag.name + "_" + attr
                page_tag_attrs.add(pair)
                page_tag_attr_vals.add(pair + "_" + unicode(tag.attrs[attr]))
            if tag.name == "script":
                page_script.update(re.findall("\W(\w\w+)\W", tag.get_text()))
            if tag.name == "style":
                page_style.update(re.findall("\W(\w\w+)\W", tag.get_text()))
        all_urls = [tag["src"] for tag in page.select("[src]")]
        all_urls.extend([tag["href"] for tag in page.select("[href]")])
        all_web = []
        all_paths = []
        for u in all_urls:
            # Malformed URLs can make urlparse raise; skip those values.
            try:
                all_web.append(urlparse(u).netloc)
                all_paths.append(urlparse(u).path)
            except ValueError:
                pass
        page_urls = set(all_web)
        page_paths = set(all_paths)
        page_sets = [page_tags, page_bigrams, page_attrs, page_tag_attrs,
                     page_tag_attr_vals, page_urls, page_paths,
                     page_script, page_style]
        for ctr, seen in zip(ctrs, page_sets):
            for key in seen:
                ctr[key] += 1
        # Every 1000 documents, drop singletons to keep memory bounded.
        if (doc_num + 1) % 1000 == 0:
            for ctr in ctrs:
                for key in [k for k in ctr if ctr[k] == 1]:
                    del ctr[key]
    return {"tags": tags, "bigrams": bigrams, "attrs": attrs,
            "tag_attrs": tag_attrs, "tag_attr_vals": tag_attr_vals,
            "urls": urls, "paths": paths, "script": script, "style": style}
def get_counts(sample_base):
    '''
    Collect counts of tags, tag bigrams, attributes, tag-attribute pairs,
    tag-attribute-value tuples, urls, paths, and tokens from script and
    style tags for every file in the sample.

    Args:
        sample_base - a bare sample name e.g sample20_20, which would read
            artifact/sample20_20.pkl

    Returns:
        a dict of Counter like {'type of thing': {'thing': count of thing}}
    '''
    sample_dict = artifacts.get_artifact(sample_base)
    sample = zip_io.generate_sample(sample_dict)
    feature_names = ['tags', 'bigrams', 'attrs', 'tag_attrs',
                     'tag_attr_vals', 'urls', 'paths', 'script', 'style']
    out = dict((name, Counter()) for name in feature_names)
    for doc_num, page_tuple in enumerate(sample):
        page = page_tuple[2]
        # One set per feature family: each distinct item is counted at most
        # once per document (document frequency, not term frequency).
        seen = dict((name, set()) for name in feature_names)
        for tag in page.find_all(True):
            seen['tags'].add(tag.name)
            for child in tag.find_all(True, recursive=False):
                seen['bigrams'].add(tag.name + '_' + child.name)
            for a in tag.attrs:
                seen['attrs'].add(a)
                pair = tag.name + '_' + a
                seen['tag_attrs'].add(pair)
                seen['tag_attr_vals'].add(pair + '_' + unicode(tag.attrs[a]))
            if tag.name == 'script':
                for tok in re.findall('\W(\w\w+)\W', tag.get_text()):
                    seen['script'].add(tok)
            if tag.name == 'style':
                for tok in re.findall('\W(\w\w+)\W', tag.get_text()):
                    seen['style'].add(tok)
        all_urls = [tag['src'] for tag in page.select('[src]')]
        all_urls.extend([tag['href'] for tag in page.select('[href]')])
        all_web = []
        all_paths = []
        for u in all_urls:
            # urlparse can raise on malformed values; best-effort skip.
            try:
                all_web.append(urlparse(u).netloc)
                all_paths.append(urlparse(u).path)
            except ValueError:
                pass
        seen['urls'] = set(all_web)
        seen['paths'] = set(all_paths)
        for name in feature_names:
            ctr = out[name]
            for key in seen[name]:
                ctr[key] += 1
        # Periodic pruning of singletons bounds memory on large samples.
        if (doc_num + 1) % 1000 == 0:
            for ctr in out.values():
                for key in [k for k in ctr if ctr[k] == 1]:
                    del ctr[key]
    return out