Example #1
0
def generate_train():
    '''
    Generator over the full training set; wraps one_archive for each of the
    five training archives in order.

    Generates:
      tuples like:
        (filename, label, soup, file size, zip file compressed size)
    '''
    labels = artifacts.get_artifact('train_dict')
    for batch in range(5):
        for record in one_archive(batch, labels):
            yield record
def train_features(outfile):
  '''
  Extract features from the entire training set and write them out as a
  .csv file suitable for loading as a Pandas data frame.

  Args:
    outfile - features are written at paths.PROCESSED/<outfile>_train.csv

  Writes:
    features in .csv format
  '''
  labels = artifacts.get_artifact('train_dict')
  write_features(zip_io.generate_sample(labels), outfile + '_train')
Example #3
0
def train_features(outfile):
    '''
    Read the training set, compute its features, and save them in .csv
    format suitable for loading as a Pandas data frame.

    Args:
      outfile - features are written at paths.PROCESSED/<outfile>_train.csv

    Writes:
      features in .csv format
    '''
    labels = artifacts.get_artifact('train_dict')
    pages = zip_io.generate_sample(labels)
    write_features(pages, outfile + '_train')
Example #4
0
def sample_features(sample_name, outfile):
    '''
    Read a sample of the training set and emit its features in LibSVM
    format.

    Args:
      sample_name - a bare name of a sample file without path or extension
      outfile - features are written at paths.PROCESSED/<outfile>.libsvm

    Writes:
      features in LibSVM format
    '''
    pages = zip_io.generate_sample(artifacts.get_artifact(sample_name))
    write_features(pages, outfile)
def sample_features(sample_name, outfile):
  '''
  Load a training-set sample by name and write its features out in
  LibSVM format.

  Args:
    sample_name - a bare name of a sample file without path or extension
    outfile - features are written at paths.PROCESSED/<outfile>.libsvm

  Writes:
    features in LibSVM format
  '''
  lookup = artifacts.get_artifact(sample_name)
  write_features(zip_io.generate_sample(lookup), outfile)
Example #6
0
def train_features(outfile):
    '''
    Extract features from the full training set, one archive at a time,
    and write each archive's features out in LibSVM format.

    Args:
      outfile - features are written at paths.PROCESSED/<outfile>.<archive>.libsvm

    Writes:
      features in LibSVM format, one batch file per archive
    '''
    labels = artifacts.get_artifact('train_dict')
    for batch in range(5):
        archive = zip_io.one_archive(batch, labels)
        write_features(archive, '%s.%d' % (outfile, batch))
def sample_features(sample_name, outfile):
  '''
  Read a sample of the training set, compute its features, and save them
  in .csv format suitable for loading as a Pandas data frame.

  Args:
    sample_name - a bare name of a sample file without path or extension
    outfile - features are written at paths.PROCESSED/<outfile>.csv

  Writes:
    features in .csv format
  '''
  pages = zip_io.generate_sample(artifacts.get_artifact(sample_name))
  write_features(pages, outfile)
Example #8
0
def sample_features(sample_name, outfile):
    '''
    Load a named sample of the training set, extract features from it,
    and write them out as a .csv file loadable as a Pandas data frame.

    Args:
      sample_name - a bare name of a sample file without path or extension
      outfile - features are written at paths.PROCESSED/<outfile>.csv

    Writes:
      features in .csv format
    '''
    lookup = artifacts.get_artifact(sample_name)
    pages = zip_io.generate_sample(lookup)
    write_features(pages, outfile)
def train_features(outfile):
  '''
  Walk the five training archives, extracting features from each and
  writing them out in LibSVM format, one batch per archive.

  Args:
    outfile - features are written at paths.PROCESSED/<outfile>.<archive>.libsvm

  Writes:
    features in LibSVM format
  '''
  labels = artifacts.get_artifact('train_dict')
  for batch in range(5):
    archive = zip_io.one_archive(batch, labels)
    write_features(archive, '%s.%d' % (outfile, batch))
def load_counts():
  '''
  Load the dict of Counters produced by counts.get_counts (keyed like
  {'type of thing': {'thing': count of thing}}) and threshold it.

  Returns:
    a namedtuple of sets of the items of each type that had a document
    frequency above threshold in the sample
  '''
  counts = artifacts.get_artifact('counts')
  Counters = namedtuple('Counters', counts.keys())
  # Per-counter document-frequency cutoffs; anything not listed uses 4000.
  special = {'tags': 400, 'urls': 400, 'script': 10000}
  for name in counts:
    counter = counts[name]
    cutoff = special.get(name, 4000)
    counts[name] = {item for item in counter if counter[item] > cutoff}
  return Counters(**counts)
Example #11
0
def load_counts():
    '''
    Load the dict of Counters built by counts.get_counts and reduce each
    Counter to the set of keys whose document frequency beats a
    per-counter threshold.

    Returns:
      a namedtuple of sets of the items of each type that had a document
      frequency above threshold in the sample
    '''
    counts = artifacts.get_artifact('counts')
    Counters = namedtuple('Counters', counts.keys())
    # Thresholds differ by counter type; everything else defaults to 4000.
    cutoffs = {'tags': 400, 'urls': 400, 'script': 10000}
    for name in counts:
        counter = counts[name]
        threshold = cutoffs.get(name, 4000)
        counts[name] = {item for item in counter if counter[item] > threshold}
    return Counters(**counts)
Example #12
0
    urls = [u for u in urls if len(u) > 0]
    row['script_urls'] = len(urls)
    row['script_distinct_urls'] = len(set(urls))
    for url in URL_FEATURES:
        key = 'script_url_' + url
        row[key] = sum([url in s for s in urls])
    paths = [urlparse(s).path for s in srcs]
    for path_part in PATH_FEATURES:
        key = 'script_path_' + path_part
        row[key] = sum([path_part in s for s in paths])


if __name__ == '__main__':
    # Command-line entry point: optionally load a sample dict, then write
    # the sample's data matrix out as a .csv file.
    arg_parser = argparse.ArgumentParser(
        description=
        'Write sample of training data as .csv file at paths.ARTIFACTS')
    arg_parser.add_argument(
        'outfile',
        type=str,
        help='Data matrix written at paths/PROCESSED/<outfile>.csv')
    arg_parser.add_argument(
        '--sample',
        type=str,
        help='filename of sample dict at paths/ARTIFACTS')
    cli = arg_parser.parse_args()

    # Without --sample, write_sample receives None and falls back to its
    # default behavior.
    sample_dict = None if cli.sample is None else artifacts.get_artifact(cli.sample)
    write_sample(sample_dict, cli.outfile)
  row['script_max_braces'] = safemax(braces)
  srcs = [tag['src'] for tag in page.select('script[src]')]
  urls = [urlparse(s).netloc for s in srcs]
  urls = [u for u in urls if len(u) > 0]
  row['script_urls'] = len(urls)
  row['script_distinct_urls'] = len(set(urls))
  for url in URL_FEATURES:
    key = 'script_url_' + url
    row[key] = sum([url in s for s in urls])
  paths = [urlparse(s).path for s in srcs]
  for path_part in PATH_FEATURES:
    key = 'script_path_' + path_part
    row[key] = sum([path_part in s for s in paths])


if __name__ == '__main__':
  # Script entry point: parse the output name (and optional sample dict
  # name), then dump the sample's data matrix to .csv.
  cli_parser = argparse.ArgumentParser(description=
             'Write sample of training data as .csv file at paths.ARTIFACTS')
  cli_parser.add_argument('outfile', type=str, help=
           'Data matrix written at paths/PROCESSED/<outfile>.csv')
  cli_parser.add_argument('--sample', type=str, help=
          'filename of sample dict at paths/ARTIFACTS')
  opts = cli_parser.parse_args()

  # A missing --sample means write_sample gets None and uses its default.
  sample_dict = None if opts.sample is None else artifacts.get_artifact(opts.sample)
  write_sample(sample_dict, opts.outfile)

Example #14
0
def get_counts(sample_base):
    """
    Collect document-frequency counts over every file in a sample.

    Counts tags, tag bigrams, attributes, tag-attribute pairs,
    tag-attribute-value tuples, urls, paths, and tokens from script and
    style tags. Each page contributes at most 1 per distinct key, so the
    Counters hold document frequencies rather than raw term frequencies.

    Args:
      sample_base - a bare sample name e.g sample20_20, which would read
          artifact/sample20_20.pkl
    Returns:
      a dict of Counter like {'type of thing': {'thing': count of thing}}
    """
    sample_dict = artifacts.get_artifact(sample_base)
    sample = zip_io.generate_sample(sample_dict)

    # Raw string avoids invalid escape sequences (a SyntaxWarning on newer
    # Pythons); compiled once here instead of re.findall inside the loop.
    token_re = re.compile(r"\W(\w\w+)\W")

    tags = Counter()
    bigrams = Counter()
    attrs = Counter()
    tag_attrs = Counter()
    tag_attr_vals = Counter()
    urls = Counter()
    paths = Counter()
    script = Counter()
    style = Counter()
    ctrs = [tags, bigrams, attrs, tag_attrs, tag_attr_vals, urls, paths, script, style]

    for (k, page_tuple) in enumerate(sample):
        # Index 2 of the sample tuple holds the parsed page (soup)
        # -- presumably matches zip_io's tuple layout; verify there.
        page = page_tuple[2]

        # Per-page sets so each page bumps any given counter key once.
        page_tags = set()
        page_bigrams = set()
        page_attrs = set()
        page_tag_attrs = set()
        page_tag_attr_vals = set()
        page_script = set()
        page_style = set()

        for tag in page.find_all(True):
            page_tags.add(tag.name)
            # Parent_child tag bigrams, direct children only.
            for child in tag.find_all(True, recursive=False):
                key = tag.name + "_" + child.name
                page_bigrams.add(key)
            for a in tag.attrs:
                page_attrs.add(a)
                key = tag.name + "_" + a
                page_tag_attrs.add(key)
                # NOTE: unicode() is Python 2; use str() if porting to Python 3.
                key = key + "_" + unicode(tag.attrs[a])
                page_tag_attr_vals.add(key)
            if tag.name == "script":
                for tok in token_re.findall(tag.get_text()):
                    page_script.add(tok)
            if tag.name == "style":
                for tok in token_re.findall(tag.get_text()):
                    page_style.add(tok)

        # Collect every src/href URL, then split into host and path parts.
        srcs = page.select("[src]")
        hrefs = page.select("[href]")
        all_urls = [tag["src"] for tag in srcs]
        all_urls.extend([tag["href"] for tag in hrefs])
        all_web = []
        all_paths = []
        for u in all_urls:
            try:
                all_web.append(urlparse(u).netloc)
                all_paths.append(urlparse(u).path)
            except ValueError:
                # Malformed URLs are skipped silently (best effort).
                pass
        page_urls = set(all_web)
        page_paths = set(all_paths)

        for key in page_urls:
            urls[key] += 1
        for key in page_paths:
            paths[key] += 1
        for key in page_tags:
            tags[key] += 1
        for key in page_bigrams:
            bigrams[key] += 1
        for key in page_attrs:
            attrs[key] += 1
        for key in page_tag_attrs:
            tag_attrs[key] += 1
        for key in page_tag_attr_vals:
            tag_attr_vals[key] += 1
        for key in page_script:
            script[key] += 1
        for key in page_style:
            style[key] += 1

        # Every 1000 pages, prune singleton keys to bound memory. Iterate a
        # snapshot of the keys so deleting during iteration is safe on
        # Python 3 as well as Python 2.
        if (k + 1) % 1000 == 0:
            for ctr in ctrs:
                for key in list(ctr.keys()):
                    if ctr[key] == 1:
                        del ctr[key]

    out = {
        "tags": tags,
        "bigrams": bigrams,
        "attrs": attrs,
        "tag_attrs": tag_attrs,
        "tag_attr_vals": tag_attr_vals,
        "urls": urls,
        "paths": paths,
        "script": script,
        "style": style,
    }

    return out
Example #15
0
def get_counts(sample_base):
    '''
    Collect document-frequency counts over every file in a sample.

    Counts tags, tag bigrams, attributes, tag-attribute pairs,
    tag-attribute-value tuples, urls, paths, and tokens from script and
    style tags. Each page contributes at most 1 per distinct key, so the
    Counters hold document frequencies rather than raw term frequencies.

    Args:
      sample_base - a bare sample name e.g sample20_20, which would read
          artifact/sample20_20.pkl
    Returns:
      a dict of Counter like {'type of thing': {'thing': count of thing}}
    '''
    sample_dict = artifacts.get_artifact(sample_base)
    sample = zip_io.generate_sample(sample_dict)

    # Raw string avoids invalid escape sequences (a SyntaxWarning on newer
    # Pythons); compiled once here instead of re.findall inside the loop.
    token_re = re.compile(r'\W(\w\w+)\W')

    tags = Counter()
    bigrams = Counter()
    attrs = Counter()
    tag_attrs = Counter()
    tag_attr_vals = Counter()
    urls = Counter()
    paths = Counter()
    script = Counter()
    style = Counter()
    ctrs = [
        tags, bigrams, attrs, tag_attrs, tag_attr_vals, urls, paths, script,
        style
    ]

    for (k, page_tuple) in enumerate(sample):
        # Index 2 of the sample tuple holds the parsed page (soup)
        # -- presumably matches zip_io's tuple layout; verify there.
        page = page_tuple[2]

        # Per-page sets so each page bumps any given counter key once.
        page_tags = set()
        page_bigrams = set()
        page_attrs = set()
        page_tag_attrs = set()
        page_tag_attr_vals = set()
        page_script = set()
        page_style = set()

        for tag in page.find_all(True):
            page_tags.add(tag.name)
            # Parent_child tag bigrams, direct children only.
            for child in tag.find_all(True, recursive=False):
                key = tag.name + '_' + child.name
                page_bigrams.add(key)
            for a in tag.attrs:
                page_attrs.add(a)
                key = tag.name + '_' + a
                page_tag_attrs.add(key)
                # NOTE: unicode() is Python 2; use str() if porting to Python 3.
                key = key + '_' + unicode(tag.attrs[a])
                page_tag_attr_vals.add(key)
            if tag.name == 'script':
                for tok in token_re.findall(tag.get_text()):
                    page_script.add(tok)
            if tag.name == 'style':
                for tok in token_re.findall(tag.get_text()):
                    page_style.add(tok)

        # Collect every src/href URL, then split into host and path parts.
        srcs = page.select('[src]')
        hrefs = page.select('[href]')
        all_urls = [tag['src'] for tag in srcs]
        all_urls.extend([tag['href'] for tag in hrefs])
        all_web = []
        all_paths = []
        for u in all_urls:
            try:
                all_web.append(urlparse(u).netloc)
                all_paths.append(urlparse(u).path)
            except ValueError:
                # Malformed URLs are skipped silently (best effort).
                pass
        page_urls = set(all_web)
        page_paths = set(all_paths)

        for key in page_urls:
            urls[key] += 1
        for key in page_paths:
            paths[key] += 1
        for key in page_tags:
            tags[key] += 1
        for key in page_bigrams:
            bigrams[key] += 1
        for key in page_attrs:
            attrs[key] += 1
        for key in page_tag_attrs:
            tag_attrs[key] += 1
        for key in page_tag_attr_vals:
            tag_attr_vals[key] += 1
        for key in page_script:
            script[key] += 1
        for key in page_style:
            style[key] += 1

        # Every 1000 pages, prune singleton keys to bound memory. Iterate a
        # snapshot of the keys so deleting during iteration is safe on
        # Python 3 as well as Python 2.
        if (k + 1) % 1000 == 0:
            for ctr in ctrs:
                for key in list(ctr.keys()):
                    if ctr[key] == 1:
                        del ctr[key]

    out = {
        'tags': tags,
        'bigrams': bigrams,
        'attrs': attrs,
        'tag_attrs': tag_attrs,
        'tag_attr_vals': tag_attr_vals,
        'urls': urls,
        'paths': paths,
        'script': script,
        'style': style
    }

    return out