def write_sample(sample_dict, outfile):
  '''
  Reads data from a sample, or all of the test set, extracts features,
  and writes the features out in .csv format at paths.PROCESSED.

  Args:
    sample_dict - None, or a dict like {filename: label} for every file
        in the sample. If None, runs the test set. Use a dict with the
        full training set to get training features.
    outfile - just the base, with no path or extension

  Writes:
    features in .csv format
  '''
  start = datetime.now()
  outpath = os.path.join(paths.PROCESSED, outfile + '.csv')
  # None selects the test set; a dict selects a (sub)sample of training data.
  if sample_dict is not None:
    sample = zip_io.generate_sample(sample_dict)
  else:
    sample = zip_io.generate_test()
  # Fixed structural columns first, then the configurable feature families.
  fieldnames = ['file', 'sponsored', 'tag_ct', 'head_tag_ct', 'body_tag_ct',
                'head_script', 'body_script', 'head_style', 'body_style',
                'head_link', 'body_link']
  fieldnames.extend(BARE_TAGS)
  # 'div class foo' -> 'div_class_foo' so names are safe csv headers.
  fieldnames.extend('_'.join(s.split()) for s in TAG_ATTR_VAL)
  fieldnames.extend('_'.join(s.split()) for s in TAG_ATTR)
  fieldnames.extend(TEXT_NAMES)
  fieldnames.extend(SCRIPT_FEATURES)
  fieldnames.extend('script_url_' + url for url in URL_FEATURES)
  fieldnames.extend('script_path_' + p for p in PATH_FEATURES)
  with open(outpath, 'w') as f_out:
    writer = DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()
    for page_tuple in sample:
      # page_tuple is (filename, label, parsed page).
      row = {}
      row['file'] = page_tuple[0]
      row['sponsored'] = page_tuple[1]
      page = page_tuple[2]
      row['tag_ct'] = len(page.select('*'))
      row['head_tag_ct'] = len(page.select('head *'))
      row['body_tag_ct'] = len(page.select('body *'))
      row['head_script'] = len(page.select('head script'))
      row['body_script'] = len(page.select('body script'))
      row['head_style'] = len(page.select('head style'))
      row['body_style'] = len(page.select('body style'))
      row['head_link'] = len(page.select('head link'))
      row['body_link'] = len(page.select('body link'))
      add_bare_tags(row, page)
      add_tag_attr_vals(row, page)
      add_tag_attr(row, page)
      text_features(row, page)
      script_features(row, page)
      writer.writerow(row)
  finish = datetime.now()
  # print as a function with one argument is valid on Python 2 and 3,
  # unlike the original Python-2-only print statement.
  print('Elapsed time: %d sec.' % (finish - start).seconds)
def train_features(outfile):
  '''
  Extract features from the full training set and save them as a .csv
  file suitable for loading as a Pandas data frame.

  Args:
    outfile - features are written at paths.PROCESSED/<outfile>_train.csv

  Writes:
    features in .csv format
  '''
  labels = artifacts.get_artifact('train_dict')
  pages = zip_io.generate_sample(labels)
  write_features(pages, outfile + '_train')
# Exemple #3
# 0
def train_features(outfile):
    '''
    Build the feature .csv for the whole training set, in a form that
    loads directly as a Pandas data frame.

    Args:
      outfile - features are written at paths.PROCESSED/<outfile>_train.csv

    Writes:
      features in .csv format
    '''
    write_features(
        zip_io.generate_sample(artifacts.get_artifact('train_dict')),
        outfile + '_train')
# Exemple #4
# 0
def sample_features(sample_name, outfile):
    '''
    Read a sample of the training set and write its features out in
    LibSVM format.

    Args:
      sample_name - a bare name of a sample file without path or extension
      outfile - features are written at paths.PROCESSED/<outfile>.libsvm

    Writes:
      features in LibSVM format
    '''
    labels = artifacts.get_artifact(sample_name)
    write_features(zip_io.generate_sample(labels), outfile)
def sample_features(sample_name, outfile):
  '''
  Extract features from a named training-set sample and store them
  in LibSVM format.

  Args:
    sample_name - a bare name of a sample file without path or extension
    outfile - features are written at paths.PROCESSED/<outfile>.libsvm

  Writes:
    features in LibSVM format
  '''
  pages = zip_io.generate_sample(artifacts.get_artifact(sample_name))
  write_features(pages, outfile)
def sample_features(sample_name, outfile):
  '''
  Pull a sample of the training set, compute its features, and save
  them as a .csv file that loads cleanly as a Pandas data frame.

  Args:
    sample_name - a bare name of a sample file without path or extension
    outfile - features are written at paths.PROCESSED/<outfile>.csv

  Writes:
    features in .csv format
  '''
  sample_labels = artifacts.get_artifact(sample_name)
  write_features(zip_io.generate_sample(sample_labels), outfile)
# Exemple #7
# 0
def sample_features(sample_name, outfile):
    '''
    Read a training-set sample, derive its features, and write them out
    in .csv format suitable for loading as a Pandas data frame.

    Args:
      sample_name - a bare name of a sample file without path or extension
      outfile - features are written at paths.PROCESSED/<outfile>.csv

    Writes:
      features in .csv format
    '''
    sample = zip_io.generate_sample(artifacts.get_artifact(sample_name))
    write_features(sample, outfile)
# Exemple #8
# 0
def write_sample(sample_dict, outfile):
    '''
    Reads data from a sample, or all of the test set, extracts features,
    and writes the features out in .csv format at paths.PROCESSED.

    Args:
      sample_dict - None, or a dict like {filename: label} for every file
          in the sample. If None, runs the test set. Use a dict with the
          full training set to get training features.
      outfile - just the base, with no path or extension

    Writes:
      features in .csv format
    '''
    start = datetime.now()
    outpath = os.path.join(paths.PROCESSED, outfile + '.csv')
    # None selects the test set; a dict selects (sub)sampled training data.
    if sample_dict is not None:
        sample = zip_io.generate_sample(sample_dict)
    else:
        sample = zip_io.generate_test()
    # Fixed structural columns first, then the configurable feature families.
    fieldnames = [
        'file', 'sponsored', 'tag_ct', 'head_tag_ct', 'body_tag_ct',
        'head_script', 'body_script', 'head_style', 'body_style', 'head_link',
        'body_link'
    ]
    fieldnames.extend(BARE_TAGS)
    # 'div class foo' -> 'div_class_foo' so names are safe csv headers.
    fieldnames.extend('_'.join(s.split()) for s in TAG_ATTR_VAL)
    fieldnames.extend('_'.join(s.split()) for s in TAG_ATTR)
    fieldnames.extend(TEXT_NAMES)
    fieldnames.extend(SCRIPT_FEATURES)
    fieldnames.extend('script_url_' + url for url in URL_FEATURES)
    fieldnames.extend('script_path_' + p for p in PATH_FEATURES)
    with open(outpath, 'w') as f_out:
        writer = DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()
        for page_tuple in sample:
            # page_tuple is (filename, label, parsed page).
            row = {}
            row['file'] = page_tuple[0]
            row['sponsored'] = page_tuple[1]
            page = page_tuple[2]
            row['tag_ct'] = len(page.select('*'))
            row['head_tag_ct'] = len(page.select('head *'))
            row['body_tag_ct'] = len(page.select('body *'))
            row['head_script'] = len(page.select('head script'))
            row['body_script'] = len(page.select('body script'))
            row['head_style'] = len(page.select('head style'))
            row['body_style'] = len(page.select('body style'))
            row['head_link'] = len(page.select('head link'))
            row['body_link'] = len(page.select('body link'))
            add_bare_tags(row, page)
            add_tag_attr_vals(row, page)
            add_tag_attr(row, page)
            text_features(row, page)
            script_features(row, page)
            writer.writerow(row)
    finish = datetime.now()
    # print as a function with one argument is valid on Python 2 and 3,
    # unlike the original Python-2-only print statement.
    print('Elapsed time: %d sec.' % (finish - start).seconds)
def get_counts(sample_base):
    """
    Collect counts of tags, tag bigrams, attributes, tag-attribute pairs,
    tag-attribute-value tuples, urls, paths, and tokens from script and
    style tags for every file in the sample.

    Counts are document frequencies: each key is collected into a per-page
    set first, so it is counted at most once per page.

    Args:
      sample_base - a bare sample name e.g sample20_20, which would read
          artifact/sample20_20.pkl
    Returns:
      a dict of Counter like {'type of thing': {'thing': count of thing}}
    """
    sample_dict = artifacts.get_artifact(sample_base)
    sample = zip_io.generate_sample(sample_dict)

    # Tokens of 2+ word characters bounded by non-word characters.
    # Compiled once here instead of re.findall per tag; raw string avoids
    # relying on '\W' surviving normal-string escape handling.
    token_re = re.compile(r"\W(\w\w+)\W")

    tags = Counter()
    bigrams = Counter()
    attrs = Counter()
    tag_attrs = Counter()
    tag_attr_vals = Counter()
    urls = Counter()
    paths = Counter()
    script = Counter()
    style = Counter()
    ctrs = [tags, bigrams, attrs, tag_attrs, tag_attr_vals, urls, paths, script, style]

    for (k, page_tuple) in enumerate(sample):
        page = page_tuple[2]

        page_tags = set()
        page_bigrams = set()
        page_attrs = set()
        page_tag_attrs = set()
        page_tag_attr_vals = set()
        page_urls = set()
        page_paths = set()
        page_script = set()
        page_style = set()

        for tag in page.find_all(True):
            page_tags.add(tag.name)
            for child in tag.find_all(True, recursive=False):
                page_bigrams.add(tag.name + "_" + child.name)
            for a in tag.attrs:
                page_attrs.add(a)
                key = tag.name + "_" + a
                page_tag_attrs.add(key)
                # u"%s" yields unicode on Python 2 and str on Python 3;
                # the bare unicode() builtin only exists on Python 2.
                page_tag_attr_vals.add(key + "_" + u"%s" % (tag.attrs[a],))
            if tag.name == "script":
                page_script.update(token_re.findall(tag.get_text()))
            if tag.name == "style":
                page_style.update(token_re.findall(tag.get_text()))

        all_urls = [tag["src"] for tag in page.select("[src]")]
        all_urls.extend(tag["href"] for tag in page.select("[href]"))
        for u in all_urls:
            try:
                # Parse once per url (was parsed twice before); a malformed
                # url is skipped entirely.
                parsed = urlparse(u)
            except ValueError:
                continue
            page_urls.add(parsed.netloc)
            page_paths.add(parsed.path)

        # Counter.update on a set adds 1 for each member.
        urls.update(page_urls)
        paths.update(page_paths)
        tags.update(page_tags)
        bigrams.update(page_bigrams)
        attrs.update(page_attrs)
        tag_attrs.update(page_tag_attrs)
        tag_attr_vals.update(page_tag_attr_vals)
        script.update(page_script)
        style.update(page_style)

        # Every 1000 pages, drop singleton keys to bound memory. The list()
        # snapshot is required: deleting while iterating a live keys view
        # raises RuntimeError on Python 3.
        if (k + 1) % 1000 == 0:
            for ctr in ctrs:
                for key in list(ctr.keys()):
                    if ctr[key] == 1:
                        del ctr[key]

    out = {
        "tags": tags,
        "bigrams": bigrams,
        "attrs": attrs,
        "tag_attrs": tag_attrs,
        "tag_attr_vals": tag_attr_vals,
        "urls": urls,
        "paths": paths,
        "script": script,
        "style": style,
    }

    return out
# Exemple #10
# 0
def get_counts(sample_base):
    '''
    Collect counts of tags, tag bigrams, attributes, tag-attribute pairs,
    tag-attribute-value tuples, urls, paths, and tokens from script and
    style tags for every file in the sample.

    Counts are document frequencies: each key is collected into a per-page
    set first, so it is counted at most once per page.

    Args:
      sample_base - a bare sample name e.g sample20_20, which would read
          artifact/sample20_20.pkl
    Returns:
      a dict of Counter like {'type of thing': {'thing': count of thing}}
    '''
    sample_dict = artifacts.get_artifact(sample_base)
    sample = zip_io.generate_sample(sample_dict)

    # Tokens of 2+ word characters bounded by non-word characters.
    # Compiled once here instead of re.findall per tag; raw string avoids
    # relying on '\W' surviving normal-string escape handling.
    token_re = re.compile(r'\W(\w\w+)\W')

    tags = Counter()
    bigrams = Counter()
    attrs = Counter()
    tag_attrs = Counter()
    tag_attr_vals = Counter()
    urls = Counter()
    paths = Counter()
    script = Counter()
    style = Counter()
    ctrs = [
        tags, bigrams, attrs, tag_attrs, tag_attr_vals, urls, paths, script,
        style
    ]

    for (k, page_tuple) in enumerate(sample):
        page = page_tuple[2]

        page_tags = set()
        page_bigrams = set()
        page_attrs = set()
        page_tag_attrs = set()
        page_tag_attr_vals = set()
        page_urls = set()
        page_paths = set()
        page_script = set()
        page_style = set()

        for tag in page.find_all(True):
            page_tags.add(tag.name)
            for child in tag.find_all(True, recursive=False):
                page_bigrams.add(tag.name + '_' + child.name)
            for a in tag.attrs:
                page_attrs.add(a)
                key = tag.name + '_' + a
                page_tag_attrs.add(key)
                # u'%s' yields unicode on Python 2 and str on Python 3;
                # the bare unicode() builtin only exists on Python 2.
                page_tag_attr_vals.add(key + '_' + u'%s' % (tag.attrs[a],))
            if tag.name == 'script':
                page_script.update(token_re.findall(tag.get_text()))
            if tag.name == 'style':
                page_style.update(token_re.findall(tag.get_text()))

        all_urls = [tag['src'] for tag in page.select('[src]')]
        all_urls.extend(tag['href'] for tag in page.select('[href]'))
        for u in all_urls:
            try:
                # Parse once per url (was parsed twice before); a malformed
                # url is skipped entirely.
                parsed = urlparse(u)
            except ValueError:
                continue
            page_urls.add(parsed.netloc)
            page_paths.add(parsed.path)

        # Counter.update on a set adds 1 for each member.
        urls.update(page_urls)
        paths.update(page_paths)
        tags.update(page_tags)
        bigrams.update(page_bigrams)
        attrs.update(page_attrs)
        tag_attrs.update(page_tag_attrs)
        tag_attr_vals.update(page_tag_attr_vals)
        script.update(page_script)
        style.update(page_style)

        # Every 1000 pages, drop singleton keys to bound memory. The list()
        # snapshot is required: deleting while iterating a live keys view
        # raises RuntimeError on Python 3.
        if (k + 1) % 1000 == 0:
            for ctr in ctrs:
                for key in list(ctr.keys()):
                    if ctr[key] == 1:
                        del ctr[key]

    out = {
        'tags': tags,
        'bigrams': bigrams,
        'attrs': attrs,
        'tag_attrs': tag_attrs,
        'tag_attr_vals': tag_attr_vals,
        'urls': urls,
        'paths': paths,
        'script': script,
        'style': style
    }

    return out