def write_sample(sample_dict, outfile):
  '''
  Reads data from a sample, or all of the test set, extracts features, 
  and writes the features out in .csv format under paths.PROCESSED.
  
  Args:
    sample_dict - None, or a dict like {filename: label} for every file 
        in the sample. If None, runs the test set. Use a dict with the 
        full training set to get training features.
    outfile - just the base, with no path or extension
    
  Writes:
    features in .csv format at paths.PROCESSED/<outfile>.csv
    
  Prints:
    the elapsed wall-clock time in whole seconds
  '''
  start = datetime.now()
  outpath = os.path.join(paths.PROCESSED, outfile + '.csv')
  if sample_dict is not None:
    sample = zip_io.generate_sample(sample_dict)
  else:
    sample = zip_io.generate_test()

  # (column name, CSS selector) for every plain element-count feature. 
  # Keeping both in one table means the header and the row values cannot 
  # drift apart.
  count_selectors = [
      ('tag_ct', '*'),
      ('head_tag_ct', 'head *'),
      ('body_tag_ct', 'body *'),
      ('head_script', 'head script'),
      ('body_script', 'body script'),
      ('head_style', 'head style'),
      ('body_style', 'body style'),
      ('head_link', 'head link'),
      ('body_link', 'body link'),
  ]

  # Column order of the output file.
  fieldnames = ['file', 'sponsored']
  fieldnames.extend(column for column, _ in count_selectors)
  fieldnames.extend(BARE_TAGS)
  # Whitespace inside these feature specs becomes '_' in the column name.
  fieldnames.extend('_'.join(s.split()) for s in TAG_ATTR_VAL)
  fieldnames.extend('_'.join(s.split()) for s in TAG_ATTR)
  fieldnames.extend(TEXT_NAMES)
  fieldnames.extend(SCRIPT_FEATURES)
  fieldnames.extend('script_url_' + url for url in URL_FEATURES)
  fieldnames.extend('script_path_' + p for p in PATH_FEATURES)

  # NOTE(review): the csv docs recommend mode 'wb' on Python 2 and 
  # newline='' on Python 3 to avoid blank lines on Windows; the mode is 
  # left as 'w' here because the right choice depends on the interpreter.
  with open(outpath, 'w') as f_out:
    writer = DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()
    for page_tuple in sample:
      # Each item is indexed as (file name, label, parsed page).
      page = page_tuple[2]
      row = {'file': page_tuple[0], 'sponsored': page_tuple[1]}
      for column, selector in count_selectors:
        row[column] = len(page.select(selector))
      # The helpers below fill in further columns of row in place.
      add_bare_tags(row, page)
      add_tag_attr_vals(row, page)
      add_tag_attr(row, page)
      text_features(row, page)
      script_features(row, page)
      writer.writerow(row)

  # total_seconds() includes whole days; timedelta.seconds alone would 
  # wrap around for runs longer than 24 hours.
  elapsed = datetime.now() - start
  print('Elapsed time: %d sec.' % elapsed.total_seconds())
def test_features(outfile):
  '''
  Runs feature extraction over the test set and writes the result as a 
  .csv file that can be loaded as a Pandas data frame.
  
  Args:
    outfile - base name of the output; '_test' is appended, so the 
        features are written at paths.PROCESSED/<outfile>_test.csv
    
  Writes:
    features in .csv format
  '''
  test_pages = zip_io.generate_test()
  out_base = outfile + '_test'
  write_features(test_pages, out_base)
Example #3
0
def test_features(outfile):
    '''
    Runs feature extraction over the test set and writes the result as a
    .csv file that can be loaded as a Pandas data frame.

    Args:
      outfile - base name of the output; '_test' is appended, so the
          features are written at paths.PROCESSED/<outfile>_test.csv

    Writes:
      features in .csv format
    '''
    test_pages = zip_io.generate_test()
    out_base = outfile + '_test'
    write_features(test_pages, out_base)
Example #4
0
def test_features(outfile):
    '''
    Runs feature extraction over the test set and writes the result in
    LibSVM format.

    Args:
      outfile - base name shared with the training files; '.5' is
          appended before it is handed to write_features. The output
          presumably lands at paths.PROCESSED/<outfile>.5.libsvm -
          confirm against write_features.

    Writes:
      features in LibSVM format
    '''
    test_pages = zip_io.generate_test()
    # Suffix '.5' marks the test split: the training data uses
    # base.0 through base.4, so test can share the same base name.
    out_base = outfile + '.5'
    write_features(test_pages, out_base)
def test_features(outfile):
  '''
  Runs feature extraction over the test set and writes the result in 
  LibSVM format.
  
  Args:
    outfile - base name shared with the training files; '.5' is 
        appended before it is handed to write_features. The output 
        presumably lands at paths.PROCESSED/<outfile>.5.libsvm - 
        confirm against write_features.
    
  Writes:
    features in LibSVM format
  '''
  test_pages = zip_io.generate_test()
  # Suffix '.5' marks the test split: the training data uses 
  # base.0 through base.4, so test can share the same base name.
  out_base = outfile + '.5'
  write_features(test_pages, out_base)
Example #6
0
def write_sample(sample_dict, outfile):
    '''
    Reads data from a sample, or all of the test set, extracts features,
    and writes the features out in .csv format under paths.PROCESSED.

    Args:
      sample_dict - None, or a dict like {filename: label} for every file
          in the sample. If None, runs the test set. Use a dict with the
          full training set to get training features.
      outfile - just the base, with no path or extension

    Writes:
      features in .csv format at paths.PROCESSED/<outfile>.csv

    Prints:
      the elapsed wall-clock time in whole seconds
    '''
    start = datetime.now()
    outpath = os.path.join(paths.PROCESSED, outfile + '.csv')
    if sample_dict is not None:
        sample = zip_io.generate_sample(sample_dict)
    else:
        sample = zip_io.generate_test()

    # (column name, CSS selector) for every plain element-count feature.
    # Keeping both in one table means the header and the row values
    # cannot drift apart.
    count_selectors = [
        ('tag_ct', '*'),
        ('head_tag_ct', 'head *'),
        ('body_tag_ct', 'body *'),
        ('head_script', 'head script'),
        ('body_script', 'body script'),
        ('head_style', 'head style'),
        ('body_style', 'body style'),
        ('head_link', 'head link'),
        ('body_link', 'body link'),
    ]

    # Column order of the output file.
    fieldnames = ['file', 'sponsored']
    fieldnames.extend(column for column, _ in count_selectors)
    fieldnames.extend(BARE_TAGS)
    # Whitespace inside these feature specs becomes '_' in the column name.
    fieldnames.extend('_'.join(s.split()) for s in TAG_ATTR_VAL)
    fieldnames.extend('_'.join(s.split()) for s in TAG_ATTR)
    fieldnames.extend(TEXT_NAMES)
    fieldnames.extend(SCRIPT_FEATURES)
    fieldnames.extend('script_url_' + url for url in URL_FEATURES)
    fieldnames.extend('script_path_' + p for p in PATH_FEATURES)

    # NOTE(review): the csv docs recommend mode 'wb' on Python 2 and
    # newline='' on Python 3 to avoid blank lines on Windows; the mode is
    # left as 'w' here because the right choice depends on the interpreter.
    with open(outpath, 'w') as f_out:
        writer = DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()
        for page_tuple in sample:
            # Each item is indexed as (file name, label, parsed page).
            page = page_tuple[2]
            row = {'file': page_tuple[0], 'sponsored': page_tuple[1]}
            for column, selector in count_selectors:
                row[column] = len(page.select(selector))
            # The helpers below fill in further columns of row in place.
            add_bare_tags(row, page)
            add_tag_attr_vals(row, page)
            add_tag_attr(row, page)
            text_features(row, page)
            script_features(row, page)
            writer.writerow(row)

    # total_seconds() includes whole days; timedelta.seconds alone would
    # wrap around for runs longer than 24 hours.
    elapsed = datetime.now() - start
    print('Elapsed time: %d sec.' % elapsed.total_seconds())