def _df_to_libffm(self, out_file_or_y=None, y=None):
  out_file = out_file_or_y if type(out_file_or_y) is str else None  
  if y is None and out_file_or_y is not None and out_file is None: 
    y = out_file_or_y
  if hasattr(y, 'values'): y = y.values
  
  lines = []    
  outfile = None
  if out_file is not None: outfile = utils.get_write_file_stream(out_file)
  categoricals = self.categoricals() + self.indexes() + self.binaries()
  if len(categoricals) > 0: raise Exception('categoricals not currently supported')
  numericals = self.numericals()
  for idx, row in enumerate(utils.chunked_iterator(self)):
    label = '0' if y is None or idx >= len(y) else str(int(y[idx]))
    new_line = [label]     
    for col_idx, c in enumerate(numericals):
      cis = str(col_idx)
      new_line.append(cis + ':0:' + `0 if row[c] == 0  else row[c]`)
    line = ' '.join(new_line)
    if outfile is not None: outfile.write(line + '\n')
    else: lines.append(line)
  if outfile:
    outfile.close()
    return None
  else: return lines
 def impl(outfile):
   def add_cols(new_line, columns, is_numerical):
     if len(columns) == 0: return
     if tag_feature_sets: new_line.append('|' + ('n' if is_numerical else 'c'))
     for c in columns:        
       val = row[c] 
       if val == 0: continue        
       if not is_numerical:
         name = c + '_' + str(val)
         if output_categorical_value: line = get_col_index(name) + ':1'
         else: 
           line = get_col_index(name)
       else:           
         line = get_col_index(c) + ':' + str(val)
       new_line.append(line)
     
   lines = []  
   for idx, row in enumerate(utils.chunked_iterator(df)):
     label = '1.0' if y is None or idx >= len(y) else str(float(y[idx]))
     if convert_zero_ys and label == '0.0': label = '-1.0'
     if weights is not None and idx < len(weights):      
       w = weights[idx]
       if w != 1: label += ' ' + `w`
       label += ' \'' + `idx`
     
     new_line = []
     add_cols(new_line, df.numericals(), True)
     add_cols(new_line, df.categoricals() + df.indexes() + df.binaries(), False)
     if sort_feature_indexes:
       new_line = sorted(new_line, key=lambda v: int(v.split(':')[0]))
     line = ' '.join([label] + new_line)
 
     if outfile: outfile.write(line + '\n')
     else: lines.append(line)
   return lines
Example #3
0
 def _anonymize_by_obj(self, obj_anonymizer, qs):
     bar = pyprind.ProgBar(qs.count(),
                           title='Anonymize model {}'.format(
                               self._get_full_model_name(qs.model)),
                           stream=ProgressBarStream(self.stdout))
     for obj in chunked_iterator(qs, obj_anonymizer.chunk_size):
         obj_anonymizer().anonymize_obj(obj)
         bar.update()