import os
import csv

from zipfile import ZipFile, ZIP_DEFLATED

# Project-level imports: the module paths below are assumed from the
# identifiers used in the functions (u.*, l.*, TrainReader); adjust them to
# the actual package layout if it differs.
import bigmler.utils as u
import bigmler.labels as l

from bigmler.train_reader import TrainReader


def multi_label_expansion(training_set, training_set_header, args,
                          output_path, labels=None, session_file=None,
                          input_flag=False):
    """Splitting the labels in a multi-label objective field to create
       a source with column per label

    """
    objective_field = args.objective_field
    input_reader = TrainReader(training_set, training_set_header,
                               objective_field, multi_label=True,
                               labels=labels,
                               label_separator=args.label_separator,
                               training_separator=args.training_separator,
                               multi_label_fields=args.multi_label_fields_list,
                               label_aggregates=args.label_aggregates_list,
                               objective=not input_flag)
    # read file to get all the different labels if no --labels flag is given
    # or use labels given in --labels and generate the new field names
    new_headers = input_reader.get_label_headers()

    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        file_name = "test_set.csv" if input_flag else "training_set.csv"
    output_file = "%s%sextended_%s" % (output_path, os.sep, file_name)
    message = u.dated("Transforming to extended source.\n")
    u.log_message(message, log_file=session_file, console=args.verbosity)
    with open(output_file, u.open_mode('w')) as output_handler:
        output = csv.writer(output_handler, lineterminator="\n")
        output.writerow(new_headers)
        # read to write new source file with column per label
        input_reader.reset()
        if training_set_header:
            input_reader.get_next()
        while True:
            try:
                row = input_reader.get_next(extended=True)
                output.writerow(row)
            except StopIteration:
                break

    # training sources are zipped to minimize upload time and resources
    if not input_flag:
        output_file_zip = "%s%sextended_%s.zip" % (output_path, os.sep,
                                                   file_name)
        with ZipFile(output_file_zip, 'w', ZIP_DEFLATED) as output_zipped_file:
            output_zipped_file.write(output_file, file_name)
        output_file = output_file_zip
    objective_field = input_reader.headers[input_reader.objective_column]
    input_reader.close()
    return (output_file, input_reader.get_multi_label_data())
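# Illustrative usage sketch (not part of the original module): shows one way
# the multi_label_expansion version defined directly above could be invoked,
# assuming that version is the one in scope. The Namespace literals
# (objective field name, separators, paths) are made-up example values; only
# the attribute names the function reads (objective_field, label_separator,
# training_separator, multi_label_fields_list, label_aggregates_list,
# verbosity) come from the code above.
def _example_multi_label_expansion_call():
    from argparse import Namespace

    parsed_args = Namespace(objective_field="tags",
                            label_separator=":",
                            training_separator=",",
                            multi_label_fields_list=[],
                            label_aggregates_list=[],
                            verbosity=1)
    # Expands the training CSV into a per-label extended source (zipped,
    # since input_flag defaults to False) and returns its path together with
    # the reader's multi-label bookkeeping structure.
    return multi_label_expansion("data/training_set.csv", True, parsed_args,
                                 "output_dir", session_file="session.log")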
def multi_label_expansion(training_set, training_set_header, args,
                          output_path, labels=None, session_file=None,
                          input_flag=False):
    """Splitting the labels in a multi-label objective field to create
       a source with column per label

    """
    objective_field = args.objective_field
    input_reader = TrainReader(training_set, training_set_header,
                               objective_field, multi_label=True,
                               labels=labels,
                               label_separator=args.label_separator,
                               training_separator=args.training_separator,
                               multi_label_fields=args.multi_label_fields_list,
                               label_aggregates=args.label_aggregates_list,
                               objective=not input_flag)
    # read file to get all the different labels if no --labels flag is given
    # or use labels given in --labels and generate the new field names
    new_headers = input_reader.get_label_headers()

    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        file_name = "test_set.csv" if input_flag else "training_set.csv"
    output_file = "%s%sextended_%s" % (output_path, os.sep, file_name)
    message = u.dated("Transforming to extended source.\n")
    u.log_message(message, log_file=session_file, console=args.verbosity)
    with open(output_file, 'w', 0) as output_handler:
        output = csv.writer(output_handler, lineterminator="\n")
        output.writerow(new_headers)
        # read to write new source file with column per label
        input_reader.reset()
        if training_set_header:
            input_reader.next()
        while True:
            try:
                row = input_reader.next(extended=True)
                output.writerow(row)
            except StopIteration:
                break

    # training sources are zipped to minimize upload time and resources
    if not input_flag:
        output_file_zip = "%s%sextended_%s.zip" % (output_path, os.sep,
                                                   file_name)
        with ZipFile(output_file_zip, 'w', ZIP_DEFLATED) as output_zipped_file:
            output_zipped_file.write(output_file, file_name)
        output_file = output_file_zip
    objective_field = input_reader.headers[input_reader.objective_column]
    return (output_file, input_reader.get_multi_label_data())
def multi_label_expansion(training_set, training_set_header, objective_field,
                          args, output_path, field_attributes=None,
                          labels=None, session_file=None):
    """Splitting the labels in a multi-label objective field to create
       a source with column per label

    """
    # find out column number corresponding to the objective field
    training_reader = TrainReader(training_set, training_set_header,
                                  objective_field, multi_label=True,
                                  labels=labels,
                                  label_separator=args.label_separator,
                                  training_separator=args.training_separator)
    # read file to get all the different labels if no --labels flag is given
    # or use labels given in --labels and generate the new field names
    new_headers = training_reader.get_headers(objective_field=False)
    new_field_names = [l.get_label_field(training_reader.objective_name,
                                         label)
                       for label in training_reader.labels]
    new_headers.extend(new_field_names)
    new_headers.append(training_reader.objective_name)
    new_headers = [header.encode("utf-8") for header in new_headers]

    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        file_name = "training_set.csv"
    output_file = "%s%sextended_%s" % (output_path, os.sep, file_name)
    message = u.dated("Transforming to extended source.\n")
    u.log_message(message, log_file=session_file, console=args.verbosity)
    with open(output_file, 'w', 0) as output_handler:
        output = csv.writer(output_handler, lineterminator="\n")
        output.writerow(new_headers)
        # read to write new source file with column per label
        training_reader.reset()
        if training_set_header:
            training_reader.next()
        while True:
            try:
                row = training_reader.next(extended=True)
                output.writerow(row)
            except StopIteration:
                break
    objective_field = training_reader.headers[training_reader.objective_column]
    if field_attributes is None:
        field_attributes = {}
    for label_column, label in training_reader.labels_columns():
        field_attributes.update({label_column: {
            "label": "%s%s" % (l.MULTI_LABEL_LABEL, label)}})
    # Setting field label to mark objective and label fields and objective
    # field (just in case it was not set previously and other derived fields
    # are added in the source construction process after the real last field).
    return (output_file, training_reader.labels, field_attributes,
            training_reader.objective_name)
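# Illustrative call for the older variant directly above (assumed example
# literals only): this signature receives the objective field explicitly,
# skips the zipping step, and returns the label list, a field_attributes
# dict mapping each label column to a {"label": ...} entry built from
# l.MULTI_LABEL_LABEL, and the objective field name, instead of the
# multi-label data structure returned by the newer versions.
def _example_legacy_multi_label_expansion_call():
    from argparse import Namespace

    parsed_args = Namespace(label_separator=":",
                            training_separator=",",
                            verbosity=1)
    output_file, found_labels, field_attributes, objective_name = \
        multi_label_expansion("data/training_set.csv", True, "tags",
                              parsed_args, "output_dir",
                              session_file="session.log")
    return output_file, found_labels, field_attributes, objective_name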