Example #1
File: sources.py  Project: bigmlcom/bigmler
def multi_label_expansion(training_set, training_set_header,
                          args, output_path,
                          labels=None, session_file=None, input_flag=False):
    """Splitting the labels in a multi-label objective field to create
       a source with column per label

    """
    objective_field = args.objective_field
    input_reader = TrainReader(training_set, training_set_header,
                               objective_field, multi_label=True,
                               labels=labels,
                               label_separator=args.label_separator,
                               training_separator=args.training_separator,
                               multi_label_fields=args.multi_label_fields_list,
                               label_aggregates=args.label_aggregates_list,
                               objective=not input_flag)
    # read file to get all the different labels if no --labels flag is given
    # or use labels given in --labels and generate the new field names
    new_headers = input_reader.get_label_headers()

    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        file_name = "test_set.csv" if input_flag else "training_set.csv"
    output_file = "%s%sextended_%s" % (output_path, os.sep, file_name)
    message = u.dated("Transforming to extended source.\n")
    u.log_message(message, log_file=session_file,
                  console=args.verbosity)
    with open(output_file, u.open_mode('w')) as output_handler:
        output = csv.writer(output_handler, lineterminator="\n")
        output.writerow(new_headers)
        # read to write new source file with column per label
        input_reader.reset()
        if training_set_header:
            input_reader.get_next()
        while True:
            try:
                row = input_reader.get_next(extended=True)
                output.writerow(row)
            except StopIteration:
                break

    # training sources are zipped to minimize upload time and resources
    if not input_flag:
        output_file_zip = "%s%sextended_%s.zip" % (output_path,
                                                   os.sep, file_name)
        with ZipFile(output_file_zip, 'w', ZIP_DEFLATED) as output_zipped_file:
            output_zipped_file.write(output_file, file_name)
        output_file = output_file_zip
        objective_field = input_reader.headers[input_reader.objective_column]

    input_reader.close()
    return (output_file, input_reader.get_multi_label_data())
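
A quick way to see what this helper expects: every args attribute it reads is visible in the body above (objective_field, label_separator, training_separator, multi_label_fields_list, label_aggregates_list, verbosity). Below is a minimal, hedged sketch of a call, using argparse.Namespace as a stand-in for bigmler's parsed options; the field names and values are illustrative, not taken from a real dataset.

from argparse import Namespace

# Stand-in for bigmler's parsed command-line options; only the attributes
# that multi_label_expansion actually reads are filled in (values are examples).
args = Namespace(
    objective_field="tags",            # multi-label objective column (assumed name)
    label_separator=":",               # separator between labels inside that column
    training_separator=",",            # field separator of the training CSV
    multi_label_fields_list=[],        # additional multi-label fields, if any
    label_aggregates_list=[],          # per-label aggregate functions, if any
    verbosity=1,                       # echo log messages to the console
)

# Returns the path to the (zipped) extended source plus the multi-label metadata.
output_file, multi_label_data = multi_label_expansion(
    "training_set.csv", True, args, "./output",
    labels=None, session_file="session.log")
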
Example #2
File: sources.py  Project: osroca/bigmler
def multi_label_expansion(training_set,
                          training_set_header,
                          args,
                          output_path,
                          labels=None,
                          session_file=None,
                          input_flag=False):
    """Splitting the labels in a multi-label objective field to create
       a source with column per label

    """
    objective_field = args.objective_field
    input_reader = TrainReader(training_set,
                               training_set_header,
                               objective_field,
                               multi_label=True,
                               labels=labels,
                               label_separator=args.label_separator,
                               training_separator=args.training_separator,
                               multi_label_fields=args.multi_label_fields_list,
                               label_aggregates=args.label_aggregates_list,
                               objective=not input_flag)
    # read file to get all the different labels if no --labels flag is given
    # or use labels given in --labels and generate the new field names
    new_headers = input_reader.get_label_headers()

    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        file_name = "test_set.csv" if input_flag else "training_set.csv"
    output_file = "%s%sextended_%s" % (output_path, os.sep, file_name)
    message = u.dated("Transforming to extended source.\n")
    u.log_message(message, log_file=session_file, console=args.verbosity)
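    # NOTE: buffering=0 (unbuffered) only works for Python 2 text files here;
    # Python 3 rejects unbuffered text mode (Example #1 uses u.open_mode('w') instead).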
    with open(output_file, 'w', 0) as output_handler:
        output = csv.writer(output_handler, lineterminator="\n")
        output.writerow(new_headers)
        # read to write new source file with column per label
        input_reader.reset()
        if training_set_header:
            input_reader.next()
        while True:
            try:
                row = input_reader.next(extended=True)
                output.writerow(row)
            except StopIteration:
                break

    # training sources are zipped to minimize upload time and resources
    if not input_flag:
        output_file_zip = "%s%sextended_%s.zip" % (output_path, os.sep,
                                                   file_name)
        with ZipFile(output_file_zip, 'w', ZIP_DEFLATED) as output_zipped_file:
            output_zipped_file.write(output_file, file_name)
        output_file = output_file_zip
        objective_field = input_reader.headers[input_reader.objective_column]

    return (output_file, input_reader.get_multi_label_data())
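
This older variant leans on Python 2 behaviour: the unbuffered text-mode open noted above and the reader's next() method (renamed get_next() in Example #1). If only the file-writing part were ported to Python 3, the documented way to hand a text file to csv.writer is newline="" rather than buffering=0 (Example #1 delegates this choice to the u.open_mode('w') helper); a sketch:

output_file = "extended_training_set.csv"   # illustrative path

# Python 2, as in the snippet above: unbuffered text mode.
output_handler = open(output_file, 'w', 0)

# Python 3 equivalent for csv output: let csv.writer control line endings.
output_handler = open(output_file, 'w', newline='')
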
Example #3
def multi_label_expansion(training_set, training_set_header, objective_field,
                          args, output_path, field_attributes=None,
                          labels=None, session_file=None):
    """Splitting the labels in a multi-label objective field to create
       a source with column per label

    """
    # find out column number corresponding to the objective field
    training_reader = TrainReader(training_set, training_set_header,
                                  objective_field, multi_label=True,
                                  labels=labels,
                                  label_separator=args.label_separator,
                                  training_separator=args.training_separator)
    # read file to get all the different labels if no --labels flag is given
    # or use labels given in --labels and generate the new field names
    new_headers = training_reader.get_headers(objective_field=False)
    new_field_names = [l.get_label_field(training_reader.objective_name, label)
                       for label in training_reader.labels]
    new_headers.extend(new_field_names)
    new_headers.append(training_reader.objective_name)
    new_headers = [header.encode("utf-8") for header in new_headers]
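    # Encoding headers to bytes keeps Python 2's csv module happy with non-ASCII
    # names; Python 3's csv.writer expects str, so this step would be dropped there.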
    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        file_name = "training_set.csv"
    output_file = "%s%sextended_%s" % (output_path, os.sep, file_name)
    message = u.dated("Transforming to extended source.\n")
    u.log_message(message, log_file=session_file,
                  console=args.verbosity)
    with open(output_file, 'w', 0) as output_handler:
        output = csv.writer(output_handler, lineterminator="\n")
        output.writerow(new_headers)
        # read to write new source file with column per label
        training_reader.reset()
        if training_set_header:
            training_reader.next()
        while True:
            try:
                row = training_reader.next(extended=True)
                output.writerow(row)
            except StopIteration:
                break
    objective_field = training_reader.headers[training_reader.objective_column]
    if field_attributes is None:
        field_attributes = {}
    for label_column, label in training_reader.labels_columns():
        field_attributes.update({label_column: {
            "label": "%s%s" % (l.MULTI_LABEL_LABEL, label)}})
    # Setting field label to mark objective and label fields and objective
    # field (just in case it was not set previously and other derived fields
    # are added in the source construction process after the real last field).
    return (output_file, training_reader.labels, field_attributes,
            training_reader.objective_name)
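
Unlike the first two variants, this version also returns the label list, the per-column field attributes, and the objective field name, so the caller can annotate the uploaded source. A hedged sketch of consuming that tuple, reusing an args namespace like the one sketched after Example #1 (the variable names below are ours):

# "tags" stands in for the multi-label objective field name.
output_file, labels, field_attributes, objective_name = multi_label_expansion(
    "training_set.csv", True, "tags", args, "./output")

# field_attributes maps each generated label column index to a display label,
# e.g. {5: {"label": l.MULTI_LABEL_LABEL + "sports"}}, which can later be
# attached to the created source so the per-label columns stay recognisable.
for column, attributes in sorted(field_attributes.items()):
    print(column, attributes["label"])
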