Example #1
0
def get_transform():
    """Assemble and return the DataWrangler script for this example.

    Transforms are added in this order: split ``data`` into rows on
    newlines, filter empty rows, filter rows starting with ``===``, filter
    one specific HTML-comment row, filter rows containing ``<s>'''``,
    extract positions 5, 9 of ``data`` into a column, and drop ``data``.

    Returns:
        The ``dw.DataWrangler`` instance the transforms were added to.
    """

    def delete_rows_matching(condition):
        # Every row filter in this script uses the same Filter/Row settings;
        # only the single condition placed inside the Row differs.
        matching_rows = dw.Row(
            column=[],
            table=0,
            status="active",
            drop=False,
            conditions=[condition],
        )
        return dw.Filter(
            column=[],
            table=0,
            status="active",
            drop=False,
            row=matching_rows,
        )

    wrangler = dw.DataWrangler()

    # Step 1: split "data" repeatedly on newline, producing rows.
    split_into_lines = dw.Split(
        column=["data"],
        table=0,
        status="active",
        drop=True,
        result="row",
        update=False,
        insert_position="right",
        row=None,
        on="\n",
        before=None,
        after=None,
        ignore_between=None,
        which=1,
        max=0,
        positions=None,
        quote_character=None,
    )
    wrangler.add(split_into_lines)

    # Step 2: delete rows that are empty.
    is_empty = dw.Empty(
        column=[],
        table=0,
        status="active",
        drop=False,
        percent_valid=0,
        num_valid=0,
    )
    wrangler.add(delete_rows_matching(is_empty))

    # Step 3: delete rows whose "data" starts with '==='.
    is_heading = dw.StartsWith(
        column=[],
        table=0,
        status="active",
        drop=False,
        lcol="data",
        value="===",
        op_str="starts with",
    )
    wrangler.add(delete_rows_matching(is_heading))

    # Step 4: delete the row whose "data" equals the KBDX HTML comment.
    is_kbdx_comment = dw.Eq(
        column=[],
        table=0,
        status="active",
        drop=False,
        lcol="data",
        value="<!-- KBDX was Broadus Airport in Broadus, Montana. Replaced by new airport with FAA ID: 00F -->",
        op_str="=",
    )
    wrangler.add(delete_rows_matching(is_kbdx_comment))

    # Step 5: delete rows whose "data" contains "<s>'''".
    has_struck_entry = dw.Contains(
        column=[],
        table=0,
        status="active",
        drop=False,
        lcol="data",
        value="<s>'''",
        op_str="contains",
    )
    wrangler.add(delete_rows_matching(has_struck_entry))

    # Step 6: extract from "data" between positions 5, 9 into a column.
    extract_by_position = dw.Extract(
        column=["data"],
        table=0,
        status="active",
        drop=False,
        result="column",
        update=False,
        insert_position="right",
        row=None,
        on=None,
        before=None,
        after=None,
        ignore_between=None,
        which=1,
        max=1,
        positions=[5, 9],
    )
    wrangler.add(extract_by_position)

    # Step 7: drop the original "data" column.
    drop_data = dw.Drop(column=["data"], table=0, status="active", drop=True)
    wrangler.add(drop_data)

    return wrangler
Example #2
0
from wrangler import dw

w = dw.DataWrangler()

# First transform: split the "data" column repeatedly on newline so that
# each piece becomes a row (result="row").
w.add(
    dw.Split(
        column=["data"],
        table=0,
        status="active",
        drop=True,
        result="row",
        update=False,
        insert_position="right",
        row=None,
        on="\n",
        before=None,
        after=None,
        ignore_between=None,
        which=1,
        max=0,
        positions=None,
        quote_character=None,
    )
)

# Split data repeatedly on ','
w.add(dw.Split(column=["data"],
               table=0,
               status="active",
               drop=True,
               result="column",
               update=False,
               insert_position="right",
               row=None,
Example #3
0
    sys.exit(
        'Error: Please include an input and output file.  Example python script.py input.csv output.csv'
    )

w = dw.DataWrangler()

# First transform: split the "data" column repeatedly on '|-' so that each
# piece becomes a row (result="row"). The pipe is backslash-escaped in the
# `on` pattern.
w.add(
    dw.Split(
        column=["data"],
        table=0,
        status="active",
        drop=True,
        result="row",
        update=False,
        insert_position="right",
        row=None,
        on="\\|-",
        before=None,
        after=None,
        ignore_between=None,
        which=1,
        max=0,
        positions=None,
        quote_character=None,
    )
)

# Cut from data on '| any lowercase word =#FFF any number  any word \|'
w.add(
    dw.Cut(column=["data"],
           table=0,
           status="active",
           drop=False,
           result="column",