Beispiel #1
0
def test_totext():

    # exercise function
    table = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 2))
    f = NamedTemporaryFile(delete=False)
    prologue = """{| class="wikitable"
|-
! foo
! bar
"""
    template = """|-
| {foo}
| {bar}
"""
    epilogue = "|}"
    totext(table, f.name, template, prologue, epilogue)

    # check what it did
    with open(f.name, 'rb') as o:
        actual = o.read()
        expect = """{| class="wikitable"
|-
! foo
! bar
|-
| a
| 1
|-
| b
| 2
|-
| c
| 2
|}"""
        eq_(expect, actual)
Beispiel #2
0
def test_totext():

    # exercise function
    table = (("foo", "bar"), ("a", 1), ("b", 2), ("c", 2))
    f = NamedTemporaryFile(delete=False)
    prologue = """{| class="wikitable"
|-
! foo
! bar
"""
    template = """|-
| {foo}
| {bar}
"""
    epilogue = "|}"
    totext(table, f.name, template, prologue, epilogue)

    # check what it did
    with open(f.name, "rb") as o:
        actual = o.read()
        expect = """{| class="wikitable"
|-
! foo
! bar
|-
| a
| 1
|-
| b
| 2
|-
| c
| 2
|}"""
        assertequal(expect, actual)
Beispiel #3
0
    def execute(args):
        """Prints a JSON array of `Relationship` objects to stdout

        Args:
            args: `argparse` parsed arguments
        """
        uri = urlparse(args.uri)
        service = Service(uri, args.versioned)
        table = service.get_relationships()
        etl.totext(table, source=None, encoding='utf8', template='{relationship}\n')
Beispiel #4
0
def test_totext_gz():
    
    # exercise function
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    f = NamedTemporaryFile(delete=False)
    fn = f.name + '.gz'
    f.close()
    os.rename(f.name, fn)
    prologue = """{| class="wikitable"
|-
! foo
! bar
"""
    template = """|-
| {foo}
| {bar}
"""
    epilogue = "|}"
    totext(table, fn, template, prologue, epilogue)
    
    # check what it did
    o = gzip.open(fn, 'rb')
    try:
        actual = o.read()
        expect = """{| class="wikitable"
|-
! foo
! bar
|-
| a
| 1
|-
| b
| 2
|-
| c
| 2
|}"""
        eq_(expect, actual)
    finally:
        o.close()
Beispiel #5
0
def xref_symbol_reports():
    symbol_reports = [
        f for f in os.listdir()
        if re.match('OCLC Datasync Unresolved.*\.csv', f)
    ]

    today = str(date.today())

    for report in symbol_reports:

        symbol_split = re.split('^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted,
                                     xref_table_sorted,
                                     presorted=True,
                                     lkey="MMS ID",
                                     rkey="MMS ID")

        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table,
                      xls_outfile,
                      'Sheet1',
                      encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)
Beispiel #6
0
def test_totext_gz():

    # exercise function
    table = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 2))
    f = NamedTemporaryFile(delete=False)
    fn = f.name + '.gz'
    f.close()
    os.rename(f.name, fn)
    prologue = """{| class="wikitable"
|-
! foo
! bar
"""
    template = """|-
| {foo}
| {bar}
"""
    epilogue = "|}"
    totext(table, fn, template, prologue, epilogue)

    # check what it did
    o = gzip.open(fn, 'rb')
    try:
        actual = o.read()
        expect = """{| class="wikitable"
|-
! foo
! bar
|-
| a
| 1
|-
| b
| 2
|-
| c
| 2
|}"""
        eq_(expect, actual)
    finally:
        o.close()
Beispiel #7
0
def test_totext():
    
    # exercise function
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    f = NamedTemporaryFile(delete=False)
    prologue = """{| class="wikitable"
|-
! foo
! bar
"""
    template = """|-
| {foo}
| {bar}
"""
    epilogue = "|}"
    totext(table, f.name, template, prologue, epilogue)
    
    # check what it did
    with open(f.name, 'rb') as o:
        actual = o.read()
        expect = """{| class="wikitable"
|-
! foo
! bar
|-
| a
| 1
|-
| b
| 2
|-
| c
| 2
|}"""
        eq_(expect, actual)
Beispiel #8
0
    def on_message(self):
        assert self.message

        export_id = self.message["id"]
        log.info("Performing export %s", export_id)

        self.result = ExportResult()
        self.result.started = datetime_now()

        # Send status indicating that it is running
        self._send_response_message(STATUS_RUNNING, self.routing_key,
                                    export_id, self.result)

        # Get the WARCs from the API
        collection_id = self.message.get("collection", {}).get("id")
        seed_ids = []
        seed_uids = []
        for seed in self.message.get("seeds", []):
            seed_ids.append(seed["id"])
            seed_uids.append(seed["uid"])

        if (collection_id or seed_ids) and not (collection_id and seed_ids):
            harvest_date_start = self.message.get("harvest_date_start")
            harvest_date_end = self.message.get("harvest_date_end")
            # Only request seed ids if < 20. If use too many, will cause problems calling API.
            # 20 is an arbitrary number
            warc_paths = self._get_warc_paths(
                collection_id, seed_ids if len(seed_ids) <= 20 else None,
                harvest_date_start, harvest_date_end)
            export_format = self.message["format"]
            export_segment_size = self.message["segment_size"]
            export_path = self.message["path"]
            dedupe = self.message.get("dedupe", False)
            item_date_start = iso8601.parse_date(
                self.message["item_date_start"]
            ) if "item_date_start" in self.message else None
            item_date_end = iso8601.parse_date(
                self.message["item_date_end"]
            ) if "item_date_end" in self.message else None
            temp_path = os.path.join(self.working_path, "tmp")
            base_filepath = os.path.join(temp_path, export_id)

            if warc_paths:

                # Clean the temp directory
                if os.path.exists(temp_path):
                    shutil.rmtree(temp_path)
                os.makedirs(temp_path)

                # We get a lot of bang from PETL
                export_formats = {
                    "csv": ("csv", petl.tocsv),
                    "tsv": ("tsv", petl.totsv),
                    "html": ("html", petl.tohtml),
                    "xlsx": ("xlsx", to_xlsx),
                    "json": ("json", to_lineoriented_json)
                }
                # Other possibilities: XML, databases, HDFS
                if export_format == "json_full":
                    self._full_json_export(warc_paths, base_filepath, dedupe,
                                           item_date_start, item_date_end,
                                           seed_uids, export_segment_size)
                elif export_format == "dehydrate":
                    tables = self.table_cls(warc_paths, dedupe,
                                            item_date_start, item_date_end,
                                            seed_uids, export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.txt".format(base_filepath,
                                                      str(idx + 1).zfill(3))
                        log.info("Exporting to %s", filepath)
                        petl.totext(table,
                                    filepath,
                                    template="{{{}}}\n".format(
                                        tables.id_field()))
                elif export_format in export_formats:
                    tables = self.table_cls(warc_paths, dedupe,
                                            item_date_start, item_date_end,
                                            seed_uids, export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.{}".format(
                            base_filepath,
                            str(idx + 1).zfill(3),
                            export_formats[export_format][0])
                        log.info("Exporting to %s", filepath)
                        export_formats[export_format][1](table, filepath)
                        if export_format == 'html':
                            self._file_fix(
                                filepath,
                                prefix=
                                "<html><head><meta charset='utf-8'></head>\n",
                                suffix="</html>")
                else:
                    self.result.errors.append(
                        Msg(CODE_UNSUPPORTED_EXPORT_FORMAT,
                            "{} is not supported".format(export_format)))
                    self.result.success = False

                # Move files from temp path to export path
                if os.path.exists(export_path):
                    shutil.rmtree(export_path)
                shutil.move(temp_path, export_path)

            else:
                self.result.errors.append(
                    Msg(CODE_NO_WARCS, "No WARC files from which to export"))
                self.result.success = False

        else:
            self.result.errors.append(
                Msg(CODE_BAD_REQUEST,
                    "Request export of a seed or collection."))
            self.result.success = False

        self.result.ended = datetime_now()
        self._send_response_message(
            STATUS_SUCCESS if self.result.success else STATUS_FAILURE,
            self.routing_key, export_id, self.result)
Beispiel #9
0
    def on_message(self):
        assert self.message

        export_id = self.message["id"]
        log.info("Performing export %s", export_id)

        self.result = ExportResult()
        self.result.started = datetime_now()

        # Send status indicating that it is running
        self._send_response_message(STATUS_RUNNING, self.routing_key, export_id, self.result)

        # Get the WARCs from the API
        collection_id = self.message.get("collection", {}).get("id")
        seed_ids = []
        seed_uids = []
        for seed in self.message.get("seeds", []):
            seed_ids.append(seed["id"])
            seed_uids.append(seed["uid"])

        if (collection_id or seed_ids) and not (collection_id and seed_ids):
            harvest_date_start = self.message.get("harvest_date_start")
            harvest_date_end = self.message.get("harvest_date_end")
            warc_paths = self._get_warc_paths(collection_id, seed_ids, harvest_date_start, harvest_date_end)
            export_format = self.message["format"]
            export_segment_size = self.message["segment_size"]
            export_path = self.message["path"]
            dedupe = self.message.get("dedupe", False)
            item_date_start = iso8601.parse_date(
                self.message["item_date_start"]) if "item_date_start" in self.message else None
            item_date_end = iso8601.parse_date(
                self.message["item_date_end"]) if "item_date_end" in self.message else None
            temp_path = os.path.join(self.working_path, "tmp")
            base_filepath = os.path.join(temp_path, export_id)

            if warc_paths:

                # Clean the temp directory
                if os.path.exists(temp_path):
                    shutil.rmtree(temp_path)
                os.makedirs(temp_path)

                # We get a lot of bang from PETL
                export_formats = {
                    "csv": ("csv", petl.tocsv),
                    "tsv": ("tsv", petl.totsv),
                    "html": ("html", petl.tohtml),
                    "xlsx": ("xlsx", to_xlsx),
                    "json": ("json", to_lineoriented_json)
                }
                # Other possibilities: XML, databases, HDFS
                if export_format == "json_full":
                    self._full_json_export(warc_paths, base_filepath, dedupe, item_date_start, item_date_end, seed_uids,
                                           export_segment_size)
                elif export_format == "dehydrate":
                    tables = self.table_cls(warc_paths, dedupe, item_date_start, item_date_end, seed_uids,
                                            export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.txt".format(base_filepath, str(idx + 1).zfill(3))
                        log.info("Exporting to %s", filepath)
                        petl.totext(table, filepath, template="{{{}}}\n".format(tables.id_field()))
                elif export_format in export_formats:
                    tables = self.table_cls(warc_paths, dedupe, item_date_start, item_date_end, seed_uids,
                                            export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.{}".format(base_filepath, str(idx + 1).zfill(3),
                                                     export_formats[export_format][0])
                        log.info("Exporting to %s", filepath)
                        export_formats[export_format][1](table, filepath)
                        if export_format == 'html':
                            self._file_fix(filepath, prefix="<html><head><meta charset='utf-8'></head>\n",
                                           suffix="</html>")
                else:
                    self.result.errors.append(
                        Msg(CODE_UNSUPPORTED_EXPORT_FORMAT, "{} is not supported".format(export_format)))
                    self.result.success = False

                # Move files from temp path to export path
                if os.path.exists(export_path):
                    shutil.rmtree(export_path)
                shutil.move(temp_path, export_path)

            else:
                self.result.errors.append(Msg(CODE_NO_WARCS, "No WARC files from which to export"))
                self.result.success = False

        else:
            self.result.errors.append(Msg(CODE_BAD_REQUEST, "Request export of a seed or collection."))
            self.result.success = False

        self.result.ended = datetime_now()
        self._send_response_message(STATUS_SUCCESS if self.result.success else STATUS_FAILURE, self.routing_key,
                                    export_id, self.result)
Beispiel #10
0
    f.write(text)

table1 = etl.fromtext('example.txt')
table1
# post-process, e.g., with capture()
table2 = table1.capture('lines', '(.*),(.*)$', ['foo', 'bar'])
table2


# totext()
##########

import petl as etl
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 2]]
prologue = '''{| class="wikitable"
|-
! foo
! bar
'''
template = '''|-
| {foo}
| {bar}
'''
epilogue = '|}'
etl.totext(table1, 'example.txt', template, prologue, epilogue)
# see what we did
print(open('example.txt').read())