from tempfile import NamedTemporaryFile

from petl import totext
from petl.test.helpers import eq_


def test_totext():
    # exercise function
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    f = NamedTemporaryFile(delete=False)
    prologue = """{| class="wikitable"
|-
! foo
! bar
"""
    template = """|-
| {foo}
| {bar}
"""
    epilogue = "|}"
    totext(table, f.name, template=template, prologue=prologue,
           epilogue=epilogue)

    # check what it did
    with open(f.name) as o:
        actual = o.read()
        expect = """{| class="wikitable"
|-
! foo
! bar
|-
| a
| 1
|-
| b
| 2
|-
| c
| 2
|}"""
        eq_(expect, actual)
from tempfile import NamedTemporaryFile

from petl import totext
# assertequal is this suite's equality helper (same role as eq_ above)


def test_totext():
    # exercise function
    table = (("foo", "bar"),
             ("a", 1),
             ("b", 2),
             ("c", 2))
    f = NamedTemporaryFile(delete=False)
    prologue = """{| class="wikitable"
|-
! foo
! bar
"""
    template = """|-
| {foo}
| {bar}
"""
    epilogue = "|}"
    totext(table, f.name, template=template, prologue=prologue,
           epilogue=epilogue)

    # check what it did
    with open(f.name) as o:
        actual = o.read()
        expect = """{| class="wikitable"
|-
! foo
! bar
|-
| a
| 1
|-
| b
| 2
|-
| c
| 2
|}"""
        assertequal(expect, actual)
def execute(args):
    """Prints a JSON array of `Relationship` objects to stdout

    Args:
        args: `argparse` parsed arguments
    """
    uri = urlparse(args.uri)
    service = Service(uri, args.versioned)
    table = service.get_relationships()
    etl.totext(table, source=None, encoding='utf8',
               template='{relationship}\n')
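With source=None, petl.totext writes to standard output, rendering the template once per data row. A minimal sketch of the same call, with an illustrative single-column table standing in for service.get_relationships() (the table and its values are assumptions, not from the service above):

import petl as etl

# Illustrative stand-in for service.get_relationships():
# a header row followed by one value per data row
table = [('relationship',),
         ('{"from": "a", "to": "b"}',),
         ('{"from": "b", "to": "c"}',)]

# source=None sends output to stdout; each data row fills the template once
etl.totext(table, source=None, encoding='utf8', template='{relationship}\n')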
import gzip
import os
from tempfile import NamedTemporaryFile

from petl import totext
from petl.test.helpers import eq_


def test_totext_gz():
    # exercise function
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    f = NamedTemporaryFile(delete=False)
    fn = f.name + '.gz'
    f.close()
    os.rename(f.name, fn)
    prologue = """{| class="wikitable"
|-
! foo
! bar
"""
    template = """|-
| {foo}
| {bar}
"""
    epilogue = "|}"
    totext(table, fn, template=template, prologue=prologue, epilogue=epilogue)

    # check what it did
    o = gzip.open(fn, 'rt')
    try:
        actual = o.read()
        expect = """{| class="wikitable"
|-
! foo
! bar
|-
| a
| 1
|-
| b
| 2
|-
| c
| 2
|}"""
        eq_(expect, actual)
    finally:
        o.close()
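The test above works because petl picks the sink from the target filename: a name ending in .gz is written through gzip transparently, with no extra arguments. A minimal sketch (the filename and table here are illustrative):

import gzip

import petl as etl

table = [('foo', 'bar'), ('a', 1), ('b', 2)]
# the '.gz' suffix makes petl write through gzip automatically
etl.totext(table, 'example.txt.gz', template='{foo},{bar}\n')
with gzip.open('example.txt.gz', 'rt') as f:
    print(f.read())   # prints 'a,1' and 'b,2', one pair per line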
import os
import re
from datetime import date

import petl as etl


def xref_symbol_reports():
    symbol_reports = [f for f in os.listdir()
                      if re.match(r'OCLC Datasync Unresolved.*\.csv', f)]

    today = str(date.today())

    for report in symbol_reports:
        symbol_split = re.split(r'^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        # strip the UTF-8 byte-order mark that fromcsv leaves on the first header
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted, xref_table_sorted,
                                     presorted=True, lkey="MMS ID",
                                     rkey="MMS ID")
        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table, xls_outfile, 'Sheet1',
                      encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)
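The '\ufeffMMS Id' rename above deals with a UTF-8 byte-order mark that etl.fromcsv leaves attached to the first header when a file was saved as "UTF-8 with BOM" (as Excel often does). A minimal sketch of the same fix; the file name and contents here are illustrative:

import petl as etl

# write a CSV with a UTF-8 BOM ('utf-8-sig' prepends \ufeff)
with open('bom_example.csv', 'w', encoding='utf-8-sig') as f:
    f.write('MMS Id,Staging OCN\n991,12345\n')

table = etl.fromcsv('bom_example.csv', encoding='utf-8')
print(etl.header(table))    # ('\ufeffMMS Id', 'Staging OCN')
fixed = etl.rename(table, '\ufeffMMS Id', 'MMS ID')
print(etl.header(fixed))    # ('MMS ID', 'Staging OCN')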
def on_message(self):
    assert self.message
    export_id = self.message["id"]
    log.info("Performing export %s", export_id)

    self.result = ExportResult()
    self.result.started = datetime_now()

    # Send status indicating that it is running
    self._send_response_message(STATUS_RUNNING, self.routing_key, export_id,
                                self.result)

    # Get the WARCs from the API
    collection_id = self.message.get("collection", {}).get("id")
    seed_ids = []
    seed_uids = []
    for seed in self.message.get("seeds", []):
        seed_ids.append(seed["id"])
        seed_uids.append(seed["uid"])

    if (collection_id or seed_ids) and not (collection_id and seed_ids):
        harvest_date_start = self.message.get("harvest_date_start")
        harvest_date_end = self.message.get("harvest_date_end")
        # Only pass seed ids when there are no more than 20; sending too many
        # causes problems calling the API. 20 is an arbitrary cutoff.
        warc_paths = self._get_warc_paths(
            collection_id, seed_ids if len(seed_ids) <= 20 else None,
            harvest_date_start, harvest_date_end)
        export_format = self.message["format"]
        export_segment_size = self.message["segment_size"]
        export_path = self.message["path"]
        dedupe = self.message.get("dedupe", False)
        item_date_start = iso8601.parse_date(
            self.message["item_date_start"]
        ) if "item_date_start" in self.message else None
        item_date_end = iso8601.parse_date(
            self.message["item_date_end"]
        ) if "item_date_end" in self.message else None
        temp_path = os.path.join(self.working_path, "tmp")
        base_filepath = os.path.join(temp_path, export_id)

        if warc_paths:
            # Clean the temp directory
            if os.path.exists(temp_path):
                shutil.rmtree(temp_path)
            os.makedirs(temp_path)

            # We get a lot of bang from PETL
            export_formats = {
                "csv": ("csv", petl.tocsv),
                "tsv": ("tsv", petl.totsv),
                "html": ("html", petl.tohtml),
                "xlsx": ("xlsx", to_xlsx),
                "json": ("json", to_lineoriented_json)
            }
            # Other possibilities: XML, databases, HDFS
            if export_format == "json_full":
                self._full_json_export(warc_paths, base_filepath, dedupe,
                                       item_date_start, item_date_end,
                                       seed_uids, export_segment_size)
            elif export_format == "dehydrate":
                tables = self.table_cls(warc_paths, dedupe, item_date_start,
                                        item_date_end, seed_uids,
                                        export_segment_size)
                for idx, table in enumerate(tables):
                    filepath = "{}_{}.txt".format(base_filepath,
                                                  str(idx + 1).zfill(3))
                    log.info("Exporting to %s", filepath)
                    petl.totext(table, filepath,
                                template="{{{}}}\n".format(tables.id_field()))
            elif export_format in export_formats:
                tables = self.table_cls(warc_paths, dedupe, item_date_start,
                                        item_date_end, seed_uids,
                                        export_segment_size)
                for idx, table in enumerate(tables):
                    filepath = "{}_{}.{}".format(
                        base_filepath, str(idx + 1).zfill(3),
                        export_formats[export_format][0])
                    log.info("Exporting to %s", filepath)
                    export_formats[export_format][1](table, filepath)
                    if export_format == 'html':
                        self._file_fix(
                            filepath,
                            prefix="<html><head><meta charset='utf-8'></head>\n",
                            suffix="</html>")
            else:
                self.result.errors.append(
                    Msg(CODE_UNSUPPORTED_EXPORT_FORMAT,
                        "{} is not supported".format(export_format)))
                self.result.success = False

            # Move files from temp path to export path
            if os.path.exists(export_path):
                shutil.rmtree(export_path)
            shutil.move(temp_path, export_path)
        else:
            self.result.errors.append(
                Msg(CODE_NO_WARCS, "No WARC files from which to export"))
            self.result.success = False
    else:
        self.result.errors.append(
            Msg(CODE_BAD_REQUEST, "Request export of a seed or collection."))
        self.result.success = False

    self.result.ended = datetime_now()
    self._send_response_message(
        STATUS_SUCCESS if self.result.success else STATUS_FAILURE,
        self.routing_key, export_id, self.result)
def on_message(self):
    assert self.message
    export_id = self.message["id"]
    log.info("Performing export %s", export_id)

    self.result = ExportResult()
    self.result.started = datetime_now()

    # Send status indicating that it is running
    self._send_response_message(STATUS_RUNNING, self.routing_key, export_id,
                                self.result)

    # Get the WARCs from the API
    collection_id = self.message.get("collection", {}).get("id")
    seed_ids = []
    seed_uids = []
    for seed in self.message.get("seeds", []):
        seed_ids.append(seed["id"])
        seed_uids.append(seed["uid"])

    if (collection_id or seed_ids) and not (collection_id and seed_ids):
        harvest_date_start = self.message.get("harvest_date_start")
        harvest_date_end = self.message.get("harvest_date_end")
        warc_paths = self._get_warc_paths(collection_id, seed_ids,
                                          harvest_date_start,
                                          harvest_date_end)
        export_format = self.message["format"]
        export_segment_size = self.message["segment_size"]
        export_path = self.message["path"]
        dedupe = self.message.get("dedupe", False)
        item_date_start = iso8601.parse_date(
            self.message["item_date_start"]
        ) if "item_date_start" in self.message else None
        item_date_end = iso8601.parse_date(
            self.message["item_date_end"]
        ) if "item_date_end" in self.message else None
        temp_path = os.path.join(self.working_path, "tmp")
        base_filepath = os.path.join(temp_path, export_id)

        if warc_paths:
            # Clean the temp directory
            if os.path.exists(temp_path):
                shutil.rmtree(temp_path)
            os.makedirs(temp_path)

            # We get a lot of bang from PETL
            export_formats = {
                "csv": ("csv", petl.tocsv),
                "tsv": ("tsv", petl.totsv),
                "html": ("html", petl.tohtml),
                "xlsx": ("xlsx", to_xlsx),
                "json": ("json", to_lineoriented_json)
            }
            # Other possibilities: XML, databases, HDFS
            if export_format == "json_full":
                self._full_json_export(warc_paths, base_filepath, dedupe,
                                       item_date_start, item_date_end,
                                       seed_uids, export_segment_size)
            elif export_format == "dehydrate":
                tables = self.table_cls(warc_paths, dedupe, item_date_start,
                                        item_date_end, seed_uids,
                                        export_segment_size)
                for idx, table in enumerate(tables):
                    filepath = "{}_{}.txt".format(base_filepath,
                                                  str(idx + 1).zfill(3))
                    log.info("Exporting to %s", filepath)
                    petl.totext(table, filepath,
                                template="{{{}}}\n".format(tables.id_field()))
            elif export_format in export_formats:
                tables = self.table_cls(warc_paths, dedupe, item_date_start,
                                        item_date_end, seed_uids,
                                        export_segment_size)
                for idx, table in enumerate(tables):
                    filepath = "{}_{}.{}".format(
                        base_filepath, str(idx + 1).zfill(3),
                        export_formats[export_format][0])
                    log.info("Exporting to %s", filepath)
                    export_formats[export_format][1](table, filepath)
                    if export_format == 'html':
                        self._file_fix(
                            filepath,
                            prefix="<html><head><meta charset='utf-8'></head>\n",
                            suffix="</html>")
            else:
                self.result.errors.append(
                    Msg(CODE_UNSUPPORTED_EXPORT_FORMAT,
                        "{} is not supported".format(export_format)))
                self.result.success = False

            # Move files from temp path to export path
            if os.path.exists(export_path):
                shutil.rmtree(export_path)
            shutil.move(temp_path, export_path)
        else:
            self.result.errors.append(
                Msg(CODE_NO_WARCS, "No WARC files from which to export"))
            self.result.success = False
    else:
        self.result.errors.append(
            Msg(CODE_BAD_REQUEST, "Request export of a seed or collection."))
        self.result.success = False

    self.result.ended = datetime_now()
    self._send_response_message(
        STATUS_SUCCESS if self.result.success else STATUS_FAILURE,
        self.routing_key, export_id, self.result)
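The export_formats mapping in the two handlers above pairs each format name with a file extension and a petl writer, so supporting a new tabular format is a one-line change. A minimal standalone sketch of the same dispatch pattern, using only writers that exist in petl at these signatures (the sample table and output name are illustrative):

import petl

# (extension, writer) per format, mirroring the export_formats dict above
writers = {
    "csv": ("csv", petl.tocsv),
    "tsv": ("tsv", petl.totsv),
    "html": ("html", petl.tohtml),
}

table = [("foo", "bar"), ("a", 1), ("b", 2)]  # illustrative table
fmt = "csv"
ext, writer = writers[fmt]
writer(table, "example_001.{}".format(ext))   # writes example_001.csv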
# tail of the preceding fromtext() example; the file contents are assumed
# from the capture() pattern below
text = 'a,1\nb,2\nc,2\n'
with open('example.txt', 'w') as f:
    f.write(text)
table1 = etl.fromtext('example.txt')
table1
# post-process, e.g., with capture()
table2 = table1.capture('lines', '(.*),(.*)$', ['foo', 'bar'])
table2


# totext()
##########

import petl as etl
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 2]]
prologue = '''{| class="wikitable"
|-
! foo
! bar
'''
template = '''|-
| {foo}
| {bar}
'''
epilogue = '|}'
etl.totext(table1, 'example.txt', template=template, prologue=prologue,
           epilogue=epilogue)
# see what we did
print(open('example.txt').read())
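For reference, the final print shows the assembled wikitable, matching the expect value in the tests above:

{| class="wikitable"
|-
! foo
! bar
|-
| a
| 1
|-
| b
| 2
|-
| c
| 2
|}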