def generateManifest(syn, allFiles, filename):
    """Generates a manifest file based on a list of File entity objects.

    :param syn:      A Synapse client session
    :param allFiles: A list of File entities
    :param filename: File where the manifest will be written
    """
    keys = ['path', 'parent', 'name', 'synapseStore', 'contentType', 'used', 'executed',
            'activityName', 'activityDescription']
    annotKeys = set()
    data = []
    for entity in allFiles:
        row = {'parent': entity['parentId'],
               'path': entity.path,
               'name': entity.name,
               'synapseStore': entity.synapseStore,
               'contentType': entity['contentType']}  # was allFiles[0]['contentType']; use each entity's own value
        row.update({key: val[0] for key, val in entity.annotations.items()})
        annotKeys.update(set(entity.annotations.keys()))
        try:
            prov = syn.getProvenance(entity)
            row['used'] = ';'.join(prov._getUsedStringList())
            row['executed'] = ';'.join(prov._getExecutedStringList())
            row['activityName'] = prov.get('name', '')
            row['activityDescription'] = prov.get('description', '')
        except SynapseHTTPError:
            pass  # No provenance present
        data.append(row)
    keys.extend(annotKeys)
    with open(filename, 'w') as fp:
        csvWriter = csv.DictWriter(fp, keys, restval='', extrasaction='ignore', delimiter='\t')
        csvWriter.writeheader()
        for row in data:
            csvWriter.writerow(row)

def write_to_csv(self):
    if self.num_results > 0:
        self.num_results = sum(1 for line in codecs.open(self.tmp_file, mode='r', encoding='utf-8'))
        if self.num_results > 0:
            output_file = codecs.open(self.opts.output_file, mode='a', encoding='utf-8')
            csv_writer = csv.DictWriter(output_file, fieldnames=self.csv_headers)
            csv_writer.writeheader()
            timer = 0
            widgets = ['Write to csv ',
                       progressbar.Bar(left='[', marker='#', right=']'),
                       progressbar.FormatLabel(' [%(value)i/%(max)i] ['),
                       progressbar.Percentage(),
                       progressbar.FormatLabel('] [%(elapsed)s] ['),
                       progressbar.ETA(), '] [',
                       progressbar.FileTransferSpeed(unit='lines'), ']']
            bar = progressbar.ProgressBar(widgets=widgets, maxval=self.num_results).start()
            for line in codecs.open(self.tmp_file, mode='r', encoding='utf-8'):
                timer += 1
                bar.update(timer)
                csv_writer.writerow(json.loads(line))
            output_file.close()
            bar.finish()
        else:
            print('There are no docs with the selected field(s): {}.'.format(','.join(self.opts.fields)))
        os.remove(self.tmp_file)

def getoutput(self):
    output = csv.StringIO()
    writer = csv.DictWriter(output, self.fieldnames, extrasaction='ignore', dialect=self.dialect)
    writer.writeheader()
    for ce in self.units:
        writer.writerow(ce.todict())
    return output.getvalue()

def export(self):
    '''
    Export all data into a CSV file. Will overwrite an existing file.
    '''
    with open(self.__csv_location, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, self.__csv_fields)
        writer.writeheader()
        for day in self.__db.all():
            writer.writerow(day)

def create_manifest(self):
    self.create_remote_path()

    total_folders = len(self._folders)
    folder_name_padding = len(str(total_folders))
    if folder_name_padding < 2:
        folder_name_padding = 2
    folder_num = 0

    filename = 'manifest.tsv'
    # list(...) so the concatenation also works on Python 3, where keys() is a view
    keys = ['path', 'parent', 'name', 'forceVersion'] + \
        list(FileMetadataWorker.DICOM_ANNOTATION_FIELDS.keys())

    with io.open(filename, 'w', encoding='utf8') as fp:
        csvWriter = csv.DictWriter(fp, keys, restval='', extrasaction='ignore', delimiter=u'\t')
        csvWriter.writeheader()

        for files in self._folders:
            folder_num += 1
            folder_path = (self._remote_path or '')

            if total_folders > 1:
                folder_name = str(folder_num).zfill(folder_name_padding)
                folder_path = os.path.join(folder_path, folder_name)

            self.create_folder_in_synapse(folder_path)

            for file_info in files:
                file_name = file_info["calculated_name"]
                file_full_local_path = file_info['full_path']
                full_synapse_path, synapse_parent, _ = self.to_synapse_path(
                    os.path.join(folder_path, file_name))

                logging.info('{0} -> {1}'.format(file_full_local_path, full_synapse_path))

                row = {
                    "path": file_full_local_path,
                    "parent": synapse_parent.id,
                    "forceVersion": True,
                    "name": file_name
                }

                for field_name in FileMetadataWorker.DICOM_ANNOTATION_FIELDS.keys():
                    row[field_name] = file_info.get('annotations', {}).get(field_name)

                csvWriter.writerow(row)

    logging.info('Manifest written to: {0}'.format(filename))

def _write_manifest_data(filename, keys, data):
    with io.open(filename, 'w', encoding='utf8') as fp:
        csvWriter = csv.DictWriter(fp, keys, restval='', extrasaction='ignore', delimiter='\t')
        csvWriter.writeheader()
        for row in data:
            csvWriter.writerow(row)

def test_write_simple_dict(self):
    with TemporaryFile("w+", newline='') as fileobj:
        writer = csv.DictWriter(fileobj, fieldnames=["f1", "f2", "f3"])
        writer.writeheader()
        fileobj.seek(0)
        self.assertEqual(fileobj.readline(), "f1,f2,f3\r\n")
        writer.writerow({"f1": 10, "f3": "abc"})
        fileobj.seek(0)
        fileobj.readline()  # header
        self.assertEqual(fileobj.read(), "10,,abc\r\n")

def test_write_fields_not_in_fieldnames(self):
    with TemporaryFile("w+", newline='') as fileobj:
        writer = csv.DictWriter(fileobj, fieldnames=["f1", "f2", "f3"])
        # Of special note is the non-string key (issue 19449)
        with self.assertRaises(ValueError) as cx:
            writer.writerow({"f4": 10, "f2": "spam", 1: "abc"})
        exception = str(cx.exception)
        self.assertIn("fieldnames", exception)
        self.assertIn("'f4'", exception)
        self.assertNotIn("'f2'", exception)
        self.assertIn("1", exception)

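# Illustrative sketch, not taken from any of the projects above: the ValueError exercised
# by the test is exactly what the manifest writers in this file avoid by passing
# extrasaction='ignore', which silently drops keys that are missing from fieldnames.
# The helper name _demo_extrasaction_ignore is hypothetical.
def _demo_extrasaction_ignore():
    import csv
    import io

    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=["f1", "f2"], restval='', extrasaction='ignore')
    writer.writeheader()
    # 'f3' is not in fieldnames: with extrasaction='ignore' it is dropped instead of
    # raising ValueError, and the missing 'f2' falls back to restval ('').
    writer.writerow({"f1": 1, "f3": "dropped"})
    return buf.getvalue()  # 'f1,f2\r\n1,\r\n'
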
def writeDictionaryToCSV(dict, fileName):
    with io.open(fileName, 'w', encoding="utf-8") as csv_file:
        fieldNames = HEADER_CSV_KEYS_ORDER
        writer = csv.DictWriter(csv_file, fieldnames=fieldNames)
        writer.writeheader()
        for key in dict.keys():
            # Mapping of (POSTNAME, [VOTES])
            votesArr = dict[key]
            print("writing", votesArr)
            for vote in votesArr:
                #encodedRow = [v.decode('utf8') if isinstance(v, str) else v for v in vote]
                writer.writerow(vote)

def serialize(self, out):
    output = csv.StringIO()
    writer = csv.DictWriter(output, FIELDNAMES, dialect="catkeys")
    # No real headers, the first line contains metadata
    writer.writerow(dict(zip(FIELDNAMES,
                             [self.header._header_dict[key] for key in FIELDNAMES_HEADER])))
    for unit in self.units:
        writer.writerow(unit.dict)
    out.write(output.getvalue().encode(self.encoding))

def serialize(self, out):
    # Check first if there is at least one translated unit
    translated_units = [u for u in self.units if u.istranslated()]
    if not translated_units:
        return

    output = csv.StringIO()
    writer = csv.DictWriter(output, fieldnames=OMEGAT_FIELDNAMES, dialect="omegat")
    for unit in translated_units:
        writer.writerow(unit.dict)
    out.write(output.getvalue().encode(self.encoding))

def serialize(self, out):
    # Check first if there is at least one translated unit
    translated_units = [u for u in self.units if u.istranslated()]
    if not translated_units:
        return

    output = csv.StringIO()
    writer = csv.DictWriter(output, fieldnames=self._fieldnames, dialect="utx")
    for unit in translated_units:
        writer.writerow(unit.dict)
    result = output.getvalue().encode(self.encoding)
    out.write(self._write_header().encode(self.encoding))
    out.write(result)

def serialize(self, out):
    # Check first if there is at least one translated unit
    translated_units = [u for u in self.units if u.istranslated()]
    if not translated_units:
        return

    output = csv.StringIO()
    writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="wordfast")
    # No real headers, the first line contains metadata
    self.header.tucount = len(translated_units)
    writer.writerow(dict(zip(WF_FIELDNAMES,
                             [self.header.header[key] for key in WF_FIELDNAMES_HEADER])))
    for unit in translated_units:
        writer.writerow(unit.dict)
    out.write(output.getvalue().encode(self.encoding))

def test_write_multiple_dict_rows(self):
    fileobj = StringIO()
    writer = csv.DictWriter(fileobj, fieldnames=["f1", "f2", "f3"])
    writer.writeheader()
    self.assertEqual(fileobj.getvalue(), "f1,f2,f3\r\n")
    writer.writerows([{"f1": 1, "f2": "abc", "f3": "f"},
                      {"f1": 2, "f2": 5, "f3": "xyz"}])
    self.assertEqual(fileobj.getvalue(),
                     "f1,f2,f3\r\n1,abc,f\r\n2,5,xyz\r\n")

def write_to_csv(self):
    if self.num_results > 0:
        self.num_results = sum(1 for line in codecs.open(self.tmp_file, mode='r', encoding='utf-8'))
        if self.num_results > 0:
            output_file = codecs.open(self.output_file, mode='a', encoding='utf-8')
            csv_writer = csv.DictWriter(output_file, fieldnames=self.csv_headers)
            csv_writer.writeheader()
            timer = 0
            for line in codecs.open(self.tmp_file, mode='r', encoding='utf-8'):
                timer += 1
                csv_writer.writerow(json.loads(line))
            output_file.close()
        else:
            print('There are no docs with the selected field(s): {}.'.format(','.join(self.opts['fields'])))
        os.remove(self.tmp_file)

def write_to_csv(self):
    csv.register_dialect(name=u"export",
                         delimiter=self.opts.delimiter.decode('utf-8'),
                         quotechar=self.opts.quotechar.decode('utf-8'),
                         doublequote=True,
                         skipinitialspace=False,
                         lineterminator=u'\r\n',
                         quoting=csv.QUOTE_ALL)

    if self.num_results > 0:
        self.num_results = sum(1 for line in codecs.open(self.tmp_file, mode='r', encoding='utf-8'))
        if self.num_results > 0:
            output_file = codecs.open(self.opts.output_file, mode='a', encoding='utf-8')
            csv_writer = csv.DictWriter(output_file, fieldnames=self.csv_headers, dialect="export")
            csv_writer.writeheader()
            timer = 0
            widgets = ['Write to csv ',
                       progressbar.Bar(left='[', marker='#', right=']'),
                       progressbar.FormatLabel(' [%(value)i/%(max)i] ['),
                       progressbar.Percentage(),
                       progressbar.FormatLabel('] [%(elapsed)s] ['),
                       progressbar.ETA(), '] [',
                       progressbar.FileTransferSpeed(unit='lines'), ']']
            bar = progressbar.ProgressBar(widgets=widgets, maxval=self.num_results).start()
            for line in codecs.open(self.tmp_file, mode='r', encoding='utf-8'):
                timer += 1
                bar.update(timer)
                csv_writer.writerow(json.loads(line))
            output_file.close()
            bar.finish()
        else:
            print('There are no docs with the selected field(s): {}.'.format(','.join(self.opts.fields)))
        os.remove(self.tmp_file)

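# Illustrative sketch, not taken from the project above: registering a named dialect and
# handing that name to DictWriter, as the export function above does. The dialect name,
# field names, and helper name here are hypothetical; the csv calls are standard stdlib usage.
def _demo_registered_dialect():
    import csv
    import io

    csv.register_dialect('demo_export', delimiter=';', quotechar='"',
                         doublequote=True, skipinitialspace=False,
                         lineterminator='\r\n', quoting=csv.QUOTE_ALL)
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=['id', 'name'], dialect='demo_export')
    writer.writeheader()
    writer.writerow({'id': 1, 'name': 'example'})
    # QUOTE_ALL quotes every field, including numbers converted to strings.
    return buf.getvalue()  # '"id";"name"\r\n"1";"example"\r\n'
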
def anonymize_file(source, dest, csvheaderformatdict=None, ignorementions=False):
    print('Reading from [{0}] and writing anonymized data to [{1}]...'.format(source, dest))
    with io.open(source, 'r', encoding='utf8') as f:
        #with io.open(source, 'r') as f:
        with io.open(dest, 'w', encoding='utf8') as o:
            reader = csv.DictReader(f)
            if not csvheaderformatdict:
                fieldnames = consts.defaultHeader
            else:
                fieldnames = reader.fieldnames
            writer = csv.DictWriter(o, fieldnames)
            writer.writeheader()
            for row in reader:
                anonymize_row(row, fieldnames, csvheaderformatdict, ignorementions)
                writer.writerow(row)

def parse_all_apis():
    apis = load_unparsed_api_metadata()
    total_count = len(apis)
    a = "{} movie metadata were loaded!".format(total_count)
    print(a)
    with open("api_data.csv", "w") as f:
        header_was_written = False
        for i, api in enumerate(apis):
            #b = "Processing {} of {}: {}".format(i + 1, total_count, api['api_name'])
            #print(b)
            parsed_api = parse_one_api_metadata(api)
            w = csv.DictWriter(f, parsed_api.keys())
            if not header_was_written:
                w.writeheader()
                header_was_written = True
            try:
                w.writerow(parsed_api)
            except UnicodeEncodeError:
                print(parsed_api)

def people_to_qualtrics_csv(hub, repo_tools_data, frequency, update):
    """
    Print out a formatted file as expected by Qualtrics import.
    """
    if update is not None:
        with open(update, newline='', encoding='utf-8') as update_data:
            reader = csv.DictReader(update_data)
            initial = {row[EMAIL]: row for row in reader}
            fields = [field for field in reader.fieldnames if field]
    else:
        initial = {}
        fields = [NAME, EMAIL, WEEK, ASSOCIATED_WITH, UNSUBSCRIBED]

    csv_writer = csv.DictWriter(click.get_text_stream('stdout'), fieldnames=fields, extrasaction='ignore')
    csv_writer.writeheader()

    for username, person in repo_tools_data.people.iteritems():
        if person.email is None:
            continue
        hashdigest = hashlib.md5(person.email.lower()).hexdigest()
        row = initial.get(person.email, {})
        row.update({
            NAME: person.name,
            EMAIL: person.email,
            WEEK: int(hashdigest, 16) % frequency + 1,
            ASSOCIATED_WITH: 'edX' if person.associated_with('edX', 'ArbiSoft') else 'other',
        })
        if not person.email_ok:
            row[UNSUBSCRIBED] = 'true'
        csv_writer.writerow(row)

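# Illustrative sketch, not taken from the project above: the WEEK value written above is a
# stable hash-based bucket. Hashing the lowercased email and reducing it modulo `frequency`
# spreads people roughly evenly across weeks while keeping each person's week constant
# between runs. The helper name is hypothetical; it encodes the email explicitly so it also
# runs on Python 3 (the original is Python 2 code).
def _demo_week_bucket(email, frequency=4):
    import hashlib

    digest = hashlib.md5(email.lower().encode('utf-8')).hexdigest()
    return int(digest, 16) % frequency + 1  # a value in 1..frequency
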
async def on_ready(self):
    print('Logged on to discord as {0}!'.format(self.user))
    print('Attempting to query discord channel ID {0}...'.format(self.exportChannelID))
    channel = self.get_channel(int(self.exportChannelID))
    if channel:
        with io.open(self.exportCSV, 'w', encoding='utf8') as o:
            print('Exporting anonymized data to [{0}]...'.format(self.exportCSV))
            fieldnames = consts.defaultHeader
            writer = csv.DictWriter(o, fieldnames)
            writer.writeheader()
            print('Parsing channel history...')
            async for message in channel.history(limit=self.exportChannelLimit,
                                                 after=self.exportChannelAfter):
                # TODO: Do something with attachments and reactions
                row = {
                    'AuthorID': message.author.id,
                    'Author': format(message.author),
                    'Date': message.created_at,
                    'Content': message.content,
                    'Attachments': len(message.attachments),
                    'Reactions': len(message.reactions)
                }
                # Replace all mentions with name#discriminator format
                # TODO: Optimize this iteration, also combining it with the anonymization (callback?)
                for user in message.mentions:
                    row['Content'] = row['Content'].replace(
                        '<@!{0}>'.format(user.id),
                        '@{0}#{1}'.format(user.name, user.discriminator))
                anonymize_row(row, fieldnames)
                writer.writerow(row)
    # Using await should wait until any previous async finishes
    print('Disconnecting from discord')
    await self.logout()

def JLwriteLinkedResults(clustered_pairs, input_1, input_2, output_file, inner_join=False):
    if not inner_join:
        raise Exception("Only expected to be used with inner_join.")
    logging.info('saving unique results to: %s' % output_file)
    writer = csv.DictWriter(output_file, fieldnames=['roil', 'outside', 'confidence'])
    writer.writeheader()
    get_num = lambda x: x.split('|')[1]
    for record in clustered_pairs:
        pair, confidence = record
        outside = get_num(pair[0])
        roil = get_num(pair[1])
        writer.writerow({
            'roil': roil,
            'outside': outside,
            'confidence': str(confidence),
        })

def render(self, data, media_type=None, renderer_context=None):
    response = renderer_context.get('response', None)
    if response is not None and response.exception:
        return None
        # TODO: Replace return None with commented lines below to get human-readable description in response
        #
        # Returning a non-None value in this function, as part of the exception handling code path in Django REST
        # Framework, generates an HTTP response that Chrome rejects as invalid (ERR_INVALID_RESPONSE). Possible
        # Django REST Framework bug?
        #
        # fieldnames = data.keys()
        # rows = [data]
    else:
        fieldnames = data['fieldnames']
        rows = data['rowdicts']
    buff = io.StringIO()
    writer = csv.DictWriter(buff, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)
    return buff.getvalue().encode(self.charset)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--batchsize", "-b", type=int, default=8)
    parser.add_argument("input")
    parser.add_argument("output")
    args = parser.parse_args()
    countdown = args.batchsize
    with io.open(args.input, 'r', encoding='utf-8') as input:
        books_reader = csv.DictReader(input)
        with io.open(args.output, 'w', encoding='utf-8') as output:
            books_writer = csv.DictWriter(output, fieldnames)
            books_writer.writeheader()
            for row in books_reader:
                if countdown > 0 and not row.get('webchecked', None):
                    isbn = str(row.get('ISBN', None))
                    if len(isbn) == 9:
                        isbn = "0" + isbn
                    if isbn:
                        countdown = countdown - 1
                        new_isbn = isbnlib.to_isbn13(isbnlib.canonical(isbn))
                        if new_isbn is None or new_isbn == "":
                            print "Could not canonicalize isbn", isbn
                        else:
                            row['ISBN'] = new_isbn
                        details = None
                        try:
                            details = isbnlib.meta(isbn)
                        except isbnlib.dev._exceptions.NoDataForSelectorError:
                            print "No data for ISBN", isbn, "title", row.get('Title', "Unknown")
                            row['webchecked'] = "No data for ISBN"
                        except isbnlib._exceptions.NotValidISBNError:
                            print "Invalid ISBN", isbn, "for", row['Title']
                            row['webchecked'] = "Invalid ISBN"
                        except isbnlib.dev._exceptions.ISBNNotConsistentError:
                            print "Inconsistent data for", row['Title']
                            row['webchecked'] = "Inconsistent ISBN data"
                        if details:
                            if details.get('ISBN-13', "") != "" and row.get('ISBN', "") == "":
                                row['ISBN'] = details['ISBN-13']
                            if 'Authors' in row:
                                row['Authors'] = row['Authors'].split('/')
                            old_title = row['Title']
                            web_title = details['Title']
                            if old_title != web_title:
                                old_canon = canonicalize_title(old_title)
                                web_canon = canonicalize_title(web_title)
                                old_len = len(old_canon)
                                web_len = len(web_canon)
                                if ((web_len > old_len and old_canon in web_canon)
                                        or (web_len == old_len and old_canon == web_canon)):
                                    print "Title improvement from", old_title, "to", web_title
                                else:
                                    print "Title discrepancy:", old_title, "in file,", web_title, "found online"
                                    details['Title'] = old_title
                            # don't use 'update', because we don't want to drag in random other
                            # fields that dictwriter will then object to
                            for key in fieldnames:
                                if key in details:
                                    row[key] = details[key]
                            if 'Authors' in row:
                                row['Authors'] = '/'.join(row['Authors'])
                            row['webchecked'] = "OK"
                # from https://docs.python.org/2/library/csv.html
                encoded_row = {k: (v.encode("utf-8") if isinstance(v, basestring) else v)
                               for k, v in row.iteritems()}
                books_writer.writerow(encoded_row)  # write the UTF-8-encoded row, not the raw one

def test_create_and_update_file_view():
    ## Create a folder
    folder = Folder(str(uuid.uuid4()), parent=project, description='creating a file-view')
    folder = syn.store(folder)

    ## Create dummy file with annotations in our folder
    path = utils.make_bogus_data_file()
    file_annotations = dict(fileFormat='jpg', dataType='image', artist='Banksy',
                            medium='print', title='Girl With Ballon')
    schedule_for_cleanup(path)
    a_file = File(path, parent=folder, annotations=file_annotations)
    a_file = syn.store(a_file)
    schedule_for_cleanup(a_file)

    # Add new columns for the annotations on this file and get their IDs
    my_added_cols = [syn.store(synapseclient.Column(name=k, columnType="STRING"))
                     for k in file_annotations.keys()]
    my_added_cols_ids = [c['id'] for c in my_added_cols]
    view_default_ids = [c['id'] for c in syn._get_default_entity_view_columns('file')]
    col_ids = my_added_cols_ids + view_default_ids
    scopeIds = [folder['id'].lstrip('syn')]

    ## Create an empty entity-view with defined scope as folder
    entity_view = EntityViewSchema(name=str(uuid.uuid4()),
                                   scopeIds=scopeIds,
                                   addDefaultViewColumns=True,
                                   addAnnotationColumns=False,
                                   type='file',
                                   columns=my_added_cols,
                                   parent=project)
    entity_view = syn.store(entity_view)
    schedule_for_cleanup(entity_view)

    assert_equals(set(scopeIds), set(entity_view.scopeIds))
    assert_equals(set(col_ids), set(entity_view.columnIds))
    assert_equals('file', entity_view.type)

    ## get the current view-schema
    view = syn.tableQuery("select * from %s" % entity_view.id)
    schedule_for_cleanup(view.filepath)

    view_dict = list(csv.DictReader(io.open(view.filepath, encoding="utf-8", newline='')))

    # check that all of the annotations were retrieved from the view
    assert set(file_annotations.keys()).issubset(set(view_dict[0].keys()))

    updated_a_file = syn.get(a_file.id, downloadFile=False)

    # Check that the values are the same as what was set
    # Both in the view and on the entity itself
    for k, v in file_annotations.items():
        assert_equals(view_dict[0][k], v)
        assert_equals(updated_a_file.annotations[k][0], v)

    # Make a change to the view and store
    view_dict[0]['fileFormat'] = 'PNG'

    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as temp:
        schedule_for_cleanup(temp.name)
        temp_filename = temp.name

    with io.open(temp_filename, mode='w', encoding="utf-8", newline='') as temp_file:
        dw = csv.DictWriter(temp_file,
                            fieldnames=view_dict[0].keys(),
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator=str(os.linesep))
        dw.writeheader()
        dw.writerows(view_dict)
        temp_file.flush()

    new_view = syn.store(synapseclient.Table(entity_view.id, temp_filename))
    new_view_dict = list(csv.DictReader(io.open(temp_filename, encoding="utf-8", newline='')))
    assert_equals(new_view_dict[0]['fileFormat'], 'PNG')

    #query for the change
    start_time = time.time()

    new_view_results = syn.tableQuery("select * from %s" % entity_view.id)
    schedule_for_cleanup(new_view_results.filepath)
    new_view_dict = list(csv.DictReader(io.open(new_view_results.filepath,
                                                encoding="utf-8", newline='')))
    #query until change is seen.
    while new_view_dict[0]['fileFormat'] != 'PNG':
        #check timeout
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        #query again
        new_view_results = syn.tableQuery("select * from %s" % entity_view.id)
        new_view_dict = list(csv.DictReader(io.open(new_view_results.filepath,
                                                    encoding="utf-8", newline='')))
    #paranoid check
    assert_equals(new_view_dict[0]['fileFormat'], 'PNG')

def write_header(self, header_columns):
    self.internal_writer = csv.DictWriter(self.stdout,
                                          fieldnames=header_columns,
                                          quoting=csv.QUOTE_ALL)
    self.internal_writer.writeheader()