def export_aliases(project, fh):
    """ Dump a list of all entity names to a CSV file. The table will
    contain the active name of each entity, and one of the other
    existing names as an alias. """
    out = DictWriter(fh, ['entity_id', 'schema', 'alias', 'canonical'])
    out.writeheader()
    # Two aliases over the same property table: one matches the alias
    # name rows, the other the currently active (canonical) name row.
    alias_prop = aliased(Property)
    canon_prop = aliased(Property)
    schema_tbl = aliased(Schema)
    query = db.session.query(
        alias_prop.value_string.label('alias'),
        alias_prop.entity_id,
    )
    query = query.join(Entity).join(schema_tbl).join(canon_prop)
    query = query.filter(Entity.project_id == project.id)
    query = query.filter(alias_prop.entity_id != None)  # noqa
    query = query.filter(alias_prop.name == 'name')
    query = query.filter(canon_prop.name == 'name')
    query = query.filter(canon_prop.active == True)  # noqa
    query = query.add_columns(
        canon_prop.value_string.label('canonical'),
        schema_tbl.name.label('schema'),
    )
    for record in query.all():
        out.writerow({
            'entity_id': str(record.entity_id),
            'schema': record.schema,
            'alias': record.alias,
            'canonical': record.canonical,
        })
def from_files(basedir="/Users/rikhoekstra/surfdrive/Shared/Documents/NIOD2017/International_MIgration",
               outfile='wileyrecs.csv'):
    """Parse Wiley-style bibliographic record dumps into dicts and a CSV.

    Each file under *basedir* holds records separated by blank lines;
    the chunk before the first blank line is a header and is skipped.
    Record lines look like "AU - Some Author"; only the AU, TI, PY and
    JO tags are kept.

    :param basedir: directory of input files (was hard-coded; now a
        backward-compatible default).
    :param outfile: path of the CSV written (was hard-coded).
    :returns: list of record dicts (keys: subset of AU/TI/PY/JO).
    """
    wanted = ('AU', 'TI', 'PY', 'JO')
    result = []
    for fl in os.listdir(basedir):
        # Context manager: the original leaked the file handle; 'rU'
        # mode (removed in Python 3.11) replaced by plain 'r', which
        # already does universal newlines on Python 3.
        with open(os.path.join(basedir, fl), 'r') as infl:
            txt = infl.read()
        for r in txt.split("\n\n")[1:]:
            res = {}
            for line in r.split('\n'):
                item = line.split(' - ')
                if len(item) > 1 and item[0].strip() in wanted:
                    res[item[0].strip()] = item[1].strip()
            result.append(res)
    with open(outfile, 'w') as flout:
        w = DictWriter(flout, ['AU', 'TI', 'PY', 'JO'])
        w.writeheader()
        w.writerows(result)
        print('written: ', flout.name)
    return result
def export_aliases(project, fh):
    """ Dump a list of all entity names to a CSV file. The table will
    contain the active name of each entity, and one of the other
    existing names as an alias. """
    writer = DictWriter(fh, ['entity_id', 'alias', 'canonical'])
    writer.writeheader()
    # alias_p matches any name property row, canon_p the active one.
    alias_p = aliased(EntityProperty)
    canon_p = aliased(EntityProperty)
    query = (db.session.query(alias_p.value_string.label('alias'),
                              alias_p.entity_id)
             .join(Entity)
             .join(canon_p)
             .filter(Entity.project_id == project.id)
             .filter(alias_p.entity_id != None)  # noqa
             .filter(alias_p.name == 'name')
             .filter(canon_p.name == 'name')
             .filter(canon_p.active == True)  # noqa
             .add_columns(canon_p.value_string.label('canonical')))
    for row in query.all():
        writer.writerow({
            'entity_id': str(row.entity_id),
            'alias': row.alias,
            'canonical': row.canonical,
        })
def main():
    """Copy the first --count data rows of a CSV into a sibling file
    named <root>_trimmed_<count><ext>, then print the result.

    Fixes: the Python-2-only ``type=file`` builtin is replaced by
    ``argparse.FileType`` (same ``.name`` attribute); ``print open(...)``
    was a Python 3 syntax error; the unicodecsv-specific
    ``unicode_fieldnames`` becomes the portable ``fieldnames``; input
    handles are closed.
    """
    prs = argparse.ArgumentParser()
    prs.add_argument('--count', type=int, default=100)
    # FileType validates that the path exists/opens, like `file` did.
    prs.add_argument('file', type=argparse.FileType('r'))
    args = prs.parse_args()
    count = args.count
    if count <= 0:
        # `assert` is stripped under -O; fail loudly instead.
        raise SystemExit('--count must be positive')
    path = os.path.abspath(args.file.name)
    args.file.close()  # only the path was needed
    root, ext = os.path.splitext(path)
    new_path = '%s_trimmed_%s%s' % (root, count, ext)
    with open(path) as src:
        reader = DictReader(src)
        new_entries = [next(reader) for _ in range(count)]
        fieldnames = reader.fieldnames
    with open(new_path, 'w') as new_file:
        writer = DictWriter(new_file, fieldnames)
        writer.writeheader()
        writer.writerows(new_entries)
    print(open(new_path).read())
def edr_export(self, request):
    """Django-admin view: export the selected EDRPOU company records as
    a CSV download.

    Record ids come from the "iswear" checkboxes in the POSTed form;
    for each one the document id is read from the matching
    "company_<id>_id" field and fetched from the EDRPOU index.
    Redirects back to the search page with a message when nothing
    matched; otherwise returns a text/csv attachment response.
    """
    data = []
    for rec_id in request.POST.getlist("iswear"):
        meta_id = request.POST.get("company_%s_id" % rec_id)
        res = EDRPOU.get(id=meta_id)
        if res:
            rec = res.to_dict()
            # Founders may be a list; flatten it into a single CSV cell
            # using ";;;" as the internal separator.
            if isinstance(rec.get("founders"), list):
                rec["founders"] = ";;;".join(rec["founders"])
            data.append(rec)
    if not data:
        # "Нічого експортувати" = "Nothing to export" (Ukrainian).
        self.message_user(request, "Нічого експортувати")
        return redirect(reverse("admin:edr_search"))
    fp = StringIO()
    # Header fields come from the first record; assumes all records
    # share the same key set — TODO confirm against EDRPOU.to_dict().
    w = DictWriter(fp, fieldnames=data[0].keys())
    w.writeheader()
    w.writerows(data)
    payload = fp.getvalue()
    fp.close()
    response = HttpResponse(payload, content_type="text/csv")
    # Timestamped filename, e.g. edr_20240101_1230.csv
    response[
        "Content-Disposition"] = "attachment; filename=edr_{:%Y%m%d_%H%M}.csv".format(
            datetime.datetime.now())
    response["Content-Length"] = len(response.content)
    return response
def find_all_matching_officers(min_percentage=0.75):
    """Stream person -> officer fingerprint matches to stdout as CSV,
    with a progress counter on stderr.

    Each officer id is reported at most once: matched ids are added to
    an exclusion set so later persons cannot claim them again.
    """
    seen_officers = set()
    fingerprints = get_all_officer_fingerprints()
    sys.stderr.write("\nFinding matches...\n")
    columns = ['Full Name (from persons)', 'officer_id (from npo_officers)']
    out = DictWriter(sys.stdout, columns)
    out.writeheader()
    for idx, person in enumerate(gdocs_persons()):
        fp = make_fingerprint(person['Full Name'])
        hits = find_matching_officers(
            fp,
            fingerprints,
            excluded_ids=seen_officers,
            min_percentage=min_percentage,
        )
        for hit in hits:
            out.writerow({
                'Full Name (from persons)': person['Full Name'],
                'officer_id (from npo_officers)': hit,
            })
        seen_officers.update(hits)
        sys.stderr.write("\r%d" % (idx + 1))
        sys.stderr.flush()
    sys.stderr.write("\nDone\n")
def handle(self, *args, **options):
    """Management-command entry point: dump the completed HITs of a
    template to a CSV file.

    Expects exactly two positional args:
    (<template_file_path>, <results_csv_file_path>).
    Raises CommandError on bad usage and exits via sys.exit() when the
    template is missing or has no completed HITs.
    """
    if len(args) != 2:
        raise CommandError(
            'usage: python manage.py dump_results '
            '<template_file_path> '
            '<results_csv_file_path>'
        )
    # Get paths from args, and normalize them to absolute paths:
    template_file_path, results_csv_file_path = map(os.path.abspath, args)
    try:
        # Templates are stored keyed on their absolute file path.
        template = HitTemplate.objects.get(name=template_file_path)
    except ObjectDoesNotExist:
        sys.exit('There is no matching <template_file_path>.')
    completed_hits = template.hit_set.filter(completed=True)
    if not completed_hits.exists():
        sys.exit('There are no completed HITs.')
    # results_data() returns the CSV header names plus the row dicts.
    fieldnames, rows = results_data(completed_hits)
    # NOTE(review): 'wb' implies Python 2 csv semantics; on Python 3
    # this would need open(..., 'w', newline='') — confirm runtime.
    with open(results_csv_file_path, 'wb') as fh:
        writer = DictWriter(fh, fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
def find_all_matching_officers(min_percentage=0.75):
    """Emit a CSV (to stdout) pairing each person with the officer ids
    whose fingerprints match; progress goes to stderr.

    An officer id is claimed by at most one person: matched ids join
    the exclusion set passed to subsequent lookups.
    """
    matched = set()
    officer_fingerprints = get_all_officer_fingerprints()
    sys.stderr.write("\nFinding matches...\n")
    name_col = 'Full Name (from persons)'
    id_col = 'officer_id (from npo_officers)'
    writer = DictWriter(sys.stdout, [name_col, id_col])
    writer.writeheader()
    progress = 0
    for record in gdocs_persons():
        progress += 1
        full_name = record['Full Name']
        ids = find_matching_officers(make_fingerprint(full_name),
                                     officer_fingerprints,
                                     excluded_ids=matched,
                                     min_percentage=min_percentage)
        for oid in ids:
            writer.writerow({name_col: full_name, id_col: oid})
        matched.update(ids)
        sys.stderr.write("\r%d" % progress)
        sys.stderr.flush()
    sys.stderr.write("\nDone\n")
def write(self, file_name, row):
    """Thread-safely append *row* to the CSV for *file_name*, opening
    the file and writing the header on first use."""
    with self.lock:
        if file_name not in self.fhs:
            handle = open(make_path(file_name), 'wb')
            self.fhs[file_name] = handle
            # Column order comes from the first row seen for this file.
            writer = DictWriter(handle, row.keys())
            writer.writeheader()
            self.writers[file_name] = writer
        self.writers[file_name].writerow(row)
def write(self, file_name, row):
    """Serialised CSV append; lazily creates one file/writer pair per
    file name, emitting the header when the file is first opened."""
    with self.lock:
        known = file_name in self.fhs
        if not known:
            fh = open(make_path(file_name), 'wb')
            self.fhs[file_name] = fh
            # First row defines the column set for this file.
            self.writers[file_name] = DictWriter(fh, row.keys())
            self.writers[file_name].writeheader()
        self.writers[file_name].writerow(row)
def mappings_export(file, decided):
    """Export mappings to a CSV file."""
    writer = DictWriter(file, fieldnames=['left', 'right', 'judgement'])
    writer.writeheader()
    # writerows over a generator: one row per stored mapping decision.
    writer.writerows(
        {'left': m.left_uid, 'right': m.right_uid, 'judgement': m.judgement}
        for m in Mapping.find_by_decision(decided)
    )
def export_aliases(project, path):
    """ Dump a list of all entity names to a CSV file. The table will
    contain the active name of each entity, and one of the other
    existing names as an alias. """
    fields = ['entity_id', 'alias', 'canonical', 'schemata']
    with open(path, 'w') as fh:
        writer = DictWriter(fh, fields)
        writer.writeheader()
        # Only canonical entities (no same_as redirect) are exported.
        query = Entity.all().filter_by(same_as=None)
        query = query.filter(Entity.project == project)
        for idx, entity in enumerate(query):
            export_entity(entity, writer)
            if idx % 100 == 0:
                log.info("Dumped %s entity names...", idx)
def export_aliases(project, path):
    """ Dump a list of all entity names to a CSV file. The table will
    contain the active name of each entity, and one of the other
    existing names as an alias. """
    with open(path, 'w') as out:
        columns = ['entity_id', 'alias', 'canonical', 'schemata']
        writer = DictWriter(out, columns)
        writer.writeheader()
        # Skip merged entities; only canonical records are dumped.
        entities = Entity.all().filter_by(same_as=None).filter(
            Entity.project == project)
        count = 0
        for entity in entities:
            export_entity(entity, writer)
            if count % 100 == 0:
                log.info("Dumped %s entity names...", count)
            count += 1
def write_csv(self, outputdir, timestamp, items=None):
    """Serialise this collection to a CSV file under *outputdir*.

    The file name is produced by self.filename() from the timestamp and
    filter kwargs; when *items* is None the rows come from
    self.get_items(). Returns self to allow chaining.
    """
    target = os.path.join(
        outputdir, self.filename('csv', timestamp, **self.filter_kwargs))
    rows = self.get_items() if items is None else items
    with open(target, 'w') as handle:
        writer = DictWriter(handle, self.get_fields())
        writer.writeheader()
        writer.writerows(rows)
    return self
def load_data(self, options):
    """Query LDAP users (grouped/paged according to settings) and yield
    them one at a time.

    When the '__save_data__' setting is truthy, full records are
    requested and every user is additionally dumped to
    ./saved_data/ldap.csv for offline inspection.
    """
    save_data = self.settings.get("__save_data__", False)
    if save_data:
        # Full records so the CSV captures every attribute.
        options['full_record'] = True
        try:
            os.makedirs("./saved_data")
            LOG.info("Saving data to %s.", os.path.abspath("./saved_data"))
        except OSError as exc:
            # An already-existing directory is fine; anything else is fatal.
            if exc.errno == errno.EEXIST and os.path.isdir("./saved_data"):
                pass
            else:
                raise
    # Protocol version '2' uses the plain queries; otherwise the paged
    # variants. Group queries apply when a group DN is configured.
    if self.settings['protocol_version'] == '2':
        if self.settings['group_dn']:
            users = self.query_group(options)
        else:
            users = self.query_objects(options)
    else:
        if self.settings['group_dn']:
            users = self.query_group_paged(options)
        else:
            users = self.query_objects_paged(options)
    if save_data:
        data = []
        keys = set()
        for user in users:
            # Note: Not all user dicts contain all the fields. So, need to loop over
            # all the users to make sure we don't miss any fields.
            keys.update(user.keys())
            data.append(user)
        # Mapped query fields come first, then a marker column, then
        # any extra attributes that showed up in the records.
        used_keys = set(self.ldap_query_fields)
        unused_keys = set(keys) - used_keys
        if unused_keys:
            keys = sorted(used_keys) + ['unmapped ->'] + sorted(unused_keys)
        else:
            keys = sorted(used_keys)
        with open('./saved_data/ldap.csv', 'w') as save_file:
            writer = DictUnicodeWriter(save_file, keys)
            writer.writeheader()
            writer.writerows(data)
        # The query iterator was consumed above; yield from the list.
        users = data
    for user in users:
        yield user
def write_csv(self, outputdir, timestamp, items=None):
    """Write the item rows of this object as a CSV file in *outputdir*
    and return self.

    File name comes from self.filename('csv', timestamp, **filter_kwargs);
    rows default to self.get_items() when *items* is not given.
    """
    name = self.filename('csv', timestamp, **self.filter_kwargs)
    path = os.path.join(outputdir, name)
    if items is None:
        items = self.get_items()
    with open(path, 'w') as csvfile:
        out = DictWriter(csvfile, self.get_fields())
        out.writeheader()
        for entry in items:
            out.writerow(entry)
    return self
def export_csv_table(archive, model, name):
    """Dump every row of *model* to '<name>.csv' and upload the file to
    the archive; the local copy is removed once the upload succeeds."""
    path = os.path.join(_make_export_path(), '%s.csv' % name)
    log.info("Exporting CSV to %s...", path)
    with open(path, 'w') as handle:
        csv_writer = None
        for record in session.query(model):
            row = record.to_row()
            # The header is derived from the first row's keys.
            if csv_writer is None:
                csv_writer = DictWriter(handle, row.keys())
                csv_writer.writeheader()
            csv_writer.writerow(row)
    if archive.upload_file(path, mime_type='text/csv') is not None:
        os.unlink(path)
def convert_file(file_path):
    """Convert one Wikidata vandalism-corpus XML dump to a CSV file,
    one row per revision.

    The output path mirrors the input path with '/wdvc16' replaced by
    '/processed_wdvc16' and '.xml' replaced by '.csv'. (Python 2 code:
    print statements.)
    """
    result_path = file_path.replace('/wdvc16', '/processed_wdvc16').replace('.xml', '.csv')
    print 'writing to %s...' % result_path
    # stream_pages() yields page chunks lazily so the whole dump never
    # has to fit in memory — presumably; confirm against its definition.
    xml_pages = stream_pages(file_path)
    fieldnames = [u'revision_id', u'revisions_in_group', u'revision_comment',
                  u'revision_timestamp', u'page_id', u'page_group',
                  u'page_ns', u'page_title', u'anonimous_ip', u'user_id',
                  u'username']
    with open(result_path, 'w') as csv_file:
        writer = DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        # tqdm only wraps the iterator to show a progress bar.
        for xml_page in tqdm(xml_pages):
            # parse_page() returns one row dict per revision in the page.
            pages = parse_page(xml_page)
            for page in pages:
                writer.writerow(page)
def writetocsv(xmlfile=None, month=None, year=None, outfile='../data/data.csv'):
    """Append incident rows parsed from *xmlfile* to *outfile* as CSV.

    Each <DATA> element becomes one row; child element tags become
    columns, plus the supplied month/year. The header is written from
    the first incident's keys.

    Fixes: the *outfile* parameter was silently ignored (the hard-coded
    '../data/data.csv' path was always opened); the Python-2-only
    ``raise Exception, "..."`` syntax is replaced by the call form that
    works on both 2 and 3; the output file is closed even on error.

    :raises Exception: when xmlfile, month or year is missing.
    """
    if xmlfile is None:
        raise Exception("No XML file passed")
    if month is None:
        raise Exception("No month passed")
    if year is None:
        raise Exception("No year passed")
    xmldata = etree.parse(xmlfile)
    csvwriter = None
    # BUG FIX: honour the outfile parameter.
    csvfile = open(outfile, 'a')
    try:
        for incident in xmldata.iter('DATA'):
            data = {'month': month, 'year': year}
            for field in incident.iterchildren():
                data[field.tag] = field.text
            if not csvwriter:
                csvwriter = DictWriter(csvfile, fieldnames=data.keys())
                csvwriter.writeheader()
            csvwriter.writerow(data)
    finally:
        csvfile.close()
def report(self):
    """Write a membership-matrix CSV: one row per board (plus the
    aggregate google/trello rosters), one boolean column per member."""
    trello_members, google_members, board_members = self.auditor.get_members()
    # Union of every roster defines the column set.
    everyone = set(trello_members) | set(google_members)
    for roster in board_members.values():
        everyone.update(roster)
    with open(self.filename, 'wb+') as fp:
        writer = DictWriter(fp, ['name'] + list(everyone))
        writer.writeheader()
        # The two aggregate rosters are reported as extra rows.
        board_members['google'] = google_members
        board_members['trello'] = trello_members
        for board, roster in board_members.items():
            cells = {person: (person in roster) for person in everyone}
            cells['name'] = board
            writer.writerow(cells)
def convert_pickled_pages_to_csv_dataset(in_file, out_pages_fpath, out_transitions_fpath, delete_in_file = True):
    """Convert a stream of pickled page objects into two CSV datasets:
    one row per page and one row per page transition.

    Two passes are made over *in_file*: the first only collects the
    union of column names (rows do not all share the same keys), the
    second writes the rows. Python 2 code: dict.viewkeys() and the
    unicodecsv DictWriter(encoding=...) keyword.
    """
    pages_columns = set()
    transitions_columns = set()
    # Pass 1: gather every column name used by any page/transition.
    with open(in_file, 'r') as inf:
        while True:
            try:
                page = pickle.load(inf)
                pages_columns.update(page.as_dict().viewkeys())
                for trans in page.transitions:
                    # Each page label becomes an indicator column on its
                    # outgoing transitions.
                    transitions_columns.update(trans.as_dict({ "FROM_LABEL__%s" % l : 1 for l in page.labels }).viewkeys())
            except EOFError:
                # pickle.load signals end-of-stream with EOFError.
                break
    # Pass 2: re-read the pickles and emit the CSV rows.
    with open(out_pages_fpath, 'w') as pages_f, \
            open(out_transitions_fpath, 'w') as trans_f:
        pages_writer = DictWriter(pages_f, sorted(pages_columns), encoding = 'utf8')
        pages_writer.writeheader()
        trans_writer = DictWriter(trans_f, sorted(transitions_columns), encoding = 'utf8')
        trans_writer.writeheader()
        with open(in_file, 'r') as inf:
            while True:
                try:
                    page = pickle.load(inf)
                    pages_writer.writerow(page.as_dict())
                    for trans in page.transitions:
                        trans_writer.writerow(trans.as_dict({ "FROM_LABEL__%s" % l : 1 for l in page.labels }))
                except EOFError:
                    break
    # Optionally remove the (potentially large) intermediate pickle file.
    if delete_in_file:
        os.remove(in_file)
def parse_file(xml_file):
    """Convert one Wikidata vandalism-corpus XML dump to CSV, writing
    one row per revision. (Python 2 code: print statements.)"""
    print 'converting %s to csv' % xml_file
    # csv file name: mirror the input path under converted_wdvc16.
    new_file_path = xml_file.replace('wdvc16', 'converted_wdvc16').replace('.xml', '.csv')
    print 'writing to %s' % new_file_path
    # page by page generator of the xml file
    xml_file_by_pages = page_stream_generator(xml_file)
    # columns
    columns = [u'page_title', u'page_ns', u'page_id', u'revision_id',
               u'revision_timestamp', u'revision_comment',
               u'revision_model', u'revision_format', u'revision_count',
               u'username', u'user_id', u'ip_address']
    with open(new_file_path, 'w') as csv_file:
        writer = DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()
        for xml_page in xml_file_by_pages:
            # parse_page() returns one dict per revision in the page.
            revisions_in_page = parse_page(xml_page)
            for page in revisions_in_page:
                writer.writerow(page)
class SplitCSVPipeline(object):
    """Scrapy pipeline that routes Link items to links.csv and Result
    items to results.csv, one writer per file."""

    def open_spider(self, spider):
        # One file/writer pair per item type; headers written up front.
        self.links_file = open('links.csv', 'wb')
        self.links_writer = DictWriter(self.links_file,
                                       ['source', 'destination'])
        self.links_writer.writeheader()
        self.results_file = open('results.csv', 'wb')
        self.results_writer = DictWriter(self.results_file,
                                         ['url', 'status', 'next'])
        self.results_writer.writeheader()

    def close_spider(self, spider):
        self.results_file.close()
        self.links_file.close()

    def process_item(self, item, spider):
        # Dispatch by item type; items pass through unchanged.
        if isinstance(item, Link):
            self.links_writer.writerow(item)
        if isinstance(item, Result):
            self.results_writer.writerow(item)
        return item
"companyName": companyName, "industries": industries, "totalViews": totalViews, "url": url } totalJobs.append(item) except: pass if __name__ == '__main__': """ Provide the location and companyName """ location = "New York" companyName = "Airbnb" getjobs(companyName, location) print "Total jobs got ", len(totalJobs) file = open('finalData.csv', 'wb') fields = [ 'jobTitle', 'companyName', 'location', 'postedTime', 'totalViews', 'jobDescription', 'industries', 'employmentType', 'experience', 'employmentType', 'jobFunctions', 'url' ] csvfile = DictWriter(file, fieldnames=fields, quoting=QUOTE_ALL, encoding="utf-8") csvfile.writeheader() for i in totalJobs: csvfile.writerow(i) file.close()
def write_csv(fieldnames, rows, path):
    """Write *rows* (dicts) to *path* with *fieldnames* as the header."""
    with open(path, 'wb') as out:
        writer = DictWriter(out, fieldnames)
        writer.writeheader()
        writer.writerows(rows)
from core.models import Person2Company, Company2Company from unicodecsv import DictWriter from django.utils.translation import activate from django.conf import settings from collections import Counter from tqdm import tqdm activate(settings.LANGUAGE_CODE) with open("/tmp/positions.csv", "w") as fp: w = DictWriter(fp, fieldnames=["person", "relation", "company", "url"]) w.writeheader() for p2c in tqdm(Person2Company.objects.all().select_related( "from_person", "to_company").nocache().iterator()): w.writerow({ "person": p2c.from_person.full_name, "relation": p2c.relationship_type, "company": p2c.to_company.name, "url": "https://pep.org.ua{}".format(p2c.from_person.get_absolute_url()), }) with open("/tmp/relations.csv", "w") as fp: w = DictWriter(fp, fieldnames=[ "company1", "relation", "back_relation", "company2",
page('#parent-fieldname-text').text(), page('a.email').text(), page('#parent-fieldname-contactPhone').text(), page('a.email').attr('href').replace('mailto:', ''), page('#parent-fieldname-eventUrl').attr('href') or '' ))), ('meta_submitter_email', '*****@*****.**'), ))) with open("events.p", "wb") as dumpfile: pickle.dump(events, dumpfile) # Write output with open('output.csv', 'w') as csvfile: writer = DictWriter(csvfile, fieldnames=events[0].keys()) writer.writeheader() for event in events: writer.writerow(event) # Submit events for event in events: util.submit_event( email=event['meta_submitter_email'], title=event['title'], description=event['content_description'], location=event['location'], start_date=event['start'].split('T')[0], start_time=event['start'].split('T')[1][:5], end_time=event['end'].split('T')[1][:5], base_url=base_url )
itemtype, 'title': g.value(book, DC.title), 'date': g.value(book, DC.date) or '', # not all have dates 'tags': ', '.join(tags), '# tags': len(tags), '# tags ending in Y': len([t for t in tags if t.endswith('Y')]) }) items = sorted(items, key=lambda k: k['# tags ending in Y'], reverse=True) # generate csv file name based on input file filebase, ext = os.path.splitext(os.path.basename(args.filename)) csv_filename = '%s.csv' % filebase with open(csv_filename, 'w') as csvfile: # write byte-order-mark for utf-8 opening in csvfile.write(codecs.BOM_UTF8) fieldnames = [ 'identifier', 'type', 'title', 'date', '# tags', '# tags ending in Y', 'tags' ] writer = DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for item in items: writer.writerow(item)
# print(h.HITId) # currhits[h.HITId] = h # print('{}: {}'.format(len(currhits), currhits)) # # get_all_hits iterates through all your current HITs, grabbing 100 at a time # # best to break as soon as you get all the HITIds in your group # if len(currhits) == len(hitids): # break currhits = {h.HITId: h for h in mtc.get_all_hits() if h.HITId in hitids} print('{} Current HITs: {}'.format(len(currhits), sorted(currhits.keys()))) process_assignments(assignments, all_results, currhits) outkeys.extend(list(sorted(answer_keys))) # Structure of hits # foo.Amount foo.Expiration foo.IntegerValue foo.QualificationTypeId # foo.AssignmentDurationInSeconds foo.FormattedPrice foo.Keywords foo.RequesterAnnotation # foo.AutoApprovalDelayInSeconds foo.HIT foo.LocaleValue foo.RequiredToPreview # foo.Comparator foo.HITGroupId foo.MaxAssignments foo.Reward # foo.Country foo.HITId foo.NumberOfAssignmentsAvailable foo.Title # foo.CreationTime foo.HITReviewStatus foo.NumberOfAssignmentsCompleted # foo.CurrencyCode foo.HITStatus foo.NumberOfAssignmentsPending foo.expired # foo.Description foo.HITTypeId foo.QualificationRequirement with open(args.resultsfile, 'w') as outfile: dw = DictWriter(outfile, fieldnames=outkeys, delimiter='\t') dw.writeheader() for row in all_results: dw.writerow(row)
if __name__ == '__main__':
    # Export every tarefa (task) file plus its logs into one flat CSV,
    # one row per log entry, each tagged with its task id.
    print("Exporting tarefas to CSV...")
    with open(TAREFAS_DIR + '/../csv/data_exported.csv', 'wb') as fout:
        csv_writer = DictWriter(
            fout,
            fieldnames=[
                'id_tarefa', 'num_tarefa', 'titulo_tarefa', 'tipo_tarefa',
                'data_cadastro_tarefa', 'sistema_tarefa',
                'data_inicio_tarefa', 'subsistema_tarefa',
                'data_deadline_tarefa', 'aberta_por_tarefa',
                'localizacao_analista_tarefa', 'situacao_tarefa',
                'horas_trabalhadas_tarefa', 'gerente_relacionamento_tarefa',
                'num_prioridade_tarefa', 'andamento_tarefa',
                'prioridade_tarefa', 'dados_build_log', 'data_cadastro_log',
                'atividade_log', 'situacao_log', 'andamento_log',
                'horas_trabalhadas_log', 'aberto_por_log', 'revisao_svn_log'
            ])
        csv_writer.writeheader()
        for tarefa_filename in os.listdir(TAREFAS_DIR):
            # The task id is the first run of digits in the file name.
            id_tarefa = re.findall(r'(\d+)', tarefa_filename)[0]
            tarefa_filepath = TAREFAS_DIR + '/' + tarefa_filename
            with io.open(tarefa_filepath, 'r', encoding='utf-8') as fin:
                # One parsed dict per log entry inside the task file.
                list_tarefa_logs = __to_list_tarefa_logs(fin.read())
                for tarefa_log in list_tarefa_logs:
                    # Tag every log row with its parent task id.
                    tarefa_log.update({'id_tarefa': id_tarefa})
                    csv_writer.writerow(tarefa_log)
    print("Done!")
def dump_csv(table, name):
    """Write every row of *table* to the CSV file *name*, using the
    table's column list as the header."""
    with open(name, 'w') as out:
        csv_writer = DictWriter(out, fieldnames=table.columns)
        csv_writer.writeheader()
        # Iterating the table yields one dict per row.
        csv_writer.writerows(table)
def write_csv(fieldnames, rows, path):
    """Dump the dict *rows* to a CSV file at *path*, writing the
    *fieldnames* header first."""
    handle = open(path, 'wb')
    try:
        writer = DictWriter(handle, fieldnames)
        writer.writeheader()
        for entry in rows:
            writer.writerow(entry)
    finally:
        handle.close()