def create_by_creationdate_artist(collection):
    '''Aggregate artwork counts by (theme tag, artist name, creation year).

    Reads `collection` (a CSV with 'Year creation', 'tag_thema' and 'name'
    columns) and writes 'theme_author_date.csv' with one weighted row per
    (Theme, Name, Year creation) triple.  Rows whose creation year is not
    numeric or is earlier than 1868 are skipped.
    '''
    tag_artist_date = defaultdict(lambda: 0)
    with open(collection, 'r') as f:
        reader = csvkit.DictReader(f)
        for row in reader:
            c_date = row['Year creation']
            c_tag = row['tag_thema'].split(', ')
            c_name = row['name']
            # BUG FIX: the original used `pass` here, which fell through and
            # counted rows with invalid or out-of-range dates anyway.
            if not c_date.isdigit() or int(c_date) < 1868:
                continue
            if len(c_tag) > 1:
                # Multi-tag artworks contribute one count per tag.
                for t in c_tag:
                    tag_artist_date[(t, c_name, c_date)] += 1
            else:
                tag_artist_date[(c_tag[0], c_name, c_date)] += 1
    # Write output file
    with open('theme_author_date.csv', 'w') as f:
        writer = csvkit.DictWriter(
            f, fieldnames=['Theme', 'Year creation', 'Name', 'Weight'])
        writer.writeheader()
        for k in tag_artist_date:
            output_row = {
                'Theme': k[0],
                # Year rendered as a date string for downstream tooling.
                'Year creation': '01/01/%s' % k[2],
                'Name': k[1],
                'Weight': tag_artist_date[k]
            }
            writer.writerow(output_row)
def save_as(self, file_name): headers = self[0].keys() with open(file_name, 'wb') as output_file: print "Write %s row to %s" % (len(self), file_name) dict_writer = csv.DictWriter(output_file, headers) dict_writer.writeheader() dict_writer.writerows(self)
def write_csv(self, data, out_file): with open(out_file, 'w') as csvfile: print "Write to %s" % out_file fieldnames = data[0].keys() writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for row in data: writer.writerow(row)
def test_writer_alias(self):
    # csvkit.DictWriter must act as a drop-in alias for the stdlib
    # DictWriter, including unicode cell values.
    fieldnames = ['a', 'b', 'c']
    writer = csvkit.DictWriter(self.output, fieldnames)
    writer.writeheader()
    writer.writerow({u'a': u'1', u'b': u'2', u'c': u'☃'})
    self.assertEqual(self.output.getvalue(), 'a,b,c\n1,2,☃\n')
def save_csv_file(file_name, dataset): print "Save %s" % file_name with open(file_name, 'w') as csvfile: fieldnames = dataset[0].keys() writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for row in dataset: writer.writerow(row)
def to_csv(self, path):
    """ Export to csv based on `.to_dictlist()` representation

    dimension,category,note
    ,,"My general note"
    region,,"My regional note"
    crime,,"My note on crime"
    region,"Stockholms län","My note on Stockholm"
    region,"Stockholms län","My 2nd note on Stockholm"
    """
    # NOTE(review): unicode(path, "utf-8") raises TypeError if `path` is
    # already a unicode string — confirm callers always pass a byte string.
    logger.info(u"Writing to {}".format(unicode(path, "utf-8")))
    data = self.to_dictlist()
    # 'wb' because Python 2 csv writers expect a binary-mode file object.
    with open(path, 'wb') as f:
        # Header comes from the first row's keys; all rows are assumed to
        # share the same key set.
        w = csv.DictWriter(f, data[0].keys())
        w.writeheader()
        w.writerows(data)
def handle_label(self, crs_file, **options):
    """Export distinct values of a (translated) field to `crs_file` as CSV.

    For the base field named in options['field'], builds the list of
    per-language column variants (field_<lang>), queries distinct
    non-empty combinations from Activity, and writes them out one row
    per combination, reporting progress on stdout.
    """
    start_time = time.time()
    i = 0
    field = options['field']
    # Language codes from settings, stripped of any region suffix
    # (e.g. 'en-us' -> 'en').
    languages = [lang[0].split('-')[0] for lang in settings.LANGUAGES]
    if options['lang']:
        # Restrict the export to a single, validated language.
        if options['lang'] not in languages:
            raise CommandError("Invalid language code '%s'. Try: %s" %
                               (options['lang'], ', '.join(languages)))
        languages = [
            options['lang'],
        ]
    # Base field plus one suffixed column per language.
    all_fields = [
        field,
    ]
    for lang in languages:
        all_fields.append('%s_%s' % (field, lang))
    all_fields = prepare_fields(all_fields)
    self.stdout.write('FIELD: %s' % field)
    self.stdout.write('LANGUAGES: %s' % languages)
    self.stdout.write('ALL FIELDS: %s' % all_fields)
    with open(crs_file, 'w') as crs_file:
        writer = csvkit.DictWriter(crs_file, all_fields)
        writer.writeheader()
        # distinct(*fields) — presumably a PostgreSQL-only DISTINCT ON;
        # TODO confirm the deployment database supports it.
        for i, activity in enumerate(
                models.Activity.objects.exclude(**{
                    field: ''
                }).order_by(*all_fields).values(*all_fields).distinct(
                    *all_fields)):
            writer.writerow(activity)
            # Carriage-return progress counter on a single console line.
            self.stdout.write("\rExported activities %d" % (i, ), ending='')
            self.stdout.flush()
    self.stdout.write("\nTotal rows: %d" % i)
    self.stdout.write("Execution time: %d seconds" %
                      (time.time() - start_time))
def tag_artworks(collection, output, keyword_tag_map,
                 authors_file='/home/akira/Documents/Dev/Datasprint/authors.csv'):
    '''
    tag all of the artworks of a collection using gephi csv
    Also add name of the author

    :param collection: path to the input artworks CSV
    :param output: path to the enriched CSV to write
    :param keyword_tag_map: mapping used by get_tags_from_field to turn
        thema keywords into tags
    :param authors_file: path to the authors CSV used to resolve artist
        names.  Previously hard-coded; the default keeps the old behavior.
    '''
    with open(collection, 'r') as fi:
        with open(output, 'w') as fo:
            # Init csv reader
            reader = csvkit.DictReader(fi)
            header = reader.fieldnames
            # Init new csv with updated header
            #header += ['tag_ico', 'tag_theme', 'tag_mat']
            header.append('tag_thema')
            header.append('name')
            writer = csvkit.DictWriter(fo, fieldnames=header)
            writer.writeheader()
            # Map from artist id to display name.
            id_names_map = get_artist_names(authors_file, collection)
            for input_row in reader:
                # Init dict to write
                output_row = dict.fromkeys(header)
                output_row.update(input_row)
                # Tag
                thema_words = input_row['themas'].split(', ')
                tags = get_tags_from_field(thema_words, keyword_tag_map)
                output_row['tag_thema'] = ', '.join(tags)
                # Add author's name (stripping stray quotes)
                output_row['name'] = id_names_map[
                    input_row['Id artists']].replace('"', '')
                # Write row
                writer.writerow(output_row)
def create_by_acquisitiondate_acquisitionmode(collection):
    '''Count artworks per (theme tag, acquisition mode, acquisition year)
    and dump the weights to theme_acqDate_acqMode_date.csv.
    '''
    counts = defaultdict(lambda: 0)
    with open(collection, 'r') as f:
        for row in csvkit.DictReader(f):
            year = row['Year acquisition']
            tags = row['tag_thema'].split(', ')
            mode = row['Mode acquisition (new categories)']
            # Guard clause: skip rows with no mode or a bad/too-early year.
            if not year.isdigit() or mode == '' or int(year) < 1868:
                continue
            for tag in tags:
                counts[(tag, mode, year)] += 1
    # Write output file
    with open('theme_acqDate_acqMode_date.csv', 'w') as f:
        writer = csvkit.DictWriter(f, fieldnames=[
            'Theme', 'Year acquisition',
            'Mode acquisition (new categories)', 'Weight'
        ])
        writer.writeheader()
        for (tag, mode, year), weight in counts.items():
            writer.writerow({
                'Theme': tag,
                'Year acquisition': '01/01/%s' % year,
                'Mode acquisition (new categories)': mode,
                'Weight': weight
            })
def export_activities_by_year(year): print 'Start export for year: %s' % year zip_file_path = generate_file_path(year) if path.isfile(zip_file_path): # backup backup_file_path = generate_file_path(year, to_backup=True) rename(zip_file_path, backup_file_path) print 'Backup old export: %s' % backup_file_path output = StringIO.StringIO() ## temp output file csv_writer = csvkit.DictWriter(output, EXPORTED_FIELDS, quoting=csv.QUOTE_ALL) csv_writer.writeheader() i = 0 for i, activity in enumerate(Activity.objects.filter(year=year) if year != 'all' else Activity.objects.all(), start=1): csv_writer.writerow(serialize_activity(activity)) print "\r%d" % i, sys.stdout.flush() csv_file = zipfile.ZipFile(zip_file_path, 'w') csv_file.writestr(path.basename(zip_file_path)[:-4], output.getvalue()) print '%d Activities for year %s exported in: %s' % (i, year, zip_file_path)
# expected format in input.csv: first column 'id', second column 'address'
# Parse each address with usaddress and write the tagged components to
# output.csv, flagging rows that failed to parse.
with open('input.csv', 'r') as f:
    reader = csvkit.DictReader(f)
    all_rows = []
    for row in reader:
        try:
            parsed_addr = usaddress.tag(row['address'])
            row_dict = parsed_addr[0]
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit /
        # KeyboardInterrupt; Exception still covers usaddress parse errors.
        except Exception:
            row_dict = {'error': 'True'}
        row_dict['id'] = row['id']
        all_rows.append(row_dict)
# Full set of usaddress component labels, plus our id/error columns.
field_list = [
    'id', 'AddressNumber', 'AddressNumberPrefix', 'AddressNumberSuffix',
    'BuildingName', 'CornerOf', 'IntersectionSeparator', 'LandmarkName',
    'NotAddress', 'OccupancyType', 'OccupancyIdentifier', 'PlaceName',
    'Recipient', 'StateName', 'StreetName', 'StreetNamePreDirectional',
    'StreetNamePreModifier', 'StreetNamePreType',
    'StreetNamePostDirectional', 'StreetNamePostModifier',
    'StreetNamePostType', 'SubaddressIdentifier', 'SubaddressType',
    'USPSBoxGroupID', 'USPSBoxGroupType', 'USPSBoxID', 'USPSBoxType',
    'ZipCode', 'error'
]
with open('output.csv', 'wb') as outfile:
    writer = csvkit.DictWriter(outfile, field_list)
    writer.writeheader()
    writer.writerows(all_rows)
# Report slug differences between the swap table and the sources list,
# rewrite sources.csv, then start patching flows.csv against swapSources.
# NOTE(review): this chunk appears truncated — the flows loop presumably
# continues (e.g. writing each patched row) beyond what is visible here.
print "%s nombre de slugs uniques in swap" % len(uniquSlugsInSwap)
uniquSlugsInSource = set(source['slug'] for source in sources)
print "%s nombre de slugs uniques in sources" % len(uniquSlugsInSource)
print "in swap not in source :"
print "\n".join([
    slug.encode('utf8') for slug in uniquSlugsInSwap - uniquSlugsInSource
])
# Slugs appearing more than once in sources (groupby assumes sources is
# sorted by slug — TODO confirm).
print "\n".join([
    slug.encode('utf8')
    for slug, ss in itertools.groupby(sources, lambda s: s['slug'])
    if len(list(ss)) > 1
])
# output the new sources file
with open('sources.csv', 'w') as of:
    output = csvkit.DictWriter(of, sources[0].keys())
    output.writeheader()
    output.writerows(sources)
# delete source_types.csv (by hand through git)
# patch flows and exchange_rates through csvkit directly on csv
# check for missing sources on the way
missingSources = set()
with open('../../csv_data/flows.csv', 'r') as f:
    with open('../../csv_data/new_flows.csv', 'w') as nf:
        flows = csvkit.DictReader(f)
        newFlows = csvkit.DictWriter(nf, flows.fieldnames)
        newFlows.writeheader()
        for flow in flows:
            # Remap the source slug when it is listed in the swap table.
            if flow['source'] in swapSources:
                flow['source'] = swapSources[flow['source']]
# create a new slug s['new_slug'] = slugify(s) # control slug unicity if s['new_slug'] in slugs: slugs[s['new_slug']] += 1 else: slugs[s['new_slug']] = 1 # control slug unicity print "%s sources have duplicated slugs" % len( [(s, nb) for (s, nb) in slugs.iteritems() if nb > 1]) for s in sources: s['new_slug_nb'] = slugs[s['new_slug']] # add nb flows s['nb_flows'] = int(nb_flows_by_sources[ s['slug']]) if s['slug'] in nb_flows_by_sources else 0 # add a note and action column s['put x to remove'] = '' # export with open('new_sources.csv', 'w') as outputFile: print "writing %s line to new_sources.csv" % len(sources) headers = [ 'put x to remove', 'new_slug_nb', 'nb_flows', 'new_slug', 'slug', 'author', 'name', 'author_editor', 'country', 'volume_number', 'volume_date', 'edition_date', 'pages', 'shelf_number', 'source_category', 'URL', 'type', 'notes', 'flow_date' ] output = csvkit.DictWriter(outputFile, headers) output.writeheader() output.writerows(sources)
import json import csvkit import math with open("../artists_profils_sequences.json", "r") as f: artists_profils = json.load(f) er_authors_whitelist = [] with open("../er_artists_artworks.json", "r") as f: for artist, artworks in json.load(f).iteritems(): for artwork in artworks: er_authors_whitelist.append(artwork["authors"]) er_authors_whitelist = er_authors_whitelist with open("artists_node_attribute.csv", "w") as of: output = csvkit.DictWriter( of, fieldnames=["ID", "Label", "sequence_length_type", "ER_member"]) output.writeheader() short_nb = 0 medium_nb = 0 large_nb = 0 for a in artists_profils: if len(a["event_sequence_with_0"]) < 40: seq_len_type = "short" short_nb += 1 elif len(a["event_sequence_with_0"]) >= 40 and len( a["event_sequence_with_0"]) < 190: seq_len_type = "medium" medium_nb += 1 elif len(a["event_sequence_with_0"]) >= 190:
import pymongo
import csvkit

# Dump selected text fields of every non-ensemble artwork from the
# local "mnam" Mongo database to textfields.csv.
db = pymongo.MongoClient("localhost", 27017)["mnam"]

headers = [
    "_id", "key_words_movement", "key_words_thema", "key_words_icono",
    "domain_description_mst"
]

cursor = db.Artwork.aggregate([
    {"$match": {"notEnsemble": True}},
    {"$project": {
        "key_words_movement": 1,
        "key_words_thema": 1,
        "key_words_icono": 1,
        "domain_description_mst": 1,
    }},
])

with open("textfields.csv", "w") as f:
    # Renamed from `csv` to avoid shadowing the csv module name.
    writer = csvkit.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    writer.writerows(cursor)
def export_RICentities_FT_comparision(cursor, output_filename, table='flow_joined'): cursor.row_factory = sqlite3.Row select_RICentities = """ SELECT RICname, type, continent, GPH_code, sum(COALESCE(report.nb_flows,0)) as nb_flows_as_reporting, sum(COALESCE(partner.nb_flows,0)) as nb_flows_as_partner FROM RICentities LEFT JOIN (SELECT count(id) as nb_flows, reporting FROM %(table)s where partner not like 'world%%' group by reporting) as report on report.reporting = RICname LEFT JOIN (SELECT count(id) as nb_flows, partner FROM %(table)s group by partner) as partner on partner.partner = RICname WHERE RICname not LIKe 'World%%' group by RICname HAVING nb_flows_as_reporting != 0 OR nb_flows_as_partner != 0 ORDER BY nb_flows_as_reporting DESC, nb_flows_as_partner DESC """ % { 'table': table } cursor.execute(select_RICentities) RICentities = {} for ric in cursor: RICentities[ric[0]] = dict(ric) # FT reportings select_FT_RICentities_number = """ SELECT year, count(distinct reporting) as nb_flows_FT FROM %s WHERE partner = 'World Federico Tena' GROUP BY year ORDER BY year """ % table ft_reportings_by_year = dict( (str(y), n) for (y, n) in cursor.execute(select_FT_RICentities_number)) select_reportings = """ SELECT reporting, year, ft.FT, count(id) as nb_flows FROM %s LEFT JOIN ( SELECT reporting, year, 1 as FT FROM %s WHERE partner = 'World Federico Tena' GROUP BY reporting, year ) as ft USING (reporting, year) WHERE partner NOT LIKE 'World%%' GROUP BY reporting, year;""" % (table, table) for (reporting, year, ft, nb_flows) in cursor.execute(select_reportings): if reporting not in RICentities: print('undocumented RIC %s' % reporting) RICentities[reporting] = { 'RICname': reporting, 'nb_flows_as_reporting': nb_flows, 'nb_flows_as_partner': 0 } # y'a un probleme avec nb_flows_as_reporting RICentities[reporting][str( year)] = "ft_reporting" if ft else "reporting" select_partners = """ SELECT f.partner, f.year, ft.FT, count(id) as nb_flows FROM %s as f LEFT JOIN ( SELECT 
reporting, year, 1 as FT FROM %s WHERE partner = 'World Federico Tena' GROUP BY reporting, year ) as ft ON f.partner=ft.reporting AND f.year = ft.year WHERE f.partner NOT LIKE 'World%%' GROUP BY f.partner, f.year;""" % (table, table) for (partner, year, ft, nb_flows) in cursor.execute(select_partners): if partner not in RICentities: print('undocumented RIC %s' % partner) RICentities[partner] = { 'RICname': partner, 'nb_flows_as_reporting': 0, 'nb_flows_as_partner': nb_flows } if str(year) not in RICentities[partner]: RICentities[partner][str( year)] = "ft_partner_only" if ft else "partner_only" # y'a un bug la !! RICentities[partner]['nb_flows_as_partner'] = nb_flows cursor.execute( 'SELECT min(year) as min_year, max(year) as max_year from %s' % table) (min_year, max_year) = next(cursor) years = [str(y) for y in range(min_year, max_year + 1)] nb_entities_in_ft_and_ricardo = dict((y, 0) for y in years) nb_entities_in_ricardo_not_in_ft = dict((y, 0) for y in years) for r in RICentities.values(): for y in years: if y in r and 'ft' in r[y]: nb_entities_in_ft_and_ricardo[y] += 1 elif y in r: nb_entities_in_ricardo_not_in_ft[y] += 1 with open(output_filename, "w") as f: hs = [ 'RICname', 'type', 'continent', 'GPH_code', 'nb_flows_as_reporting', 'nb_flows_as_partner' ] + [y for y in years] dw = csvkit.DictWriter(f, fieldnames=hs) ft_reportings_by_year['nb_flows_as_partner'] = 'nb FT reportings' nb_entities_in_ft_and_ricardo['nb_flows_as_partner'] = 'nb in FT & RIC' nb_entities_in_ricardo_not_in_ft[ 'nb_flows_as_partner'] = 'nb in RIC not in FT' dw.writeheader() dw.writerow(ft_reportings_by_year) dw.writerow(nb_entities_in_ft_and_ricardo) dw.writerow(nb_entities_in_ricardo_not_in_ft) dw.writerows( sorted((r for r in RICentities.values()), key=lambda r: -1 * (r['nb_flows_as_reporting'] + r['nb_flows_as_partner']))) return 0 return 1
def export_RICentities_csv(cursor, output_filename): cursor.row_factory = sqlite3.Row select_RICentities = """ SELECT * from RICentities """ cursor.execute(select_RICentities) RICentities = {} for ric in cursor: RICentities[ric[0]] = { 'RICname': ric[0], 'RICtype': ric[1], 'continent': ric[2], 'GPH code': ric[3] } select_reportings = """ SELECT reporting, GROUP_CONCAT(original_reporting,'|') as original_names, GROUP_CONCAT(source_label,'|') as sources, GROUP_CONCAT(distinct year), count(*) as nb_flows FROM flow_joined WHERE partner NOT LIKE 'World%' GROUP BY reporting""" for reporting in cursor.execute(select_reportings): RICentities[reporting[0]]['names in source (reporting)'] = "; ".join( set(reporting[1].split('|'))) RICentities[reporting[0]]['sources (reporting)'] = "; ".join( set(reporting[2].split('|'))) RICentities[reporting[0]]['bilateral periods (reporting)'] = ','.join( '-'.join(str(e) for e in p) for p in reduce_years_list_into_periods(reporting[3].split(','))) RICentities[reporting[0]]['nb flows (reporting)'] = reporting[4] RICentities[reporting[0]]['total nb flows'] = reporting[4] select_partners = """ SELECT partner, GROUP_CONCAT(original_partner,'|') as original_names, GROUP_CONCAT(source_label,'|') as sources, GROUP_CONCAT(distinct year), count(*) as nb_flows FROM flow_joined WHERE partner NOT LIKE 'World%' GROUP BY partner""" for partner in cursor.execute(select_partners): RICentities[partner[0]]['names in source (partner)'] = "; ".join( set(partner[1].split('|'))) RICentities[partner[0]]['sources (partner)'] = "; ".join( set(partner[2].split('|'))) RICentities[partner[0]]['bilateral periods (partner)'] = ','.join( '-'.join(str(e) for e in p) for p in reduce_years_list_into_periods(partner[3].split(','))) RICentities[partner[0]]['nb flows (partner)'] = partner[4] if 'total nb flows' in RICentities[partner[0]]: RICentities[partner[0]]['total nb flows'] += partner[4] else: RICentities[partner[0]]['total nb flows'] = partner[4] with 
open(output_filename, "w") as f: hs = [ 'RICname', 'RICtype', 'continent', 'GPH code', 'total nb flows', 'nb flows (reporting)', 'nb flows (partner)', 'names in source (reporting)', 'names in source (partner)', 'bilateral periods (reporting)', 'bilateral periods (partner)', 'sources (reporting)', 'sources (partner)' ] dw = csvkit.DictWriter(f, fieldnames=hs) dw.writeheader() dw.writerows( sorted((r for r in RICentities.values() if 'total nb flows' in r), key=lambda r: -1 * r['total nb flows'])) return 0 return 1
key=lambda e: (e["sourcetype"], e["year"], e["direction"] if "direction" in e else "", e["exportsimports"] if "exportsimports" in e else "", e["numrodeligne"] if ("numrodeligne" in e and e["numrodeligne"]) else "", e[ "marchandises"], e["pays"] if "pays" in e else "")) # Cleaning sources for row in sources_aggregation: for k in row: row[k] = clean(row[k]) headers = set(headers) headers = [h for h in headers if h not in ordered_headers] headers = ordered_headers + headers with open(output_filename, "w") as output_file: agg_csv = csvkit.DictWriter(output_file, headers, encoding="utf-8") agg_csv.writeheader() agg_csv.writerows(sources_aggregation) #csvsort -c SourceType,year,direction,exportsimports,numrodeligne,marchandises,pays "$f" > last_ordered.csv #taking care of 0/missing in "values" and in "prix_unitaire" def clean_float_string(f): f = re.sub(r"[,,、،﹐﹑]", ".", f) f = re.sub(r"[\s ]", "", f) return f with open(output_filename, "r") as output_file:
r = csvkit.DictReader(source_file) headers += r.fieldnames headers = set(headers) headers = [h for h in headers if h not in ordered_headers] headers = ordered_headers + headers for extra_header in [ "value_as_reported", "computed_value", "replace_computed_up" ]: if extra_header not in headers: headers += [extra_header] # Then we actually read and write the lines with open(output_filename, "w") as output_file: writer = csvkit.DictWriter(output_file, headers, encoding="utf-8") writer.writeheader() for (dirpath, dirnames, filenames) in os.walk(directory): if not sum(dirpath == os.path.join(directory, b) for b in black_list): for csv_file_name in filenames: ext = csv_file_name.split( ".")[-1] if "." in csv_file_name else None if ext == "csv": print "%s in %s" % (csv_file_name, dirpath) filepath = os.path.join(dirpath, csv_file_name) with open(filepath, "r") as source_file: r = csvkit.DictReader(source_file)
# unique arworks unique_artworks = db.Artwork.aggregate([ {"$match": {"type":{'$in':['individual','nonseparable']}}}, {"$project":project} ]) separable_artworks_groups = db.Artwork.aggregate([ {"$match": {"type":'separable'}}, {'$group': dict( [('_id','ensemble_id')] + [(k,{'$first':'$%s'%k}) for k,v in project.items()])}, {"$project":project} ]) artworks = list(unique_artworks)+list(separable_artworks_groups) for artwork in artworks: # cleaning acquisition mode artwork['new_acquisition_mode'] = cleaning.acquisition_mode_cleaning(artwork['acquisition_mode']) # creation date artwork['creation_year'] = cleaning.creation_date_cleaning(artwork['date_creation']) if 'date_creation' in artwork else '' try: artwork['acq_crea_diff'] = int(artwork['acquisition_year'])-int(artwork['creation_year']) except: artwork['acq_crea_diff'] = None artwork['authors'] = '|'.join(artwork['authors']) with open("unique_artworks.csv", "w") as f: artworks_csv = csvkit.DictWriter(f,fieldnames = headers.values()) artworks_csv.writeheader() # setting human readable column names artworks_csv.writerows(( { headers[k]:v for k,v in artwork.items()} for artwork in artworks))
def analyzeAllEEG(): """ reads the entire table of eeg reports (input_file_name) which is a csv file and creates a new csv file with new columns which will include the likelihood score of it containing seizures/ed. :return: nothing. this function writes to the outfile """ i = 1 with open(input_file_name) as cf: # needed to replace null lines reader = csv.DictReader(x.replace('\0', '') for x in cf) # optional starting offset in case program crashes while analyzing # for line in reader: # i += 1 # if (i>9425): # i=1 # break outfieldnames = reader.fieldnames outfieldnames.append( 'notebody') # body of the procedure note, excludies impression outfieldnames.append('examno') # exam number ie A033 outfieldnames.append('impressionBody') # impression block outfieldnames.append( 'impression' ) # just says whether or not the eeg report was abnormal or normal # the algorithm only looks at specific commonly used phrases outfieldnames.append( 'notetype') # spot v ambulatory v long term monitoring... outfieldnames.append( 'duration' ) # duration in the format of Days, Hours:Minutes:Seconds outfieldnames.append( 'impressionType' ) # ed, dc, sz - marks each report with these tags based of the sentiment score # next two column of features are based of a better algorithm that will try to understand the syntax of each # sentence instead of just looking for specific phrases (dubbed "sentiment" analysis) outfieldnames.append('epileptiformScore') outfieldnames.append('seizureScore') writer = csv.DictWriter(out_file, fieldnames=outfieldnames, restval='*') writer.writeheader() for line in reader: i += 1 eeg_no = "" m = re_eegno.search(line['note']) # write the notebody and impression # impressions are determined by matching for specific phrases # this method was not very reliable if m: line['notebody'] = line['note'][:m.start()] eeg_no = m.group('eegno') findTrueImpression(eeg_no, line, i) setImpressionBody(m, line, i) else: # try a looser find that may introduce more false information 
(more sensitive less specific pattern) m = re_eegnoLoose.search(line['note']) if m: line['notebody'] = line['note'][:m.start()] eeg_no = m.group('eegno') findTrueImpression(eeg_no, line, i) elif "preliminary" in line['note'].lower(): line['impression'] = "prelim" elif "prelim" in line['note'].lower(): line['impression'] = "prelim" #else: #print("###IMPRESSION BLOCK MISSING-" + repr(i) + line['note']) #print() # figure out the duration by looking up the duration of the study based on the exam number from NK database m = re_eegnoType.search(line['note']) line['duration'] = None if m: line['duration'] = masterCSV_daterange(m.group(0)) if line['duration'] is None: # write the duration by some other messier means through regular expression matching m = re_eegnoDuration.search(line['note']) if m: match = m.group(1) line['duration'] = match else: m = re_eegnoDuration2.search(line['note']) if m: match = m.group(1) if "hour" in match: line['duration'] = m.group(2) + ':00:00' elif "day" in match: line['duration'] = repr( int(m.group(3)) * 24) + ':00:00' else: m = re_eegnoDuration3.search(line['note']) if m: line['duration'] = durationByDates( m.group(1), m.group(2)) else: m = re_eegnoDuration4.search(line['note']) if m: line['duration'] = durationByHoursHopefully( line['note']) else: m = re_eegnoDuration5.search(line['note']) if m: print(m.group(1)) line['duration'] = durationByDates( m.group(1), m.group(2)) # write the notetype m = re_eegnoType.search(line['note']) if m: match = m.group(0) match = match.lower() line['examno'] = match if ("s" in match): line['notetype'] = "spot" elif ("v" in match): line['notetype'] = "ceeg" elif ("a" in match): line['notetype'] = "ambu" elif ("f" in match): line['notetype'] = "spot inpt" elif ("e" in match): line['notetype'] = "ceeg" else: line['notetype'] = "unk" # determine procedure type by duration # make sure ambulatories are >24 dtformat = '%H:%M:%S' try: t = datetime.datetime.strptime(line['duration'], dtformat) totMinutes = t.minute 
+ t.hour * 60 # determine the unknown report based off the duration only if line['duration'] is not None: if line['notetype'] is None: if totMinutes < 100 and totMinutes > 18: line['notetype'] = "spot?" pass if "day" not in line['duration']: if "ambu" is line['notetype']: if totMinutes < 1320 and totMinutes > 1: line['notetype'] = "longspot" except: pass # write sentiment score line['seizureScore'] = sentimentAnalysisForSeizure( line['impressionBody']) line['epileptiformScore'] = sentimentAnalysisForEpileptiform( line['impressionBody']) # write the abnormality type line['impressionType'] = "" if int(line['seizureScore']) < 0: line['impressionType'] += ' sz' if int(line['epileptiformScore']) < 0: line['impressionType'] += ' ed' writer.writerow(line)
# Pull document comments from the Drive API, map each commenter's display
# name to an author slug, and export them to comments.csv.
comments = retrieve_comments(service,
                             '1cPuHCoLshw_srl-OG7dzdk4kDr7mwPGUUgW55JRPQlk')
slugs = {
    'Jennifer Lee': 'jennifer-lee',
    'Nikole Hannah-Jones': 'nikole-hannah-jones',
    'Clay Risen': 'clay-risen',
    'Nicholas Espiritu': 'nicholas-espiritu',
    'Samuel Bagenstos': 'samuel-bagenstos',
    'Anonymous': 'nina-totenberg',
    'PHS': 'phs',
    'awheeler': 'mark-updegrove',
}
# Order comments by their anchor position in the document.
comments = sorted(comments, key=lambda c: c['anchor'])
output = [{
    'author_key': slugs[comment['author']['displayName']],
    'content': comment['content'],
    'cited': comment['context']['value'],
} for comment in comments]
with open('comments.csv', 'w') as f:
    writer = csv.DictWriter(f, ['author_key', 'content', 'cited'])
    writer.writeheader()
    writer.writerows(output)
import csvkit

# Compare slug sets between the canonical sources.csv and new_sources.csv
# and write the mismatches to sourceTroubles.csv.
# NOTE(review): the reconstruction below assumes the slug sets are built
# while both files are still open — if the DictReaders were consumed after
# the `with` blocks closed, reading would fail on a closed file; confirm
# against the original layout.
with open('../../csv_data/sources.csv', 'r') as sf:
    sources = csvkit.DictReader(sf)
    with open('new_sources.csv', 'r') as nsf:
        new_sources = csvkit.DictReader(nsf)
        sourcesSlugs = set(s['slug'] for s in sources)
        newSourcesSlugs = set(s['slug'] for s in new_sources)
inSourceNotInNew = sourcesSlugs - newSourcesSlugs
inNewNotInSource = newSourcesSlugs - sourcesSlugs
with open('sourceTroubles.csv', 'w') as of:
    sourceTroubles = csvkit.DictWriter(of, ['source', 'set'])
    # One row per missing slug, labelled with the direction of the miss.
    sourceTroublesData = [{
        'source': s,
        'set': 'inSourceNotInNew'
    } for s in inSourceNotInNew]
    sourceTroublesData += [{
        'source': s,
        'set': 'inNewNotInSource'
    } for s in inNewNotInSource]
    sourceTroublesData = sorted(sourceTroublesData, key=lambda e: e['source'])
    sourceTroubles.writeheader()
    sourceTroubles.writerows(sourceTroublesData)
import requests import time import csvkit as csv from bs4 import BeautifulSoup as Soup URL = 'http://espn.go.com/golf/leaderboard11/controllers/ajax/playerDropdown' fieldnames = ['playerId', 'tournamentId', 'hole', 'par', 'score'] outfile = open('data_out.csv', 'w+') outwriter = csv.DictWriter(outfile, fieldnames=fieldnames) outwriter.writeheader() def parseTable(table, playerId, tournamentId): data = [r.find_all('td') for r in table.find_all('tr')] out = [] for i, x in enumerate(data[0]): if not x.text.isdigit(): continue # skip any weird values try: int(data[2][i].text) except ValueError as err: continue row = { 'playerId': playerId, 'tournamentId': tournamentId, 'hole': int(x.text), 'par': int(data[1][i].text),
} project = { 'type':1, 'authors_birth_death':1, 'name.notice':1, 'name_complement':1, 'gender':1, 'nationality':1, 'artworks':1 } # unique arworks authors = list(db.Author.aggregate([ {"$project":project} ])) for author in authors: # cleaning acquisition mode if 'authors_birth_death' in author: author.update(cleaning.artist_birthdeath_parsing(author['authors_birth_death'])) del author['authors_birth_death'] author['artworks'] = '|'.join(author['artworks']) author['name.notice']= author["name"]["notice"] del author["name"] with open("authors.csv", "w") as f: authors_csv = csvkit.DictWriter(f,fieldnames = headers.values()) authors_csv.writeheader() # setting human readable column names authors_csv.writerows(( { headers[k]:v for k,v in author.items()} for author in authors))
int(len(artists_profils) * artists_random_select_percentage)) print "filtered artists to %s" % len(artists_profils) artists_comparision_matrix = [] print "computing..." counter = 0 # n! / r! / (n-r)! total = math.factorial(len(artists_profils)) / math.factorial( 2) / math.factorial(len(artists_profils) - 2) print "computing... %s combinations" % total for artists in itertools.combinations(artists_profils, 2): counter += 1 if counter % 10000 == 0: print "processed %s combinations %.4f%%" % (counter, counter / total * 100) with open(output_filename, "w") as ff: output = csvkit.DictWriter( ff, fieldnames=['Source', 'Target', 'Weight']) output.writeheader() output.writerows(artists_comparision_matrix) kval = compare(artists) #print kval artists_comparision_matrix.append({ "Source": artists[0]["name"], "Target": artists[1]["name"], "Weight": kval }) print "done" with open(output_filename, "w") as ff: output = csvkit.DictWriter(ff, fieldnames=['Source', 'Target', 'Weight']) output.writeheader()