Example #1
from collections import defaultdict

import csvkit


def create_by_creationdate_artist(collection):
    '''
    Count artworks per (theme, artist, creation year) and write the
    weighted triples to theme_author_date.csv.
    '''
    tag_artist_date = defaultdict(int)

    with open(collection, 'r') as f:
        reader = csvkit.DictReader(f)
        for row in reader:
            c_date = row['Year creation']
            c_tag = row['tag_thema'].split(', ')
            c_name = row['name']
            if not c_date.isdigit() or int(c_date) < 1868:
                continue  # skip undated or pre-1868 artworks; a bare `pass` fell through
            if len(c_tag) > 1:
                for t in c_tag:
                    tag_artist_date[(t, c_name, c_date)] += 1
            else:
                tag_artist_date[(c_tag[0], c_name, c_date)] += 1

    # Write output file
    with open('theme_author_date.csv', 'w') as f:
        writer = csvkit.DictWriter(
            f, fieldnames=['Theme', 'Year creation', 'Name', 'Weight'])
        writer.writeheader()
        for k in tag_artist_date:
            output_row = {
                'Theme': k[0],
                'Year creation': '01/01/%s' % k[2],
                'Name': k[1],
                'Weight': tag_artist_date[k]
            }
            writer.writerow(output_row)
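A note on the pattern above: iterating over the split tag list covers the single-tag case too, so the len(c_tag) > 1 branch is not strictly needed. A minimal, self-contained sketch of the same aggregation using the stdlib csv module (sample rows and file names here are hypothetical; in these examples csvkit.DictWriter is used as a drop-in for csv.DictWriter):

import csv
from collections import defaultdict

rows = [
    {'tag_thema': 'portrait, landscape', 'name': 'Doe', 'Year creation': '1901'},
    {'tag_thema': 'portrait', 'name': 'Doe', 'Year creation': '1901'},
]

counts = defaultdict(int)
for row in rows:
    for tag in row['tag_thema'].split(', '):  # one-element lists work too
        counts[(tag, row['name'], row['Year creation'])] += 1

with open('theme_author_date.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=['Theme', 'Year creation', 'Name', 'Weight'])
    writer.writeheader()
    for (theme, name, year), weight in counts.items():
        writer.writerow({'Theme': theme, 'Year creation': '01/01/%s' % year,
                         'Name': name, 'Weight': weight})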
Example #2
 def save_as(self, file_name):
     headers = self[0].keys()
     with open(file_name, 'wb') as output_file:
         print "Write %s row to %s" % (len(self), file_name)
         dict_writer = csv.DictWriter(output_file, headers)
         dict_writer.writeheader()
         dict_writer.writerows(self)
Example #3
 def write_csv(self, data, out_file):
     with open(out_file, 'w') as csvfile:
         print "Write to %s" % out_file
         fieldnames = data[0].keys()
         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
         writer.writeheader()
         for row in data:
             writer.writerow(row)
Example #4
    def test_writer_alias(self):
        writer = csvkit.DictWriter(self.output, ['a', 'b', 'c'])
        writer.writeheader()
        writer.writerow({u'a': u'1', u'b': u'2', u'c': u'☃'})

        result = self.output.getvalue()

        self.assertEqual(result, 'a,b,c\n1,2,☃\n')
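This test comes from csvkit's own suite: in the releases these examples target, csvkit.DictWriter is meant as a drop-in, unicode-aware alias for the stdlib writer. A sketch of the equivalent Python 3 stdlib call; note the differing default line terminator (csvkit's writer emitted '\n', the stdlib defaults to '\r\n'):

import csv
import io

output = io.StringIO()
writer = csv.DictWriter(output, fieldnames=['a', 'b', 'c'])
writer.writeheader()
writer.writerow({'a': '1', 'b': '2', 'c': u'☃'})
assert output.getvalue() == 'a,b,c\r\n1,2,☃\r\n'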
Example #5
def save_csv_file(file_name, dataset):
    print "Save %s" % file_name
    with open(file_name, 'w') as csvfile:
        fieldnames = dataset[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in dataset:
            writer.writerow(row)
Example #6
    def to_csv(self, path):
        """ Export to csv based on `.to_dictlist()` representation

            dimension,category,note
            ,,"My general note"
            region,,"My regional note"
            crime,,"My note on crime"
            region,"Stockholms län","My note on Stockholm"
            region,"Stockholms län","My 2nd note on Stockholm"
        """
        logger.info(u"Writing to {}".format(unicode(path, "utf-8")))
        data = self.to_dictlist()
        with open(path, 'wb') as f:
            w = csv.DictWriter(f, data[0].keys())
            w.writeheader()
            w.writerows(data)
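Taking data[0].keys() as fieldnames, as above, assumes every dict in the list carries the same keys; a row with an extra key makes DictWriter raise ValueError. A defensive sketch (not part of the original code) that collects the union of keys first and fills gaps via restval, using the Python 3 stdlib:

import csv

def write_dictlist(path, data):
    # union of all keys, so no row can carry an unexpected field
    fieldnames = []
    for row in data:
        for key in row:
            if key not in fieldnames:
                fieldnames.append(key)
    with open(path, 'w') as f:
        w = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        w.writeheader()
        w.writerows(data)

write_dictlist('notes.csv', [
    {'dimension': '', 'category': '', 'note': 'My general note'},
    {'dimension': 'region', 'note': 'My regional note'},  # no 'category': restval fills it
])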
Example #7
    def handle_label(self, crs_file, **options):

        start_time = time.time()
        i = 0
        field = options['field']

        languages = [lang[0].split('-')[0] for lang in settings.LANGUAGES]
        if options['lang']:
            if options['lang'] not in languages:
                raise CommandError("Invalid language code '%s'. Try: %s" %
                                   (options['lang'], ', '.join(languages)))
            languages = [
                options['lang'],
            ]

        all_fields = [
            field,
        ]
        for lang in languages:
            all_fields.append('%s_%s' % (field, lang))
        all_fields = prepare_fields(all_fields)

        self.stdout.write('FIELD: %s' % field)
        self.stdout.write('LANGUAGES: %s' % languages)
        self.stdout.write('ALL FIELDS: %s' % all_fields)

        with open(crs_file, 'w') as output_file:

            writer = csvkit.DictWriter(output_file, all_fields)
            writer.writeheader()
            for i, activity in enumerate(
                    models.Activity.objects.exclude(**{
                        field: ''
                    }).order_by(*all_fields).values(*all_fields).distinct(
                        *all_fields), start=1):  # start=1 so the final count is exact
                writer.writerow(activity)
                self.stdout.write("\rExported activities %d" % (i, ),
                                  ending='')
                self.stdout.flush()

        self.stdout.write("\nTotal rows: %d" % i)
        self.stdout.write("Execution time: %d seconds" %
                          (time.time() - start_time))
Example #8
def tag_artworks(collection, output, keyword_tag_map):
    '''
    Tag all of the artworks of a collection using the Gephi CSV.
    Also add the name of the author.
    '''
    with open(collection, 'r') as fi:
        with open(output, 'w') as fo:

            # Init csv reader
            reader = csvkit.DictReader(fi)
            header = reader.fieldnames

            # Init new csv with updated header
            #header += ['tag_ico', 'tag_theme', 'tag_mat']
            header.append('tag_thema')
            header.append('name')
            writer = csvkit.DictWriter(fo, fieldnames=header)
            writer.writeheader()

            id_names_map = get_artist_names(
                '/home/akira/Documents/Dev/Datasprint/authors.csv', collection)

            for input_row in reader:
                # Init dict to write
                output_row = dict.fromkeys(header)
                output_row.update(input_row)

                # Tag
                thema_words = input_row['themas'].split(', ')
                tags = get_tags_from_field(thema_words, keyword_tag_map)
                output_row['tag_thema'] = ', '.join(tags)

                # Add author's name
                output_row['name'] = id_names_map[
                    input_row['Id artists']].replace('"', '')

                # Write row
                writer.writerow(output_row)
Example #9
def create_by_acquisitiondate_acquisitionmode(collection):
    '''
    Count artworks per (theme, acquisition mode, acquisition year) and
    write the weighted triples to theme_acqDate_acqMode_date.csv.
    '''
    tag_acq_date = defaultdict(int)

    with open(collection, 'r') as f:
        reader = csvkit.DictReader(f)
        for row in reader:
            c_date = row['Year acquisition']
            c_tag = row['tag_thema'].split(', ')
            c_mode = row['Mode acquisition (new categories)']
            if not c_date.isdigit() or c_mode == '' or int(c_date) < 1868:
                continue
            if len(c_tag) > 1:
                for t in c_tag:
                    tag_acq_date[(t, c_mode, c_date)] += 1
            else:
                tag_acq_date[(c_tag[0], c_mode, c_date)] += 1

    # Write output file
    with open('theme_acqDate_acqMode_date.csv', 'w') as f:
        writer = csvkit.DictWriter(f,
                                   fieldnames=[
                                       'Theme', 'Year acquisition',
                                       'Mode acquisition (new categories)',
                                       'Weight'
                                   ])
        writer.writeheader()
        for k in tag_acq_date:
            output_row = {
                'Theme': k[0],
                'Year acquisition': '01/01/%s' % k[2],
                'Mode acquisition (new categories)': k[1],
                'Weight': tag_acq_date[k]
            }
            writer.writerow(output_row)
Example #10
def export_activities_by_year(year):

    print 'Start export for year: %s' % year

    zip_file_path = generate_file_path(year)
    if path.isfile(zip_file_path):
        # backup
        backup_file_path = generate_file_path(year, to_backup=True)
        rename(zip_file_path, backup_file_path)
        print 'Backup old export: %s' % backup_file_path

    output = StringIO.StringIO()  # temporary in-memory output
    csv_writer = csvkit.DictWriter(output, EXPORTED_FIELDS, quoting=csv.QUOTE_ALL)
    csv_writer.writeheader()
    i = 0
    for i, activity in enumerate(Activity.objects.filter(year=year) if year != 'all' else Activity.objects.all(), start=1):
        csv_writer.writerow(serialize_activity(activity))
        print "\r%d" % i,
        sys.stdout.flush()

    # use a context manager: the original never closed the archive,
    # which can leave a truncated zip on disk
    with zipfile.ZipFile(zip_file_path, 'w') as zip_file:
        zip_file.writestr(path.basename(zip_file_path)[:-4], output.getvalue())

    print '%d Activities for year %s exported in: %s' % (i, year, zip_file_path)
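The example builds the CSV in an in-memory buffer and only then zips it. A minimal sketch of the same buffer-then-zip pattern with explicit context managers, using the Python 3 stdlib (file and field names here are hypothetical):

import csv
import io
import zipfile

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=['id', 'title'])
writer.writeheader()
writer.writerow({'id': 1, 'title': 'example'})

with zipfile.ZipFile('activities.zip', 'w', zipfile.ZIP_DEFLATED) as zf:
    # store the CSV text under a member name inside the archive
    zf.writestr('activities.csv', buf.getvalue())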
Example #11
import csvkit
import usaddress

# expected format in input.csv: first column 'id', second column 'address'
with open('input.csv', 'r') as f:
    reader = csvkit.DictReader(f)

    all_rows = []
    for row in reader:
        try:
            parsed_addr = usaddress.tag(row['address'])
            row_dict = parsed_addr[0]
        except Exception:  # typically usaddress.RepeatedLabelError on ambiguous input
            row_dict = {'error': 'True'}

        row_dict['id'] = row['id']
        all_rows.append(row_dict)

field_list = [
    'id', 'AddressNumber', 'AddressNumberPrefix', 'AddressNumberSuffix',
    'BuildingName', 'CornerOf', 'IntersectionSeparator', 'LandmarkName',
    'NotAddress', 'OccupancyType', 'OccupancyIdentifier', 'PlaceName',
    'Recipient', 'StateName', 'StreetName', 'StreetNamePreDirectional',
    'StreetNamePreModifier', 'StreetNamePreType', 'StreetNamePostDirectional',
    'StreetNamePostModifier', 'StreetNamePostType', 'SubaddressIdentifier',
    'SubaddressType', 'USPSBoxGroupID', 'USPSBoxGroupType', 'USPSBoxID',
    'USPSBoxType', 'ZipCode', 'error'
]

with open('output.csv', 'wb') as outfile:
    writer = csvkit.DictWriter(outfile, field_list)
    writer.writeheader()
    writer.writerows(all_rows)
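For context on the try/except above: usaddress.tag() returns a (tagged, address_type) pair, where tagged is an OrderedDict of component label to text, and it raises usaddress.RepeatedLabelError on input it cannot label unambiguously, which is what the original bare except was really guarding against. A short sketch (the sample address is hypothetical):

import usaddress

try:
    tagged, address_type = usaddress.tag('123 Main St Suite 100 Chicago IL')
    # tagged maps labels to values, e.g. {'AddressNumber': '123', ...}
    row = dict(tagged)
except usaddress.RepeatedLabelError:
    # the example above collapses any failure into {'error': 'True'}
    row = {'error': 'True'}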
Example #12
    print "%s nombre de slugs uniques in swap" % len(uniquSlugsInSwap)
    uniquSlugsInSource = set(source['slug'] for source in sources)
    print "%s nombre de slugs uniques in sources" % len(uniquSlugsInSource)
    print "in swap not in source :"
    print "\n".join([
        slug.encode('utf8') for slug in uniquSlugsInSwap - uniquSlugsInSource
    ])
    print "\n".join([
        slug.encode('utf8')
        for slug, ss in itertools.groupby(sources, lambda s: s['slug'])
        if len(list(ss)) > 1
    ])

    # output the new sources file
    with open('sources.csv', 'w') as of:
        output = csvkit.DictWriter(of, sources[0].keys())
        output.writeheader()
        output.writerows(sources)

    # delete source_types.csv (by hand through git)
    # patch flows and exchange_rates through csvkit directly on csv
    # check for missing sources on the way
    missingSources = set()
    with open('../../csv_data/flows.csv', 'r') as f:
        with open('../../csv_data/new_flows.csv', 'w') as nf:
            flows = csvkit.DictReader(f)
            newFlows = csvkit.DictWriter(nf, flows.fieldnames)
            newFlows.writeheader()
            for flow in flows:
                if flow['source'] in swapSources:
                    flow['source'] = swapSources[flow['source']]
Example #13
            # create a new slug
            s['new_slug'] = slugify(s)
            # control slug unicity
            if s['new_slug'] in slugs:
                slugs[s['new_slug']] += 1
            else:
                slugs[s['new_slug']] = 1
        # control slug unicity
        print "%s sources have duplicated slugs" % len(
            [(s, nb) for (s, nb) in slugs.iteritems() if nb > 1])
        for s in sources:
            s['new_slug_nb'] = slugs[s['new_slug']]
            # add nb flows
            s['nb_flows'] = int(nb_flows_by_sources[
                s['slug']]) if s['slug'] in nb_flows_by_sources else 0
            # add a note and action column
            s['put x to remove'] = ''
        # export
        with open('new_sources.csv', 'w') as outputFile:
            print "writing %s line to new_sources.csv" % len(sources)
            headers = [
                'put x to remove', 'new_slug_nb', 'nb_flows', 'new_slug',
                'slug', 'author', 'name', 'author_editor', 'country',
                'volume_number', 'volume_date', 'edition_date', 'pages',
                'shelf_number', 'source_category', 'URL', 'type', 'notes',
                'flow_date'
            ]
            output = csvkit.DictWriter(outputFile, headers)
            output.writeheader()
            output.writerows(sources)
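The manual count-or-initialize block for slugs above is the textbook use case for collections.Counter. A sketch of the same uniqueness check (the sources data is a hypothetical stand-in):

from collections import Counter

sources = [{'new_slug': 'a'}, {'new_slug': 'a'}, {'new_slug': 'b'}]
slug_counts = Counter(s['new_slug'] for s in sources)
duplicates = [(slug, nb) for slug, nb in slug_counts.items() if nb > 1]
print("%s sources have duplicate slugs" % len(duplicates))  # -> 1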
Example #14
import json
import csvkit
import math

with open("../artists_profils_sequences.json", "r") as f:
    artists_profils = json.load(f)
    er_authors_whitelist = []
    with open("../er_artists_artworks.json", "r") as f:
        for artist, artworks in json.load(f).iteritems():
            for artwork in artworks:
                er_authors_whitelist.append(artwork["authors"])
        er_authors_whitelist = er_authors_whitelist

    with open("artists_node_attribute.csv", "w") as of:
        output = csvkit.DictWriter(
            of,
            fieldnames=["ID", "Label", "sequence_length_type", "ER_member"])
        output.writeheader()
        short_nb = 0
        medium_nb = 0
        large_nb = 0

        for a in artists_profils:
            if len(a["event_sequence_with_0"]) < 40:
                seq_len_type = "short"
                short_nb += 1
            elif len(a["event_sequence_with_0"]) >= 40 and len(
                    a["event_sequence_with_0"]) < 190:
                seq_len_type = "medium"
                medium_nb += 1
            elif len(a["event_sequence_with_0"]) >= 190:
Example #15
import pymongo
import csvkit

db = pymongo.MongoClient("localhost", 27017)["mnam"]

headers = [
    "_id", "key_words_movement", "key_words_thema", "key_words_icono",
    "domain_description_mst"
]

tf = db.Artwork.aggregate([{
    "$match": {
        "notEnsemble": True
    }
}, {
    "$project": {
        "key_words_movement": 1,
        "key_words_thema": 1,
        "key_words_icono": 1,
        "domain_description_mst": 1
    }
}])
with open("textfields.csv", "w") as f:
    writer = csvkit.DictWriter(f, fieldnames=headers)  # do not shadow the csv module name
    writer.writeheader()
    writer.writerows(tf)
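Documents coming out of a Mongo projection will not all carry every field, and DictWriter raises ValueError on keys missing from fieldnames, while keys absent from a row are filled with restval. A sketch of the relevant knobs, independent of Mongo (data and file names hypothetical):

import csv

docs = [
    {'_id': 1, 'key_words_thema': 'portrait', 'stray_field': 'x'},
    {'_id': 2},  # missing most fields
]

with open('textfields.csv', 'w') as f:
    writer = csv.DictWriter(
        f,
        fieldnames=['_id', 'key_words_thema'],
        restval='',              # value used for fields a row lacks
        extrasaction='ignore',   # silently drop keys not in fieldnames
    )
    writer.writeheader()
    writer.writerows(docs)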
Example #16
def export_RICentities_FT_comparision(cursor,
                                      output_filename,
                                      table='flow_joined'):

    cursor.row_factory = sqlite3.Row

    select_RICentities = """
    SELECT RICname, type, continent, GPH_code, sum(COALESCE(report.nb_flows,0)) as nb_flows_as_reporting, sum(COALESCE(partner.nb_flows,0)) as nb_flows_as_partner 
  FROM RICentities
    LEFT JOIN (SELECT count(id) as nb_flows, reporting FROM %(table)s where partner not like 'world%%' group by reporting) as report on report.reporting = RICname
    LEFT JOIN (SELECT count(id) as nb_flows, partner FROM %(table)s group by partner) as partner on partner.partner = RICname
  WHERE RICname NOT LIKE 'World%%'
  group by RICname HAVING nb_flows_as_reporting != 0 OR nb_flows_as_partner != 0
  ORDER BY nb_flows_as_reporting DESC, nb_flows_as_partner DESC 
  """ % {
        'table': table
    }
    cursor.execute(select_RICentities)
    RICentities = {}
    for ric in cursor:
        RICentities[ric[0]] = dict(ric)

    # FT reportings
    select_FT_RICentities_number = """
    SELECT year, count(distinct reporting) as nb_flows_FT
    FROM %s 
    WHERE partner = 'World Federico Tena'
    GROUP BY year
    ORDER BY year
  """ % table
    ft_reportings_by_year = dict(
        (str(y), n) for (y, n) in cursor.execute(select_FT_RICentities_number))

    select_reportings = """
 SELECT reporting, year, ft.FT, count(id) as nb_flows
 FROM %s LEFT JOIN (
    SELECT reporting, year, 1 as FT
    FROM %s 
    WHERE partner = 'World Federico Tena' 
    GROUP BY reporting, year ) as ft
    USING (reporting, year)
 WHERE partner NOT LIKE 'World%%'
  GROUP BY reporting, year;""" % (table, table)

    for (reporting, year, ft, nb_flows) in cursor.execute(select_reportings):
        if reporting not in RICentities:
            print('undocumented RIC %s' % reporting)
            RICentities[reporting] = {
                'RICname': reporting,
                'nb_flows_as_reporting': nb_flows,
                'nb_flows_as_partner': 0
            }
        # there is a problem with nb_flows_as_reporting
        RICentities[reporting][str(
            year)] = "ft_reporting" if ft else "reporting"

    select_partners = """
  SELECT f.partner, f.year, ft.FT, count(id) as nb_flows
 FROM %s as f LEFT JOIN (
    SELECT reporting, year, 1 as FT
    FROM %s 
    WHERE partner = 'World Federico Tena' 
    GROUP BY reporting, year ) as ft
    ON f.partner=ft.reporting AND f.year = ft.year
 WHERE f.partner NOT LIKE 'World%%'
  GROUP BY f.partner, f.year;""" % (table, table)

    for (partner, year, ft, nb_flows) in cursor.execute(select_partners):
        if partner not in RICentities:
            print('undocumented RIC %s' % partner)
            RICentities[partner] = {
                'RICname': partner,
                'nb_flows_as_reporting': 0,
                'nb_flows_as_partner': nb_flows
            }
        if str(year) not in RICentities[partner]:
            RICentities[partner][str(
                year)] = "ft_partner_only" if ft else "partner_only"
            # there is a bug here!!
            RICentities[partner]['nb_flows_as_partner'] = nb_flows

    cursor.execute(
        'SELECT min(year) as min_year, max(year) as max_year from %s' % table)
    (min_year, max_year) = next(cursor)
    years = [str(y) for y in range(min_year, max_year + 1)]

    nb_entities_in_ft_and_ricardo = dict((y, 0) for y in years)
    nb_entities_in_ricardo_not_in_ft = dict((y, 0) for y in years)

    for r in RICentities.values():
        for y in years:
            if y in r and 'ft' in r[y]:
                nb_entities_in_ft_and_ricardo[y] += 1
            elif y in r:
                nb_entities_in_ricardo_not_in_ft[y] += 1

    with open(output_filename, "w") as f:
        hs = [
            'RICname', 'type', 'continent', 'GPH_code',
            'nb_flows_as_reporting', 'nb_flows_as_partner'
        ] + [y for y in years]
        dw = csvkit.DictWriter(f, fieldnames=hs)
        ft_reportings_by_year['nb_flows_as_partner'] = 'nb FT reportings'
        nb_entities_in_ft_and_ricardo['nb_flows_as_partner'] = 'nb in FT & RIC'
        nb_entities_in_ricardo_not_in_ft[
            'nb_flows_as_partner'] = 'nb in RIC not in FT'
        dw.writeheader()
        dw.writerow(ft_reportings_by_year)
        dw.writerow(nb_entities_in_ft_and_ricardo)
        dw.writerow(nb_entities_in_ricardo_not_in_ft)
        dw.writerows(
            sorted((r for r in RICentities.values()),
                   key=lambda r: -1 *
                   (r['nb_flows_as_reporting'] + r['nb_flows_as_partner'])))
    return 0
Example #17
def export_RICentities_csv(cursor, output_filename):

    cursor.row_factory = sqlite3.Row

    select_RICentities = """
  SELECT * from RICentities
  """
    cursor.execute(select_RICentities)
    RICentities = {}
    for ric in cursor:
        RICentities[ric[0]] = {
            'RICname': ric[0],
            'RICtype': ric[1],
            'continent': ric[2],
            'GPH code': ric[3]
        }

    select_reportings = """
  SELECT reporting,
  GROUP_CONCAT(original_reporting,'|') as original_names, 
  GROUP_CONCAT(source_label,'|') as sources, GROUP_CONCAT(distinct year), count(*) as nb_flows
  FROM flow_joined 
  WHERE partner NOT LIKE 'World%'
  GROUP BY reporting"""

    for reporting in cursor.execute(select_reportings):
        RICentities[reporting[0]]['names in source (reporting)'] = "; ".join(
            set(reporting[1].split('|')))
        RICentities[reporting[0]]['sources (reporting)'] = "; ".join(
            set(reporting[2].split('|')))
        RICentities[reporting[0]]['bilateral periods (reporting)'] = ','.join(
            '-'.join(str(e) for e in p)
            for p in reduce_years_list_into_periods(reporting[3].split(',')))
        RICentities[reporting[0]]['nb flows (reporting)'] = reporting[4]
        RICentities[reporting[0]]['total nb flows'] = reporting[4]

    select_partners = """
  SELECT partner, 
  GROUP_CONCAT(original_partner,'|') as original_names, 
  GROUP_CONCAT(source_label,'|') as sources, GROUP_CONCAT(distinct year), count(*) as nb_flows
  FROM flow_joined
  WHERE partner NOT LIKE 'World%'
  GROUP BY partner"""

    for partner in cursor.execute(select_partners):
        RICentities[partner[0]]['names in source (partner)'] = "; ".join(
            set(partner[1].split('|')))
        RICentities[partner[0]]['sources (partner)'] = "; ".join(
            set(partner[2].split('|')))
        RICentities[partner[0]]['bilateral periods (partner)'] = ','.join(
            '-'.join(str(e) for e in p)
            for p in reduce_years_list_into_periods(partner[3].split(',')))
        RICentities[partner[0]]['nb flows (partner)'] = partner[4]
        if 'total nb flows' in RICentities[partner[0]]:
            RICentities[partner[0]]['total nb flows'] += partner[4]
        else:
            RICentities[partner[0]]['total nb flows'] = partner[4]

    with open(output_filename, "w") as f:
        hs = [
            'RICname', 'RICtype', 'continent', 'GPH code', 'total nb flows',
            'nb flows (reporting)', 'nb flows (partner)',
            'names in source (reporting)', 'names in source (partner)',
            'bilateral periods (reporting)', 'bilateral periods (partner)',
            'sources (reporting)', 'sources (partner)'
        ]
        dw = csvkit.DictWriter(f, fieldnames=hs)
        dw.writeheader()
        dw.writerows(
            sorted((r for r in RICentities.values() if 'total nb flows' in r),
                   key=lambda r: -1 * r['total nb flows']))
    return 0
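Both export helpers sort with key=lambda r: -1 * r[...]; the same descending order reads more directly with reverse=True, which also works for non-numeric keys. A tiny sketch (row data hypothetical):

rows = [{'RICname': 'A', 'total nb flows': 3},
        {'RICname': 'B', 'total nb flows': 10}]
rows_desc = sorted(rows, key=lambda r: r['total nb flows'], reverse=True)
assert [r['RICname'] for r in rows_desc] == ['B', 'A']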
Example #18
    key=lambda e: (e["sourcetype"], e["year"], e["direction"]
                   if "direction" in e else "", e["exportsimports"]
                   if "exportsimports" in e else "", e["numrodeligne"]
                   if ("numrodeligne" in e and e["numrodeligne"]) else "", e[
                       "marchandises"], e["pays"] if "pays" in e else ""))

# Cleaning sources
for row in sources_aggregation:
    for k in row:
        row[k] = clean(row[k])

headers = set(headers)
headers = [h for h in headers if h not in ordered_headers]
headers = ordered_headers + headers
with open(output_filename, "w") as output_file:
    agg_csv = csvkit.DictWriter(output_file, headers, encoding="utf-8")
    agg_csv.writeheader()
    agg_csv.writerows(sources_aggregation)

#csvsort  -c SourceType,year,direction,exportsimports,numrodeligne,marchandises,pays "$f" > last_ordered.csv

#taking care of 0/missing in "values" and in "prix_unitaire"


def clean_float_string(f):
    f = re.sub(r"[,,、،﹐﹑]", ".", f)
    f = re.sub(r"[\s ]", "", f)
    return f


with open(output_filename, "r") as output_file:
Example #19
                    r = csvkit.DictReader(source_file)
                    headers += r.fieldnames

headers = set(headers)
headers = [h for h in headers if h not in ordered_headers]
headers = ordered_headers + headers

for extra_header in [
        "value_as_reported", "computed_value", "replace_computed_up"
]:
    if extra_header not in headers:
        headers += [extra_header]

# Then we actually read and write the lines
with open(output_filename, "w") as output_file:
    writer = csvkit.DictWriter(output_file, headers, encoding="utf-8")
    writer.writeheader()

    for (dirpath, dirnames, filenames) in os.walk(directory):
        if not any(dirpath == os.path.join(directory, b) for b in black_list):
            for csv_file_name in filenames:
                ext = csv_file_name.split(
                    ".")[-1] if "." in csv_file_name else None
                if ext == "csv":
                    print "%s in %s" % (csv_file_name, dirpath)

                    filepath = os.path.join(dirpath, csv_file_name)

                    with open(filepath, "r") as source_file:
                        r = csvkit.DictReader(source_file)
Example #20
# unique artworks
unique_artworks = db.Artwork.aggregate([
    {"$match": {"type":{'$in':['individual','nonseparable']}}},
    {"$project":project}
    ])
separable_artworks_groups = db.Artwork.aggregate([
    {"$match": {"type": 'separable'}},
    # group one document per ensemble: the $group _id must be the '$ensemble_id'
    # expression; the bare string 'ensemble_id' would collapse everything into one group
    {'$group': dict([('_id', '$ensemble_id')] + [(k, {'$first': '$%s' % k}) for k, v in project.items()])},
    {"$project": project}
    ])


artworks = list(unique_artworks)+list(separable_artworks_groups)

for artwork in artworks:
    # cleaning acquisition mode
    artwork['new_acquisition_mode'] = cleaning.acquisition_mode_cleaning(artwork['acquisition_mode'])
    # creation date
    artwork['creation_year'] = cleaning.creation_date_cleaning(artwork['date_creation']) if 'date_creation' in artwork else ''
    try:
        artwork['acq_crea_diff'] = int(artwork['acquisition_year']) - int(artwork['creation_year'])
    except (KeyError, ValueError, TypeError):
        artwork['acq_crea_diff'] = None
    artwork['authors'] = '|'.join(artwork['authors'])


with open("unique_artworks.csv", "w") as f:
    artworks_csv = csvkit.DictWriter(f, fieldnames=headers.values())
    artworks_csv.writeheader()
    # setting human readable column names
    artworks_csv.writerows(({headers[k]: v for k, v in artwork.items()} for artwork in artworks))
Example #21
def analyzeAllEEG():
    """
    reads the entire table of eeg reports (input_file_name) which is a csv file and creates a new csv file with new columns
    which will include the likelihood score of it containing seizures/ed. 
    :return: nothing. this function writes to the outfile
    """
    i = 1
    with open(input_file_name) as cf:
        # needed to replace null lines
        reader = csv.DictReader(x.replace('\0', '') for x in cf)
        # optional starting offset in case program crashes while analyzing
        # for line in reader:
        #    i += 1
        #    if (i>9425):
        #        i=1
        #        break

        outfieldnames = reader.fieldnames
        outfieldnames.append(
            'notebody')  # body of the procedure note, excludes impression
        outfieldnames.append('examno')  # exam number ie A033
        outfieldnames.append('impressionBody')  # impression block
        outfieldnames.append(
            'impression'
        )  # just says whether or not the eeg report was abnormal or normal
        # the algorithm only looks at specific commonly used phrases
        outfieldnames.append(
            'notetype')  # spot v ambulatory v long term monitoring...
        outfieldnames.append(
            'duration'
        )  # duration in the format of Days, Hours:Minutes:Seconds
        outfieldnames.append(
            'impressionType'
        )  # ed, dc, sz - marks each report with these tags based on the sentiment score
        # the next two columns of features are based on a better algorithm that tries to understand the
        # syntax of each sentence instead of just looking for specific phrases (dubbed "sentiment" analysis)
        outfieldnames.append('epileptiformScore')
        outfieldnames.append('seizureScore')
        writer = csv.DictWriter(out_file,
                                fieldnames=outfieldnames,
                                restval='*')
        writer.writeheader()

        for line in reader:
            i += 1
            eeg_no = ""
            m = re_eegno.search(line['note'])
            # write the notebody and impression
            # impressions are determined by matching for specific phrases
            # this method was not very reliable
            if m:
                line['notebody'] = line['note'][:m.start()]
                eeg_no = m.group('eegno')
                findTrueImpression(eeg_no, line, i)
                setImpressionBody(m, line, i)
            else:
                # try a looser find that may introduce more false information (more sensitive less specific pattern)
                m = re_eegnoLoose.search(line['note'])
                if m:
                    line['notebody'] = line['note'][:m.start()]
                    eeg_no = m.group('eegno')
                    findTrueImpression(eeg_no, line, i)
                elif "preliminary" in line['note'].lower():
                    line['impression'] = "prelim"
                elif "prelim" in line['note'].lower():
                    line['impression'] = "prelim"
                #else:
                #print("###IMPRESSION BLOCK MISSING-" + repr(i) + line['note'])
                #print()

            # figure out the duration by looking up the duration of the study based on the exam number from NK database
            m = re_eegnoType.search(line['note'])
            line['duration'] = None
            if m:
                line['duration'] = masterCSV_daterange(m.group(0))
            if line['duration'] is None:
                # write the duration by some other messier means through regular expression matching
                m = re_eegnoDuration.search(line['note'])
                if m:
                    match = m.group(1)
                    line['duration'] = match
                else:
                    m = re_eegnoDuration2.search(line['note'])
                    if m:
                        match = m.group(1)
                        if "hour" in match:
                            line['duration'] = m.group(2) + ':00:00'
                        elif "day" in match:
                            line['duration'] = repr(
                                int(m.group(3)) * 24) + ':00:00'
                    else:
                        m = re_eegnoDuration3.search(line['note'])
                        if m:
                            line['duration'] = durationByDates(
                                m.group(1), m.group(2))
                        else:
                            m = re_eegnoDuration4.search(line['note'])
                            if m:
                                line['duration'] = durationByHoursHopefully(
                                    line['note'])
                            else:
                                m = re_eegnoDuration5.search(line['note'])
                                if m:
                                    print(m.group(1))
                                    line['duration'] = durationByDates(
                                        m.group(1), m.group(2))

            # write the notetype
            m = re_eegnoType.search(line['note'])
            if m:
                match = m.group(0)
                match = match.lower()
                line['examno'] = match
                if ("s" in match):
                    line['notetype'] = "spot"
                elif ("v" in match):
                    line['notetype'] = "ceeg"
                elif ("a" in match):
                    line['notetype'] = "ambu"
                elif ("f" in match):
                    line['notetype'] = "spot inpt"
                elif ("e" in match):
                    line['notetype'] = "ceeg"
                else:
                    line['notetype'] = "unk"

            # determine procedure type by duration
            # make sure ambulatories are >24
            dtformat = '%H:%M:%S'
            try:
                t = datetime.datetime.strptime(line['duration'], dtformat)
                totMinutes = t.minute + t.hour * 60
                # determine the unknown report based off the duration only
                if line['duration'] is not None:
                    if line['notetype'] is None:
                        if 18 < totMinutes < 100:
                            line['notetype'] = "spot?"
                if "day" not in line['duration']:
                    if "ambu" is line['notetype']:
                        if totMinutes < 1320 and totMinutes > 1:
                            line['notetype'] = "longspot"
            except (KeyError, TypeError, ValueError):
                pass  # missing or unparseable duration/notetype

            # write sentiment score
            line['seizureScore'] = sentimentAnalysisForSeizure(
                line['impressionBody'])
            line['epileptiformScore'] = sentimentAnalysisForEpileptiform(
                line['impressionBody'])

            # write the abnormality type
            line['impressionType'] = ""
            if int(line['seizureScore']) < 0:
                line['impressionType'] += ' sz'
            if int(line['epileptiformScore']) < 0:
                line['impressionType'] += ' ed'

            writer.writerow(line)
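On the try/except just fixed above: datetime.strptime raises TypeError when handed None and ValueError for strings that do not match the format, which is exactly what the duration parsing has to absorb. A small self-contained sketch of that guard (the helper name is hypothetical):

import datetime

def minutes_from_duration(duration):
    """Parse 'H:M:S' into total minutes; return None for other shapes."""
    try:
        t = datetime.datetime.strptime(duration, '%H:%M:%S')
    except (TypeError, ValueError):
        # TypeError when duration is None, ValueError for e.g. '2 days, 01:00:00'
        return None
    return t.minute + t.hour * 60

assert minutes_from_duration('02:30:00') == 150
assert minutes_from_duration(None) is None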
Example #22
comments = retrieve_comments(service,
                             '1cPuHCoLshw_srl-OG7dzdk4kDr7mwPGUUgW55JRPQlk')

output = []

slugs = {
    'Jennifer Lee': 'jennifer-lee',
    'Nikole Hannah-Jones': 'nikole-hannah-jones',
    'Clay Risen': 'clay-risen',
    'Nicholas Espiritu': 'nicholas-espiritu',
    'Samuel Bagenstos': 'samuel-bagenstos',
    'Anonymous': 'nina-totenberg',
    'PHS': 'phs',
    'awheeler': 'mark-updegrove',
}

comments = sorted(comments, key=lambda c: c['anchor'])

for comment in comments:
    d = {}
    name = comment['author']['displayName']

    d['author_key'] = slugs[name]
    d['content'] = comment['content']
    d['cited'] = comment['context']['value']
    output.append(d)

with open('comments.csv', 'w') as f:
    writer = csv.DictWriter(f, ['author_key', 'content', 'cited'])
    writer.writeheader()
    writer.writerows(output)
Example #23
import csvkit

with open('../../csv_data/sources.csv', 'r') as sf:
    sources = csvkit.DictReader(sf)
    with open('new_sources.csv', 'r') as nsf:
        new_sources = csvkit.DictReader(nsf)
        sourcesSlugs = set(s['slug'] for s in sources)
        newSourcesSlugs = set(s['slug'] for s in new_sources)
        inSourceNotInNew = sourcesSlugs - newSourcesSlugs
        inNewNotInSource = newSourcesSlugs - sourcesSlugs
        with open('sourceTroubles.csv', 'w') as of:
            sourceTroubles = csvkit.DictWriter(of, ['source', 'set'])
            sourceTroublesData = [{
                'source': s,
                'set': 'inSourceNotInNew'
            } for s in inSourceNotInNew]
            sourceTroublesData += [{
                'source': s,
                'set': 'inNewNotInSource'
            } for s in inNewNotInSource]
            sourceTroublesData = sorted(sourceTroublesData,
                                        key=lambda e: e['source'])
            sourceTroubles.writeheader()
            sourceTroubles.writerows(sourceTroublesData)
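A caveat worth noting in scripts like the one above: a DictReader is an iterator over the underlying file, so building set(s['slug'] for s in sources) exhausts it, and a second pass would need list(reader) up front or a seek(0). A tiny Python 3 stdlib sketch:

import csv
import io

buf = io.StringIO("slug\na\nb\n")
reader = csv.DictReader(buf)
slugs = set(row['slug'] for row in reader)
assert slugs == {'a', 'b'}
assert list(reader) == []  # the reader is now exhausted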
Example #24
import requests
import time
import csvkit as csv
from bs4 import BeautifulSoup as Soup

URL = 'http://espn.go.com/golf/leaderboard11/controllers/ajax/playerDropdown'

fieldnames = ['playerId', 'tournamentId', 'hole', 'par', 'score']
outfile = open('data_out.csv', 'w+')
outwriter = csv.DictWriter(outfile, fieldnames=fieldnames)
outwriter.writeheader()


def parseTable(table, playerId, tournamentId):
    data = [r.find_all('td') for r in table.find_all('tr')]
    out = []
    for i, x in enumerate(data[0]):
        if not x.text.isdigit():
            continue

        # skip any weird values
        try:
            int(data[2][i].text)
        except ValueError:
            continue

        row = {
            'playerId': playerId,
            'tournamentId': tournamentId,
            'hole': int(x.text),
            'par': int(data[1][i].text),
Example #25
}

project = {
    'type':1,
    'authors_birth_death':1,
    'name.notice':1,
    'name_complement':1,
    'gender':1,
    'nationality':1,
    'artworks':1
}

# unique authors
authors = list(db.Author.aggregate([
    {"$project":project}
    ]))
for author in authors:
    # parse birth/death years into separate fields
    if 'authors_birth_death' in author:
        author.update(cleaning.artist_birthdeath_parsing(author['authors_birth_death']))
        del author['authors_birth_death']
    author['artworks'] = '|'.join(author['artworks'])
    author['name.notice'] = author["name"]["notice"]
    del author["name"]

    
with open("authors.csv", "w") as f:
    authors_csv = csvkit.DictWriter(f,fieldnames = headers.values())
    authors_csv.writeheader()
    # setting human readable column names
    authors_csv.writerows(( { headers[k]:v for k,v in author.items()} for author in authors))
Example #26
            int(len(artists_profils) * artists_random_select_percentage))
    print "filtered artists to %s" % len(artists_profils)
    artists_comparision_matrix = []
    print "computing..."
    counter = 0
    # n! / r! / (n-r)!
    total = math.factorial(len(artists_profils)) / math.factorial(
        2) / math.factorial(len(artists_profils) - 2)
    print "computing... %s combinations" % total
    for artists in itertools.combinations(artists_profils, 2):
        counter += 1
        if counter % 10000 == 0:
            print "processed %s combinations %.4f%%" % (counter,
                                                        counter / total * 100)
            with open(output_filename, "w") as ff:
                output = csvkit.DictWriter(
                    ff, fieldnames=['Source', 'Target', 'Weight'])
                output.writeheader()
                output.writerows(artists_comparision_matrix)

        kval = compare(artists)
        #print kval
        artists_comparision_matrix.append({
            "Source": artists[0]["name"],
            "Target": artists[1]["name"],
            "Weight": kval
        })
    print "done"
    with open(output_filename, "w") as ff:
        output = csvkit.DictWriter(ff,
                                   fieldnames=['Source', 'Target', 'Weight'])
        output.writeheader()