Code example #1
File: script.py  Project: floriano76/verdinha
def generateDoacao(arquivo):
    '''Uses the ReceitaCand.txt files from the 2010 campaign finance reports:
    http://agencia.tse.jus.br/estatistica/sead/odsele/prestacao_contas/prestacao_contas_2010.zip
    '''
    doacoes_file = open(arquivo, 'r')
    doacoes_raw = csvkit.DictReader(doacoes_file,
                                    encoding='iso-8859-1',
                                    delimiter=';')

    r = {}
    for d in doacoes_raw:
        _id = d['CPF do candidato']
        if _id not in r:
            r[_id] = {
                '_id': _id,
                'nome': d['Nome candidato'],
                'numero': d[u'Número candidato'],
                'partido': d['Sigla Partido'],
                'uf': d['UF'],
                'doacoes': {},
                'total': 0
            }

        # donation amounts use a decimal comma, e.g. '1234,56'
        valor = float(d['Valor receita'].replace(',', '.'))
        r[_id]['total'] += valor
        doador = d['CPF/CNPJ do doador']
        if doador not in r[_id]['doacoes']:
            r[_id]['doacoes'][doador] = {
                'nome': d['Nome do doador'],
                'valor': valor
            }
        else:
            r[_id]['doacoes'][doador]['valor'] += valor

    print('Saving...')
    mongo_save(r)
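Several of these snippets convert decimal-comma currency strings by hand. A minimal sketch of that conversion pulled out as a reusable helper (the sample values are made up for illustration):

def parse_decimal_comma(s):
    """Convert a decimal-comma string like '1.234,56' to a float."""
    # drop thousands separators, then swap the decimal comma for a dot
    return float(s.replace('.', '').replace(',', '.'))

assert parse_decimal_comma('1.234,56') == 1234.56
assert parse_decimal_comma('70,00') == 70.0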
Code example #2
File: theme_evolution_data.py  Project: medialab/MNAM
def create_by_creationdate_small(collection, filters):
    tag_date = defaultdict(int)

    with open(collection, 'r') as f:
        reader = csvkit.DictReader(f)
        for row in reader:
            c_date = row['Year creation']
            c_tag = row['tag_thema'].split(', ')

            # skip rows without a usable year (non-numeric or pre-1868)
            if not c_date.isdigit() or int(c_date) < 1868:
                continue
            # single-tag and multi-tag rows are handled the same way
            for t in c_tag:
                if t in filters:
                    tag_date[(t, c_date)] += 1

    # Write output file
    with open('theme_creationDate_small.csv', 'w') as f:
        writer = csvkit.DictWriter(
            f, fieldnames=['Theme', 'Year creation', 'Weight'])
        writer.writeheader()
        for k in tag_date:
            output_row = {
                'Theme': k[0],
                'Year creation': '01/01/%s' % k[1],
                'Weight': tag_date[k]
            }
            writer.writerow(output_row)
Code example #3
    def _open_data(self, in_file):
        _regions = {}
        with open(in_file) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                _regions[row[self.key]] = row

            return _regions
Code example #4
File: test_csvkit.py  Project: xtmgah/csvkit
    def test_reader_alias(self):
        reader = csvkit.DictReader(self.f)

        self.assertEqual(next(reader), {
            u'a': u'1',
            u'b': u'2',
            u'c': u'3'
        })
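The test above checks that csvkit exposes DictReader as an alias that behaves like the standard library reader. For comparison, a sketch of the same first-row read with the stdlib directly (the file name data.csv and its contents are assumed for illustration):

import csv

# data.csv is assumed to contain:
# a,b,c
# 1,2,3
with open('data.csv', newline='') as f:
    reader = csv.DictReader(f)
    print(next(reader))  # -> {'a': '1', 'b': '2', 'c': '3'}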
Code example #5
def iterate_and_parse_files(sub_directory):
    """ Merge all csv files in folder to list of dicts
    """
    data = DictList()
    folder = os.path.join(os.getcwd(), sub_directory)
    for file_name in os.listdir(folder):
        # os.path.join avoids the missing separator bug of folder + file_name
        with open(os.path.join(folder, file_name)) as f:
            data += list(csv.DictReader(f))
    return data
Code example #6
def merge_csv_files(filelist):
    """ Pass a list of csv files and merge to DictList
    """
    data = tables.DictList()
    for file_name in filelist:
        # a with block ensures each file handle is closed after reading
        with open(file_name) as f:
            data += list(csv.DictReader(f))

    return data
Code example #7
File: artwork_tagging.py  Project: medialab/MNAM
def get_artist_names(authors, collection):
    '''Map artist ids to artist names using the authors csv.
    '''
    id_names_map = defaultdict(str)
    with open(authors, 'r') as f:
        reader = csvkit.DictReader(f)
        for row in reader:
            id_names_map[row['Id artist']] = row['name']
    return id_names_map
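The defaultdict(str) means an unknown artist id falls back to an empty string instead of raising KeyError; a quick sketch of that behavior:

from collections import defaultdict

names = defaultdict(str)
names['42'] = 'Alice'
print(names['42'])       # -> Alice
print(names['unknown'])  # -> '' (empty string, no KeyError)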
Code example #8
def importaPrestacoes2014(arquivo, mugshot):
    from pymongo import MongoClient
    client = MongoClient()
    db = client.verdinha
    col = db.politicos

    def parse_valor(s):
        # values look like 'R$ 1.234,56': strip the currency prefix, drop
        # thousands separators, then swap the decimal comma for a dot
        return float(s.strip('R$ ').replace('.', '').replace(',', '.'))

    raw = open(arquivo, 'r')
    doacoes_raw = csvkit.DictReader(raw, encoding='iso-8859-1', delimiter=';')
    c = next(doacoes_raw)
    if 'prestacao' in c:
        # print(arquivo + " not filed")
        return None
    c['Nome do Candidato'] = unidecode.unidecode(c['Nome do Candidato'])
    p = col.find_one({'nome': c['Nome do Candidato']})
    if p:
        raw.seek(0)  # rewind!
        doacoes_raw = csvkit.DictReader(raw, encoding='iso-8859-1', delimiter=';')
        for d in doacoes_raw:
            # p['mugshot'] = mugshot
            if '2014' not in p['candidaturas']:
                p['candidaturas']['2014'] = {
                    'ano': 2014,
                    'cargo': d['Candidatura'],
                    'situacao': 'Candidato',
                    'numero': d[u'Número do Candidato'],
                    'partido': d['Partido'],
                    'uf': estados[d['Unidade Eleitoral']],
                    'doacoes': {},
                    'total': 0
                }

            p['candidaturas']['2014']['total'] += parse_valor(d['Valor R$'])
            cnpj_id = d['CPF/CNPJ'].replace('/', '').replace('-', '').replace('.', '')
            if cnpj_id not in p['candidaturas']['2014']['doacoes']:
                p['candidaturas']['2014']['doacoes'][cnpj_id] = {
                    'nome': d['Doador'],
                    'cnpj': d['CPF/CNPJ'],
                    'valor': parse_valor(d['Valor R$'])
                }
                # todo: original donor
            else:
                p['candidaturas']['2014']['doacoes'][cnpj_id]['valor'] += parse_valor(d['Valor R$'])
            col.update({'_id': p['_id']}, p, upsert=True)
Code example #9
File: artwork_tagging.py  Project: medialab/MNAM
def keyword_list(collection):
    '''Returns the list of theme keywords used in a collection.
    '''
    keywords = set()
    with open(collection, 'r') as f:
        reader = csvkit.DictReader(f)

        for row in reader:
            keywords.update(row['themas'].split(', '))

    return list(keywords)
Code example #10
File: artwork_tagging.py  Project: medialab/MNAM
def get_tags_from(gephi_csv):
    '''Returns the tags from gephi.csv.
    '''
    tags = set()
    with open(gephi_csv, 'r') as f:
        reader = csvkit.DictReader(f)

        for row in reader:
            # add() the whole tag string; set.update() on a string would
            # insert its individual characters
            tags.add(row['tag'])

    return list(tags)
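The add() versus update() distinction above matters because a string is itself iterable; a quick sketch of the pitfall:

tags = set()
tags.update('painting')  # iterates the string, adding single characters
print(sorted(tags))      # -> ['a', 'g', 'i', 'n', 'p', 't']

tags = set()
tags.add('painting')     # inserts the string as one element
print(tags)              # -> {'painting'}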
Code example #11
def load_forms():
    """
    Load all the FilingForm objects from the source CSV.
    """
    this_dir = os.path.dirname(__file__)

    # Read in forms
    form_path = os.path.join(this_dir, 'forms.csv')
    with open(form_path, 'r') as form_obj:
        form_reader = csvkit.DictReader(form_obj)
        form_list = [FilingForm(**row) for row in form_reader]

    # Read in sections
    section_path = os.path.join(this_dir, 'sections.csv')
    with open(section_path, 'r') as section_obj:
        section_reader = csvkit.DictReader(section_obj)
        for section in section_reader:
            form = next((x for x in form_list if x.id == section['form_id']))
            form.add_section(**section)

    # Pass it out
    return form_list
Code example #12
File: import_crs_2015.py  Project: DeppSRL/open-aid
    def handle_label(self, crs_filename, **options):
        verbosity = options['verbosity']
        if verbosity == '0':
            self.logger.setLevel(logging.ERROR)
        elif verbosity == '1':
            self.logger.setLevel(logging.WARNING)
        elif verbosity == '2':
            self.logger.setLevel(logging.INFO)
        elif verbosity == '3':
            self.logger.setLevel(logging.DEBUG)

        # The arguments provided are the names of the CRS files to process
        self.logger.info(u"Start import %s" % YEAR)
        start_time = time.time()
        i = 0

        if options.get('clean') and self.delete_projects():
            raise CommandError("Import aborted")

        self.all_codelists = dict([(cl.code_list,
                                    dict(cl.objects.values_list('code', 'pk')))
                                   for cl in codelist_models.CODE_LISTS])

        rows = projects = activities = 0
        set_autocommit(False)
        try:
            with open(crs_filename, 'r') as crs_file:
                for rows, activity in enumerate(csvkit.DictReader(crs_file),
                                                start=1):
                    activity, new_project = self.load_activity(activity, rows)
                    if activity:
                        activities += 1
                        self.logger.debug("Imported row: %d" % (activities))
                        if new_project:
                            projects += 1
                    if rows % 50 == 0:
                        commit()
        except KeyboardInterrupt:
            commit()
            self.logger.critical("Command execution aborted.")
        finally:
            self.logger.info("Total projects: %d" % projects)
            self.logger.info("Total activities: %d" % activities)
            self.logger.info("Total rows: %d" % rows)
            self.logger.info("Execution time: %d seconds" %
                             (time.time() - start_time))
            commit()

        self.logger.info(u"Finish import %s" % YEAR)
Code example #13
File: FedericoTena.py  Project: CDonnees/ricardo
def import_flows(filename, imp_exp, c):
    with open(filename) as f:
        importscsvs = csvkit.DictReader(f)
        for line in importscsvs:
            year = line["year"]
            for reporting, flow in line.items():
                if flow != "":
                    try:
                        flow = float(flow.replace(",", "."))
                    except ValueError:
                        print(year, reporting, "'%s'" % flow)
                        continue
                    # skip the year column and 0 values
                    if reporting != "year" and flow != 0.0:
                        data = ["FEDERICO-TENA", flow, "1000000", "sterling pound", int(year), reporting, "World Federico-Tena", imp_exp, "gen", "total_federicotena"]
                        c.execute("INSERT INTO flows (source, flow, unit, currency, year, reporting, partner, export_import, special_general, world_trade_type) VALUES (?,?,?,?,?,?,?,?,?,?)", data)
Code example #14
File: dimension.py  Project: jensfinnas/smhi_scraper
    def _categories_from_file(self):
        """ Get categories from cached file
        """
        categories = {}
        file_path = os.path.join("vantetider/data", self.dataset.id,
                                 self.id + ".csv")
        # pass the encoding to open(); csv.DictReader takes no encoding
        # argument in Python 3
        with open(file_path, encoding="utf-8") as f:
            for row in csv.DictReader(f):
                cat = Category(row["id"], label=row["label"])
                for attr, value in row.items():
                    if attr not in ["id", "label"]:
                        setattr(cat, attr, value)

                categories[cat.id] = cat

        return categories
Code example #15
File: FedericoTena.py  Project: CDonnees/ricardo
def import_fredericotena(c):
    FT_PATH = "in_data/FredericoTena"
    ENTITIES_CSV = "FredericoTena_entities.csv"
    IMPORTS_CSV = "FredericoTena_imports.csv"
    EXPORTS_CSV = "FredericoTena_exports.csv"

    # create source
    source_id = "FEDERICO-TENA"
    source_authors = "Federico G. & A. Tena-Junguito"
    source_type = "estimation"
    source_edition_year = "2016"
    source_url = "http://www.ehes.org/EHES_93.pdf"
    source_title = "World trade, 1800-1938: a new data-set, EHES Working Paper 93"
    c.execute("INSERT INTO source_types (acronym,reference,type,author,URL) VALUES (?,?,?,?,?)", (source_id, source_title, source_type, source_authors, source_url))
    c.execute("INSERT INTO sources (slug,acronym,name,edition_date) VALUES (?,?,?,?)", (source_id, source_id, source_title, source_edition_year))
    print("created FT source")

    # read entities

    ricslug = lambda name: re.sub("[ ()/]", "", re.sub("&", "_", name))

    with open(os.path.join(FT_PATH, ENTITIES_CSV)) as f:
        entitiescsv = csvkit.DictReader(f)
        for entity in entitiescsv:
            if entity["new"] != "":
                # create new entities
                print("inserting new entity %s" % entity["ricname"])
                # todo: add continent
                c.execute("INSERT OR IGNORE INTO RICentities (RICname,type,continent,COW_code,slug) VALUES (?,?,?,?,?)", (entity["ricname"], entity["rictype"], "?", entity["cow"], ricslug(entity["ricname"])))
                # todo: check for the group
            c.execute("INSERT OR IGNORE INTO entity_names (original_name,RICname) VALUES (?,?) ", (entity["Polity Federico-Tena"], entity["ricname"]))

    # add the World Federico-Tena entity
    c.execute("INSERT OR IGNORE INTO entity_names (original_name,RICname) VALUES (?,?) ", ("World Federico-Tena", "World Federico-Tena"))
    c.execute("""INSERT OR IGNORE INTO RICentities (RICname,type,continent,slug) VALUES ("World Federico-Tena","geographical_area","World", "WorldFedericoTena")""")

    # read imports
    import_flows(os.path.join(FT_PATH, IMPORTS_CSV), "imp", c)
    # read exports
    import_flows(os.path.join(FT_PATH, EXPORTS_CSV), "exp", c)
Code example #16
    def handle_label(self, crs_file, **options):

        start_time = time.time()
        i = 0
        translations = 0
        field = options['field']

        languages = [lang[0].split('-')[0] for lang in settings.LANGUAGES]
        if options['lang']:
            if options['lang'] not in languages:
                raise CommandError("Invalid language code '%s'. Try: %s" %
                                   (options['lang'], ', '.join(languages)))
            languages = [
                options['lang'],
            ]

        self.stdout.write('FIELD: %s' % field)
        self.stdout.write('LANGUAGES: %s' % languages)

        with open(crs_file, 'r') as crs_file:

            rows = csvkit.DictReader(crs_file, encoding='utf-8')

            for i, row in enumerate(rows, start=1):
                updates, matches = self.translate(row,
                                                  field,
                                                  languages,
                                                  override=options['override'])
                if matches == 0:
                    self.stdout.write(
                        "\rRow %d non corrisponde a nessuna Activity" % (i))
                else:
                    self.stdout.write("\r%s: Translated activities %d    " %
                                      (i, updates),
                                      ending='')
                    self.stdout.flush()
                translations += updates

        self.stdout.write("\nTotal rows: %d" % i)
        self.stdout.write("Execution time: %d seconds" %
                          (time.time() - start_time))
Code example #17
def import_flows(filename, imp_exp, c, ft_entities, ft_rates, ft_source):
    with open(filename) as f:
        importscsvs = csvkit.DictReader(f)
        for line in importscsvs:
            year = line["year"]
            for reporting, flow in line.items():
                if flow != "":
                    try:
                        flow = float(flow.replace(",", ".")) * ft_rates[year]
                    except (ValueError, KeyError):
                        print(year, reporting, "'%s'" % flow)
                        continue
                    # skip the year column and 0 values
                    if reporting != "year" and flow != 0.0:
                        reporting = reporting.strip().lower()
                        if reporting in ft_entities:
                            data = [ft_source, flow, "1000000", "us dollar", int(year), reporting, "World Federico-Tena", imp_exp, "gen", "total_federicotena"]
                            c.execute("INSERT INTO flows (source, flow, unit, currency, year, reporting, partner, export_import, special_general, world_trade_type) VALUES (?,?,?,?,?,?,?,?,?,?)", data)
                            data = ["us dollar", int(year), reporting, "us dollar"]
                            c.execute("INSERT OR IGNORE INTO currencies (currency, year, reporting, modified_currency) VALUES (?,?,?,?)", data)
                        else:
                            print("MISSING '%s' in ft entities" % reporting)
Code example #18
File: artwork_tagging.py  Project: medialab/MNAM
def tag_artworks(collection, output, keyword_tag_map):
    '''Tag all of the artworks of a collection using the gephi csv.
    Also adds the name of the author.
    '''
    with open(collection, 'r') as fi:
        with open(output, 'w') as fo:

            # Init csv reader
            reader = csvkit.DictReader(fi)
            header = reader.fieldnames

            # Init new csv with updated header
            #header += ['tag_ico', 'tag_theme', 'tag_mat']
            header.append('tag_thema')
            header.append('name')
            writer = csvkit.DictWriter(fo, fieldnames=header)
            writer.writeheader()

            id_names_map = get_artist_names(
                '/home/akira/Documents/Dev/Datasprint/authors.csv', collection)

            for input_row in reader:
                # Init dict to write
                output_row = dict.fromkeys(header)
                output_row.update(input_row)

                # Tag
                thema_words = input_row['themas'].split(', ')
                tags = get_tags_from_field(thema_words, keyword_tag_map)
                output_row['tag_thema'] = ', '.join(tags)

                # Add author's name
                output_row['name'] = id_names_map[
                    input_row['Id artists']].replace('"', '')

                # Write row
                writer.writerow(output_row)
Code example #19
def run():
    from openaid.projects.models import Initiative, Project
    for i, row in enumerate(csvkit.DictReader(open('initiatives_full.csv')),
                            start=1):

        initiative, created = Initiative.objects.get_or_create(
            code=row['code'].zfill(6),
            defaults={
                'title_it': row['title'],
                'country':
                row['country'] if row['country'] != '(vuoto)' else '',
                'total_project_costs': row['total'],
                'grant_amount_approved': row['grant'],
                'loan_amount_approved': row['loan'],
            })

        projects = Project.objects.filter(number__startswith='%s/' %
                                          initiative.code).update(
                                              initiative=initiative)

        print('%d] Created %s%s' % (i, repr(initiative),
                                    (' associated with %d projects' %
                                     projects) if projects else ''))
Code example #20
File: theme_evolution_data.py  Project: medialab/MNAM
def create_by_acquisitiondate_acquisitionmode(collection):
    '''Count artworks per (theme tag, acquisition mode, acquisition year).
    '''
    tag_acq_date = defaultdict(int)

    with open(collection, 'r') as f:
        reader = csvkit.DictReader(f)
        for row in reader:
            c_date = row['Year acquisition']
            c_tag = row['tag_thema'].split(', ')
            c_mode = row['Mode acquisition (new categories)']
            # skip rows without a usable year or acquisition mode
            if not c_date.isdigit() or c_mode == '' or int(c_date) < 1868:
                continue
            # single-tag and multi-tag rows are handled the same way
            for t in c_tag:
                tag_acq_date[(t, c_mode, c_date)] += 1

    # Write output file
    with open('theme_acqDate_acqMode_date.csv', 'w') as f:
        writer = csvkit.DictWriter(f,
                                   fieldnames=[
                                       'Theme', 'Year acquisition',
                                       'Mode acquisition (new categories)',
                                       'Weight'
                                   ])
        writer.writeheader()
        for k in tag_acq_date:
            output_row = {
                'Theme': k[0],
                'Year acquisition': '01/01/%s' % k[2],
                'Mode acquisition (new categories)': k[1],
                'Weight': tag_acq_date[k]
            }
            writer.writerow(output_row)
Code example #21
    def handle_label(self, crs_filename, **options):
        """
        Gli argomenti forniti sono i nomi dei file CRS da lavorare
        """
        start_time = time.time()
        i = 0

        if options.get('clean') and not self.delete_projects():
            raise CommandError("Import aborted")

        self.all_codelists = dict([(cl.code_list,
                                    dict(cl.objects.values_list('code', 'pk')))
                                   for cl in codelist_models.CODE_LISTS])

        rows = projects = activities = 0

        try:
            with open(crs_filename, 'r') as crs_file:
                for rows, activity in enumerate(csvkit.DictReader(crs_file),
                                                start=1):
                    activity, new_project = self.load_activity(activity, rows)
                    if activity:
                        activities += 1
                        self.stdout.write("\rImported row: %d" % (activities),
                                          ending='')
                        self.stdout.flush()
                        if new_project:
                            projects += 1
        except KeyboardInterrupt:
            self.stdout.write("\nCommand execution aborted.")
        finally:
            self.stdout.write("\nTotal projects: %d" % projects)
            self.stdout.write("Total activities: %d" % activities)
            self.stdout.write("Total rows: %d" % rows)
            self.stdout.write("Execution time: %d seconds" %
                              (time.time() - start_time))
Code example #22
import csvkit
import usaddress

# expected format in input.csv: first column 'id', second column 'address'
with open('input.csv', 'r') as f:
    reader = csvkit.DictReader(f)

    all_rows = []
    for row in reader:
        try:
            parsed_addr = usaddress.tag(row['address'])
            row_dict = parsed_addr[0]
        except Exception:
            row_dict = {'error': 'True'}

        row_dict['id'] = row['id']
        all_rows.append(row_dict)

field_list = [
    'id', 'AddressNumber', 'AddressNumberPrefix', 'AddressNumberSuffix',
    'BuildingName', 'CornerOf', 'IntersectionSeparator', 'LandmarkName',
    'NotAddress', 'OccupancyType', 'OccupancyIdentifier', 'PlaceName',
    'Recipient', 'StateName', 'StreetName', 'StreetNamePreDirectional',
    'StreetNamePreModifier', 'StreetNamePreType', 'StreetNamePostDirectional',
    'StreetNamePostModifier', 'StreetNamePostType', 'SubaddressIdentifier',
    'SubaddressType', 'USPSBoxGroupID', 'USPSBoxGroupType', 'USPSBoxID',
    'USPSBoxType', 'ZipCode', 'error'
]

with open('output.csv', 'w') as outfile:
    writer = csvkit.DictWriter(outfile, field_list)
    writer.writeheader()
    writer.writerows(all_rows)
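usaddress.tag returns a (components, address_type) pair, which is why parsed_addr[0] above takes only the parsed fields. A quick sketch with a made-up address:

import usaddress

components, addr_type = usaddress.tag('123 Main St. Suite 100 Chicago, IL')
print(addr_type)                # -> 'Street Address'
print(components['PlaceName'])  # -> 'Chicago'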
Code example #23
import csvkit

with open('../../csv_data/sources.csv', 'r') as sf:
    sources = csvkit.DictReader(sf)
    with open('new_sources.csv', 'r') as nsf:
        new_sources = csvkit.DictReader(nsf)
        sourcesSlugs = set(s['slug'] for s in sources)
        newSourcesSlugs = set(s['slug'] for s in new_sources)
        inSourceNotInNew = sourcesSlugs - newSourcesSlugs
        inNewNotInSource = newSourcesSlugs - sourcesSlugs
        with open('sourceTroubles.csv', 'w') as of:
            sourceTroubles = csvkit.DictWriter(of, ['source', 'set'])
            sourceTroublesData = [{
                'source': s,
                'set': 'inSourceNotInNew'
            } for s in inSourceNotInNew]
            sourceTroublesData += [{
                'source': s,
                'set': 'inNewNotInSource'
            } for s in inNewNotInSource]
            sourceTroublesData = sorted(sourceTroublesData,
                                        key=lambda e: e['source'])
            sourceTroubles.writeheader()
            sourceTroubles.writerows(sourceTroublesData)
Code example #24
def update_crsids(filename):
    for row in csvkit.DictReader(open(filename)):
        activity_id = row['openaid_id']
        new_crsid = row['CRSID-OK']
        initiative_number = row['Initiative number']
        # clean number
        if len(initiative_number.split(' ')) > 0:
            initiative_number = initiative_number.split(' ')[0]
        if len(initiative_number) > 0:
            initiative_number = initiative_number.zfill(6)
        project_number = '/'.join([initiative_number, row['projectnumber']])

        updates_markers = False

        try:
            activity = Activity.objects.get(pk=activity_id)
        except Activity.DoesNotExist:
            print('- Could not find Activity.pk = %s' % activity_id)
            continue

        try:

            new_project = Project.objects.get(
                crsid=new_crsid, recipient__code=activity.recipient.code)

            try:
                conflict_activity = new_project.activity_set.get(
                    year=activity.year)

                if conflict_activity == activity:
                    continue

                _, updates_markers = conflict_activity.merge(activity,
                                                             save=False)
                activity, conflict_activity = conflict_activity, activity
                print('- Deleting %s after the merge into %s' % (
                    repr(conflict_activity), repr(activity)))
                conflict_activity.delete()

            except Activity.DoesNotExist:
                pass

        except Project.DoesNotExist:

            new_project = Project.objects.create(
                crsid=new_crsid,
                recipient=activity.recipient,
                start_year=activity.year,
                end_year=activity.year,
                number=project_number,
            )

            print(
                '- New project for Activity %s not found with newCRSID:%s'
                % (repr(activity), new_crsid))

        finally:
            activity.crsid = new_crsid
            activity.project = new_project
            activity.number = project_number
            if updates_markers:
                activity.markers.save()
            if project_number:
                activity.number = project_number
                new_project.number = project_number
                try:
                    initiative = Initiative.objects.get(
                        code=new_project.number.split('/')[0])
                    new_project.initiative = initiative
                except Initiative.DoesNotExist:
                    print('- No Initiative found with code: %s' % (
                        project_number))
            activity.save()

            new_project.update_from_activities(save=True)

            # print('- %s updated' % repr(activity))

    # delete all projects without any Activity
    qs = Project.objects.annotate(activities=Count('activity')).filter(
        activities=0)
    print('Deleting %s Projects without any Activity' % (qs.count(),))
    qs.delete()
Code example #25
# cursor to the sqlite database

conn = sqlite3.connect("../../sqlite_data/RICardo.sqlite")
c = conn.cursor()
# nb_flows by source slug
c.execute(
    """ SELECT source,count(*) as nb_flows FROM flows group by source UNION SELECT source, count(*) as nb_flows from exchange_rates group by source"""
)
nb_flows_by_sources = dict(r for r in c)

slugs = {}
# open source_types
with open('source_types.csv', 'r') as stfile:
    # create source_types index
    source_types = csvkit.DictReader(stfile)
    source_types = dict((st['acronym'], st) for st in source_types)
    # open sources
    with open('sources.csv', 'r') as sfile:
        sources = list(csvkit.DictReader(sfile))
        # join sources and source_types
        for s in sources:
            # keep fields
            for field in FIELDSTOCOPY:
                s[field] = source_types[s['acronym']][field]
            # filter out fields
            for field in FIELDSTODISCARD:
                del s[field]
            # isolate author_editor
            if s['author'] == s['author_editor']:
                s['author'] = None
Code example #26
File: lookup.py  Project: palewire/everytractcount
import os
import csvkit
import collections


COUNTIES_PATH = os.path.join(
    os.path.dirname(__file__),
    'data',
    '2017_Gaz_counties_national.txt'
)
COUNTIES_LIST = csvkit.DictReader(open(COUNTIES_PATH, 'r'), delimiter="\t", encoding="latin-1")
COUNTIES_DICT = collections.defaultdict(dict)
for row in COUNTIES_LIST:
    COUNTIES_DICT[row['USPS']][row['NAME']] = row['GEOID']


def county(usps, name):
    return COUNTIES_DICT[usps][name]
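A usage sketch for the lookup above; the name follows the Gazetteer NAME column, and the sample values are assumed for illustration:

# Los Angeles County, CA is GEOID '06037' in the Census Gazetteer file
print(county('CA', 'Los Angeles County'))  # -> '06037'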
Code example #27
import re

import csvkit

nonLetters = re.compile(r'\W', re.UNICODE)


def slugify(source):
    slug = lambda s: ''.join(
        [re.sub(nonLetters, '', w).capitalize() for w in s.split(' ')])
    fields = [
        'author', 'name', 'country', 'volume_date', 'volume_number', 'pages'
    ]
    return '_'.join(
        slug(source[f]) for f in fields if source[f] and slug(source[f]))


# read 'new_sources.csv'
with open('new_sources.csv', 'r') as f:
    new_sources = list(csvkit.DictReader(f))

    swapSources = {}
    toDeleteSourcesSlugs = []

    # re-run slug generation
    sources = []

    for source in new_sources:

        source['new_slug'] = slugify(source)
        # map old slugs to new slugs so flows and currencies can be updated later, based on the removal column
        swapSources[source['slug']] = slugify(source)
        # drop unneeded lines
        if source['put x to remove'] == '':
            sources.append(source)
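A sketch of what slugify produces for a sample source record (the field values are made up):

sample = {
    'author': 'Smith & Co.', 'name': 'Trade Tables', 'country': 'France',
    'volume_date': '1860', 'volume_number': None, 'pages': '12-14'
}
print(slugify(sample))  # -> 'SmithCo_TradeTables_France_1860_1214'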
Code example #28
    "pays", "value", "quantit", "origine", "total", "quantity_unit",
    "leurvaleursubtotal_1", "leurvaleursubtotal_2", "leurvaleursubtotal_3",
    "prix_unitaire", "probleme", "remarks"
]
headers = []

for (dirpath, dirnames, filenames) in os.walk(directory):
    if not any(dirpath == os.path.join(directory, b) for b in black_list):
        for csv_file_name in filenames:
            ext = csv_file_name.split(
                ".")[-1] if "." in csv_file_name else None
            if ext == "csv":
                print "%s in %s" % (csv_file_name, dirpath)
                with open(os.path.join(dirpath, csv_file_name),
                          "r") as source_file:
                    r = csvkit.DictReader(source_file)
                    headers += r.fieldnames
                    lines = list(r)
                    sources_aggregation += lines
sources_aggregation = sorted(
    sources_aggregation,
    key=lambda e: (e["sourcetype"], e["year"], e["direction"]
                   if "direction" in e else "", e["exportsimports"]
                   if "exportsimports" in e else "", e["numrodeligne"]
                   if ("numrodeligne" in e and e["numrodeligne"]) else "", e[
                       "marchandises"], e["pays"] if "pays" in e else ""))

# Cleaning sources
for row in sources_aggregation:
    for k in row:
        row[k] = clean(row[k])
Code example #29
import os
import json

import requests
import bs4
import csvkit

base_url = 'http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/'
# extract T-cell states
metapath = 'external_static/metadata/epigenome_roadmap/chromatin_state_samples_meta.csv'
assert os.path.isfile(metapath)

states = []
with open(metapath) as f:
    for x in csvkit.DictReader(f):
        if x['group'] == 'Blood & T-cell':
            states.append(x['eid'])

entries = []
html = bs4.BeautifulSoup(requests.get(base_url).content, 'html5lib')
for x in html.find_all('a'):
    if x.attrs['href'].endswith('narrowPeak.gz'):
        filename = x.attrs['href']
        eid = filename.split('-')[0]
        if eid in states:
            entries.append(
                dict(url=requests.compat.urljoin(base_url, filename),
                     filepath=filename))

print(json.dumps(entries, sort_keys=True, indent=4))
Code example #30
File: frostline.py  Project: sdtiffany/frostline
    # See if the zip database table already exists.
    cursor.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name='zip'")
    exists = cursor.fetchone()

    # If the database table doesn't exist, create it.
    if exists is None:
        cursor.execute(
            "CREATE TABLE zip(zipcode TEXT PRIMARY KEY NOT NULL, " +
            "zone TEXT, temperatures TEXT, city TEXT, state TEXT, latitude INTEGER, longitude INTEGER)"
        )
        db.commit()

        # Import the CSV file into the database
        with open('zipcodes.csv', 'r') as zips:
            dr = csvkit.DictReader(zips)
            to_db = [(i['zipcode'], i['city'], i['state'], i['latitude'],
                      i['longitude']) for i in dr]
        cursor.executemany(
            "INSERT INTO zip (zipcode, city, state, latitude, longitude) VALUES (?, ?, ?, ?, ?);",
            to_db)
        db.commit()

    # Now load our climate data.
    zone_files = [1, 2, 3, 4]
    for zone_file in zone_files:
        with open(str(zone_file) + '.csv', 'r') as zips:
            dr = csvkit.DictReader(zips)
            to_db = [(i['zone'], i['trange'], i['zipcode']) for i in dr]
        cursor.executemany(
            "UPDATE zip SET zone=?, temperatures=? WHERE zipcode=?;", to_db)