def getReader(uploadfile):
    file = uploadfile.file
    extension = os.path.splitext(uploadfile.filename)[1]
    csvreader = None

    # make sure to convert excel files
    if extension == '.xls':
        file = StringIO.StringIO(xls2csv(file))
        csvreader = reader(file)
    else:
        dialect = sniffer.sniff_dialect(file.read(4096))
        file.seek(0)
        csvreader = reader(file, dialect=dialect)

    return csvreader
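getReader leans on several names it never imports. A minimal sketch of the imports it appears to assume; the csvkit module paths are an assumption based on Python 2-era csvkit, not something the snippet states:

import os
import StringIO  # Python 2 module; io.StringIO is the Python 3 equivalent

from csvkit import reader, sniffer      # assumed source of reader() and sniffer.sniff_dialect()
from csvkit.convert.xls import xls2csv  # assumed source of the XLS-to-CSV converter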
def build_post_level_report(self, f):
    PERMALINK = 1
    MESSAGE = 2
    POST_DATE = 6
    TOTAL_REACH = 7
    IMPRESSIONS = 10

    post_data = list()
    csv = csvkit.reader(f)
    csv.next()
    csv.next()

    for idx, r in enumerate(csv):
        post_data.append(
            {
                "permalink": r[PERMALINK],
                "pubdate": datetime.datetime.strptime(r[POST_DATE], "%m/%d/%Y %H:%M:%S %p"),
                "message": r[MESSAGE],
                "totalreach": safe_cast(r[TOTAL_REACH], int, 0),
                "impressions": safe_cast(r[IMPRESSIONS], int, 0),
            }
        )

    start_date = min(post_data, key=lambda i: i["pubdate"])["pubdate"]
    end_date = max(post_data, key=lambda i: i["pubdate"])["pubdate"]

    sorted_data = sorted(post_data, key=lambda i: i["totalreach"], reverse=True)
    top_posts = sorted_data[:10]
    sorted_data.reverse()
    bottom_posts = sorted_data[:10]

    return {"start_date": start_date, "end_date": end_date, "top_posts": top_posts, "bottom_posts": bottom_posts}
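Several report builders in this collection call a safe_cast helper that is never defined in these snippets. A minimal sketch of what the call sites imply; the name and signature come from calls like safe_cast(r[TOTAL_REACH], int, 0), but the body is an assumption:

def safe_cast(value, to_type, default=None):
    # Hypothetical helper inferred from the call sites above:
    # coerce value to to_type, returning default when coercion fails.
    try:
        return to_type(value)
    except (ValueError, TypeError):
        return default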
def importaCandidatos(arquivo):
    print "Importando candidatos"
    raw = open(arquivo, 'r')
    raw = csvkit.reader(raw, encoding='iso-8859-1', delimiter=';')
    candidatos = {}
    lista = {}
    for c in raw:
        c[10] = unidecode.unidecode(c[10])
        if c[9] in ['GOVERNADOR', 'PRESIDENTE', 'DEPUTADO FEDERAL', 'SENADOR']:
            candidatos[c[26]] = {
                'nome': c[10],
                'apelidos': [c[13]],
                '_id': c[26],
                'candidaturas': {},
                'mugshot': c[11]
            }
            candidatos[c[26]]['candidaturas']['2014'] = {
                'cargo': c[9],
                'situacao': c[15],
                'numero': c[12],
                'partido': c[17],
                'uf': c[5],
                'doacoes': {},
                'total': 0,
            }
        # Save the name list
        if c[9] not in ['REMOVER']:
            lista[c[10]] = 0
        if c[9] in ['GOVERNADOR', 'PRESIDENTE']:
            # also add the ballot name in these cases
            lista[c[13]] = c[10]
    mongo_save(candidatos, 'politicos', True)
    with open('names.js', 'w') as final:
        header = "var nick = "
        final.write(header + json.dumps(lista))
def from_csv(cls, path, column_info, header=True, **kwargs):
    """
    Create a new table from a CSV.

    This method will use csvkit if it is available, otherwise it will
    use Python's builtin csv module. ``kwargs`` will be passed through
    to :meth:`csv.reader`.

    If you are using Python 2 and not using csvkit, this method is not
    unicode-safe.

    :param path: Path to the CSV file to read from.
    :param column_info: See :class:`.Table` constructor.
    :param header: If `True`, the first row of the CSV is assumed to
        contain headers and will be skipped.
    """
    with open(path) as f:
        rows = list(csv.reader(f, **kwargs))

    if header:
        column_names = rows.pop(0)

        if len(column_names) != len(column_info):
            # TKTK Better Error
            raise ValueError

    return Table(rows, column_info)
def from_csv(cls, path, column_info=None, row_names=None, header=True, **kwargs):
    """
    Create a new table from a CSV.

    This method will use csvkit if it is available, otherwise it will
    use Python's builtin csv module. ``kwargs`` will be passed through
    to :meth:`csv.reader`.

    If you are using Python 2 and not using csvkit, this method is not
    unicode-safe.

    :param path: Filepath or file-like object from which to read CSV data.
    :param column_info: May be any valid input to :meth:`Table.__init__` or
        an instance of :class:`.TypeTester`. Or, None, in which case a
        generic :class:`.TypeTester` will be created.
    :param row_names: See :meth:`Table.__init__`.
    :param header: If `True`, the first row of the CSV is assumed to
        contain headers and will be skipped.
    """
    if column_info is None:
        column_info = TypeTester()

    use_inference = isinstance(column_info, TypeTester)

    if hasattr(path, 'read'):
        rows = list(csv.reader(path, **kwargs))
    else:
        with open(path) as f:
            rows = list(csv.reader(f, **kwargs))

    if header:
        column_names = rows.pop(0)
    else:
        column_names = [None] * len(rows[0])

    if use_inference:
        column_info = column_info.run(rows, column_names)
    else:
        if len(column_names) != len(column_info):
            # TKTK Better Error
            raise ValueError('CSV contains more columns than were specified.')

    return Table(rows, column_info, row_names=row_names)
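For orientation, a minimal usage sketch of the from_csv variants above. The Table and TypeTester names come straight from the snippets; 'data.csv' and my_column_info are placeholders:

# Let the default TypeTester infer column types from the data
table = Table.from_csv('data.csv')

# Or pass explicit column_info (see the Table constructor) and skip inference
table = Table.from_csv('data.csv', column_info=my_column_info, header=True)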
def analyze_fb_page_data(page_file):
    print("Analyzing Facebook Page file {0}".format(page_file))

    DATE = 0
    TOTAL_LIKES = 1
    ENGAGED_USERS = 6
    TOTAL_REACH = 26
    IMPRESSIONS = 35

    data = list()

    with open(page_file) as f:
        csv = csvkit.reader(f)

        # skip first 2 lines
        csv.next()
        csv.next()

        for idx, r in enumerate(csv):
            data.append({
                'date': datetime.datetime.strptime(r[DATE], "%Y-%m-%d"),
                'likes': safe_cast(r[TOTAL_LIKES], int, 0),
                'engaged_users': safe_cast(r[ENGAGED_USERS], int, 0),
                'reach': safe_cast(r[TOTAL_REACH], int, 0),
                'impressions': safe_cast(r[IMPRESSIONS], int, 0)
            })

    start_date = min(data, key=lambda i: i['date'])['date']
    end_date = max(data, key=lambda i: i['date'])['date']

    print "Start Date: ", start_date
    print "End Date: ", end_date

    chart_labels = [datetime.datetime.strftime(x['date'], "%m/%d/%Y") for x in data]
    reach = [x['reach'] for x in data]
    impressions = [x['impressions'] for x in data]
    engaged = [x['engaged_users'] for x in data]
    likes = [x['likes'] for x in data]

    reach.insert(0, 'Reach')
    impressions.insert(0, 'Impressions')
    engaged.insert(0, 'Engaged Users')
    likes.insert(0, 'Page Likes')

    template = render_template(FB_PAGE_TMPL,
                               {'start_date': start_date,
                                'end_date': end_date,
                                'chart_labels': chart_labels,
                                'reach': reach,
                                'impressions': impressions,
                                'engaged': engaged,
                                'likes': likes})
    template = template.encode('utf-8')

    print("Generating FB Page Report {0}".format(FB_PAGE_REPORT_OUT_NAME))
    with open(FB_PAGE_REPORT_OUT_NAME, "wb") as outf:
        outf.write(template)
def processFile(column, filename):
    dataValues = []  # the data

    with open(filename, 'r') as csvData:
        csvReader = csvkit.reader(csvData, delimiter=',', quotechar='"', skipinitialspace=True)
        for row in csvReader:
            dataValues.append(int(row[column]))

    dataSeries = pd.Series(dataValues)
    dataDescription = dataSeries.describe()
    descriptonStr = [str(e) for e in dataDescription.tolist()]
    descriptonStr.insert(0, filename)
    print ", ".join(descriptonStr)
def from_csv(cls, path, column_info, header=True, **kwargs):
    """
    Create a new table from a CSV.

    This method will use csvkit if it is available, otherwise it will
    use Python's builtin csv module. ``kwargs`` will be passed through
    to :meth:`csv.reader`.

    If you are using Python 2 and not using csvkit, this method is not
    unicode-safe.

    :param path: Filepath or file-like object from which to read CSV data.
    :param column_info: A sequence of pairs of column names and types. The
        latter must be instances of :class:`.DataType`. Or, an instance of
        :class:`.TypeTester` to infer types.
    :param header: If `True`, the first row of the CSV is assumed to
        contain headers and will be skipped.
    """
    use_inference = isinstance(column_info, TypeTester)

    if use_inference and not header:
        raise ValueError("Cannot apply TypeTester to a CSV without headers.")

    if hasattr(path, "read"):
        rows = list(csv.reader(path, **kwargs))
    else:
        with open(path) as f:
            rows = list(csv.reader(f, **kwargs))

    if header:
        column_names = rows.pop(0)

        if use_inference:
            column_info = column_info.run(rows, column_names)
        else:
            if len(column_names) != len(column_info):
                # TKTK Better Error
                raise ValueError("CSV contains more columns than were specified.")

    return Table(rows, column_info)
def test_writer_alias(self):
    output = six.StringIO()
    writer = csvkit.writer(output)
    writer.writerow(['a', 'b', 'c'])
    writer.writerow(['1', '2', '3'])
    writer.writerow(['4', '5', u'ʤ'])

    written = six.StringIO(output.getvalue())

    reader = csvkit.reader(written)
    self.assertEqual(next(reader), ['a', 'b', 'c'])
    self.assertEqual(next(reader), ['1', '2', '3'])
    self.assertEqual(next(reader), ['4', '5', u'ʤ'])
def test_writer_alias(self):
    output = six.StringIO()
    writer = csvkit.writer(output, encoding='utf-8')
    self.assertEqual(writer._eight_bit, True)
    writer.writerow(['a', 'b', 'c'])
    writer.writerow(['1', '2', '3'])
    writer.writerow(['4', '5', u'ʤ'])

    written = six.StringIO(output.getvalue())

    reader = csvkit.reader(written, encoding='utf-8')
    self.assertEqual(next(reader), ['a', 'b', 'c'])
    self.assertEqual(next(reader), ['1', '2', '3'])
    self.assertEqual(next(reader), ['4', '5', u'ʤ'])
def loadMasterCSVFile():
    """
    loads the durations of all eegs from an NK database derived csv file
    of eeg numbers with their associated start and end times.
    :return:
    """
    with codecs.open(nkDurationFilename, 'r', encoding='utf-8', errors='ignore') as cf:
        i = 0
        masterCSV = csv.reader(cf)
        masterCSV_list = [row for row in masterCSV]
        print(len(masterCSV_list))
        return masterCSV_list
def tsvTOcsv(input_file_name, out_file):
    """
    converts tsv formatted files to csv, used to make the csv file that is
    readable by this EEG report feature analyzer
    :param input_file_name: a tsv file
    :param out_file: a csv file
    :return:
    """
    i = 0
    with open(input_file_name, 'rb') as tsvin, open(out_file, 'wb') as csvout:
        tsvin = csv.reader(tsvin, delimiter='\t')
        csvout = csv.writer(csvout)
        for row in tsvin:
            if len(row) > 0:
                csvout.writerow(row)
def extract_data():
    ## Extracting from txt or csv. It returns the "exp_V" and "exp_I" lists.
    exp_V = []
    exp_I = []
    line_num = 0
    run_path = os.getcwd()

    namefile = input("Enter the name of the file: ")
    typefile = input("Which type of data? [txt/csv]")
    header = int(input("How many lines need to be removed from the top? "))

    # namefile = "test"  # This block can be used to speed up during testing
    # typefile = "csv"
    # header = 0

    os.chdir(data_path)  # data_path is assumed to be defined elsewhere in the module

    if typefile == "csv":
        with open(namefile + ".csv") as file:
            rowreader = csvkit.reader(file, delimiter=";")
            for row in rowreader:
                line_num += 1
                try:  # Verify if this can act as a text removal.
                    exp_V.append(float(row[0]))
                    exp_I.append(float(row[1]))
                except:
                    print("Line " + str(line_num) + " contains NaNs")
        # the with block closes the file, so no explicit file.close() is needed
        for i in range(0, header):  # bug fix: was `for i in (0, header)`, which iterated over a 2-tuple
            exp_V.pop(0)
            exp_I.pop(0)
    elif typefile == "txt":
        my_file = open(namefile + ".txt")
        data = my_file.read()
        my_file.close()  # bug fix: was `my_file.close`, which referenced the method without calling it
        lines = data.split("\n")
        for line in lines:
            row = line.split("\t")
            line_num += 1
            try:
                exp_V.append(float(row[0]))
                exp_I.append(float(row[1]))
            except:
                print("Line " + str(line_num) + " contains NaNs")

    # voltage = np.array([volt])
    # current = np.array([curr])
    # Might be necessary to perform a fitting. To be investigated.

    os.chdir(run_path)
    return exp_V, exp_I
def build_page_level_report(self, f):
    DATE = 0
    TOTAL_LIKES = 1
    ENGAGED_USERS = 6
    TOTAL_REACH = 26
    IMPRESSIONS = 35

    data = list()
    csv = csvkit.reader(f)
    csv.next()
    csv.next()

    for idx, r in enumerate(csv):
        data.append(
            {
                "date": datetime.datetime.strptime(r[DATE], "%Y-%m-%d"),
                "likes": safe_cast(r[TOTAL_LIKES], int, 0),
                "engaged_users": safe_cast(r[ENGAGED_USERS], int, 0),
                "reach": safe_cast(r[TOTAL_REACH], int, 0),
                "impressions": safe_cast(r[IMPRESSIONS], int, 0),
            }
        )

    start_date = min(data, key=lambda i: i["date"])["date"]
    end_date = max(data, key=lambda i: i["date"])["date"]

    chart_labels = [datetime.datetime.strftime(x["date"], "%m/%d/%Y") for x in data]
    reach = [x["reach"] for x in data]
    impressions = [x["impressions"] for x in data]
    engaged = [x["engaged_users"] for x in data]
    likes = [x["likes"] for x in data]

    reach.insert(0, "Reach")
    impressions.insert(0, "Impressions")
    engaged.insert(0, "Engaged Users")
    likes.insert(0, "Page Likes")

    return {
        "start_date": start_date,
        "end_date": end_date,
        "chart_labels": chart_labels,
        "reach": reach,
        "impressions": impressions,
        "engaged_users": engaged,
        "likes": likes,
    }
def generateTSECand():
    '''Generates names.json from the 2014 candidacy files in the folder.
    http://agencia.tse.jus.br/estatistica/sead/odsele/consulta_cand/consulta_cand_2014.zip
    '''
    lista = {}
    ufs = ["AC", "AL", "AM", "AP", "BA", "CE", "DF", "ES", "GO", "MA", "MG", "MS",
           "MT", "PA", "PB", "PE", "PI", "PR", "RJ", "RN", "RO", "RR", "RS", "SC",
           "SE", "SP", "TO", "BR"]
    for uf in ufs:
        print 'Getting ' + uf
        cand = open("../raw/candidaturas2014/consulta_cand_2014_" + uf + ".txt", 'r')
        cand = csvkit.reader(cand, encoding='iso-8859-1', delimiter=';')
        for c in cand:
            # if c[15] == 'DEFERIDO':  # many candidacies have not yet been approved
            # if c[9] not in ['REMOVER']:
            #     lista[unidecode.unidecode(c[10])] = 0
            if c[9] in ['GOVERNADOR', 'PRESIDENTE']:
                # also add the ballot name in these cases
                lista[unidecode.unidecode(c[10])] = 0
                lista[c[13]] = unidecode.unidecode(c[10])
    return lista
def handle(self, *args, **options):
    # read in CSV
    print("This is an auto-generated Django model module created by apps.core.commands.")
    print("from django.contrib.gis.db import models\n")
    with open(args[0], 'rb') as csvfile:
        reader = csvkit.reader(csvfile)
        headers = reader.next()
        print("class GeneratedModel(models.Model):")
        for row in headers:
            # take the row, slugify it
            # and replace the hyphens with underscores
            field = slugify(row).replace('-', '_')
            print("    %s = models.CharField(max_length=255)" % field)
        print("\n")
def analyze_fb_post_data(post_file):
    PERMALINK = 1
    MESSAGE = 2
    POST_DATE = 6
    TOTAL_REACH = 7
    IMPRESSIONS = 10

    print("Analyzing Facebook Post file {0}".format(post_file))

    post_data = list()

    with open(post_file) as f:
        csv = csvkit.reader(f)

        # skip first 2 lines
        csv.next()
        csv.next()

        for idx, r in enumerate(csv):
            post_data.append({
                'permalink': r[PERMALINK],
                'pubdate': datetime.datetime.strptime(r[POST_DATE], "%m/%d/%Y %H:%M:%S %p"),
                'message': r[MESSAGE],
                'totalreach': int(r[TOTAL_REACH]),
                'impressions': int(r[IMPRESSIONS])})

    start_date = min(post_data, key=lambda i: i['pubdate'])['pubdate']
    end_date = max(post_data, key=lambda i: i['pubdate'])['pubdate']

    sorted_data = sorted(post_data, key=lambda i: i['totalreach'], reverse=True)

    template = render_template(FB_POST_TMPL,
                               {'top_posts': sorted_data[:10],
                                'start_date': start_date,
                                'end_date': end_date})
    template = template.encode('utf-8')

    print("Generating FB Post Report {0}".format(FB_POST_REPORT_OUT_NAME))
    with open(FB_POST_REPORT_OUT_NAME, "wb") as outf:
        outf.write(template)

    return 0
def generateCand():
    '''Generates names.json from the 2014 candidacy files in the folder.
    http://agencia.tse.jus.br/estatistica/sead/odsele/consulta_cand/consulta_cand_2014.zip
    '''
    lista = {}
    ufs = ["AC", "AL", "AM", "AP", "BA", "CE", "DF", "ES", "GO", "MA", "MG", "MS",
           "MT", "PA", "PB", "PE", "PI", "PR", "RJ", "RN", "RO", "RR", "RS", "SC",
           "SE", "SP", "TO", "BR"]
    for uf in ufs:
        print 'Getting ' + uf
        cand = open("../raw/consulta_cand_2014_" + uf + ".txt", 'r')
        cand = csvkit.reader(cand, encoding='iso-8859-1', delimiter=';')
        for c in cand:
            # if c[15] == 'DEFERIDO':  # many candidacies have not yet been approved
            if c[9] not in ['DEPUTADO ESTADUAL']:
                lista[c[10]] = 0
            if c[9] in ['GOVERNADOR', 'PRESIDENTE']:
                # also add the ballot name in these cases
                lista[c[13]] = c[10]
    with open('names.js', 'w') as final:
        header = "var nick = "
        final.write(header + json.dumps(lista))
def parse_downloaded_file(self, file_path):
    print "Parse %s" % file_path
    with open(file_path, 'rb') as f:
        content = f.read().replace("\r\n", "\n").replace("\xef\xbb\xbf", "")
        all_rows = list(csv.reader(StringIO(content), delimiter=";"))

        if len(all_rows) == 0:
            print "%s is empty." % file_path
            raise EmptyFileError("In file {}".format(file_path))

        title_row = all_rows[2]
        title = title_row[1]

        param_rows = [x for x in all_rows[-6:] if len(x) > 1]
        param_headers = [x[0].replace(":", "").strip() for x in param_rows]
        param_values = [x[1].strip() for x in param_rows]

        data_rows = all_rows[3:-7]
        data_headers = [x.replace("\n", "") for x in data_rows[0]]
        data_headers[0] = "Region"

        headers = param_headers + data_headers

        for row in data_rows[1:]:
            values = param_values + row
            datapoint = dict(zip(headers, values))

            # AMS changed their format slightly in 2017-05.
            # This is a hack to get the old "Utrikesfödda"
            # in the same way as before.
            # Current format seems faulty.
            if u"Utrikesfödda" not in datapoint.keys():
                if u"utrikesfödda" in title:
                    datapoint[u"Utrikesfödda"] = "Ja"
                else:
                    datapoint[u"Utrikesfödda"] = ""

            self.append(datapoint)

    return self
def from_csv(cls, path, column_info, header=True, **kwargs):
    """
    Create a new table from a CSV.

    Will use csvkit if it is available, otherwise will use Python's
    builtin csv module. ``kwargs`` will be passed through to
    :meth:`csv.reader`.

    Note: if using Python 2 and not using csvkit, this method is not
    unicode-safe.

    :param path: Path to the CSV file to read from.
    :param column_info: See :class:`.Table` constructor.
    """
    with open(path) as f:
        rows = list(csv.reader(f, **kwargs))

    if header:
        column_names = rows.pop(0)

        if len(column_names) != len(column_info):
            # TKTK Better Error
            raise ValueError

    return Table(rows, column_info)
def handle(self, *args, **options):
    """
    Make it happen.
    """
    super(Command, self).handle(*args, **options)

    # set / compute any attributes that multiple class methods need
    self.keep_file = options["keep_file"]

    # get model based on strings of app_name and model_name
    self.model = apps.get_model(options["app_name"], options['model_name'])

    # load from provided csv or csv mapped to model
    self.csv = options["csv"] or self.model.objects.get_csv_path()

    # load into database suggested for model by router
    self.database = router.db_for_write(model=self.model)

    # get most recently cleaned RawDataFile
    try:
        raw_file = RawDataFile.objects.filter(
            file_name=self.model._meta.db_table,
            clean_start_datetime__isnull=False
        ).latest('clean_start_datetime')
    except RawDataFile.DoesNotExist:
        raise CommandError(
            'No record of cleaning {0}.TSV (run `python manage.py '
            'cleancalaccessrawfile {0}`).'.format(self.model._meta.db_table)
        )

    # raise exception if clean step did not finish
    if not raw_file.clean_finish_datetime:
        raise CommandError(
            'Previous cleaning of {0}.TSV did not finish (run `python manage.py '
            'cleancalaccessrawfile {0}`).'.format(self.model._meta.db_table)
        )

    # Get the row count from the source CSV
    with open(self.csv, 'r') as infile:
        self.csv_row_count = max(sum(1 for line in infile) - 1, 0)

    # Quit if the CSV is empty.
    if not self.csv_row_count:
        if self.verbosity > 2:
            self.failure("{} is empty.".format(self.csv))
        return

    # Get the headers from the source CSV
    with open(self.csv, 'r') as infile:
        csv_reader = reader(infile)
        self.csv_headers = next(csv_reader)

    # store the start time for the load
    raw_file.load_start_datetime = now()
    # reset the finish time for the load
    raw_file.load_finish_datetime = None
    # save here in case command doesn't finish
    raw_file.save()

    # Load table
    if self.verbosity > 2:
        self.log(" Loading {}".format(options['model_name']))
    self.load()

    # add load counts to raw_file_record
    raw_file.load_columns_count = len(self.model._meta.fields)
    raw_file.load_records_count = self.model.objects.count()

    # Log an error if the counts don't match
    if self.verbosity > 2 and raw_file.load_records_count != self.csv_row_count:
        msg = " Table record count doesn't match CSV. {} in the table vs. {} in the CSV."
        self.failure(msg.format(raw_file.load_records_count, self.csv_row_count))

    # if not keeping files, remove the csv file
    if not self.keep_file:
        os.remove(self.csv)

    # store the finish time for the load
    raw_file.load_finish_datetime = now()
    # and save the RawDataFile
    raw_file.save()
    })
    # let's be polite
    time.sleep(1)
    print(r.url)
    s = Soup(r.text, 'html.parser')
    l = [parseTable(r, playerId, tournamentId) for r in s.find_all(class_='scorecard-table')]
    return [item for sublist in l for item in sublist]

def parseTournament(tournamentId, playerList):
    final_data = [parsePlayer(tournamentId, playerId) for playerId in playerList]
    return [item for sublist in final_data for item in sublist]

# finalData = parseTournament(309, [66,686])

# load in our data
def readTournamentPlayers(row):
    r = [int(r) if r.isdigit() else r for r in row]
    playerList = [id for id in r[3:] if id != '']
    tournamentId = r[2]
    return {'tournamentId': tournamentId, 'playerList': playerList}

with open('tournaments.csv') as f:
    reader = csv.reader(f)
    reader.next()  # toss the header row
    tournaments = [readTournamentPlayers(row) for row in reader]

# print parsePlayer(142, 15)
# print parsePlayer(309, 686)

[parseTournament(**tournament) for tournament in tournaments]
def test_reader_alias(self):
    with open('examples/test_utf8.csv') as f:
        reader = csvkit.reader(f, encoding='utf-8')
        self.assertEqual(next(reader), ['a', 'b', 'c'])
        self.assertEqual(next(reader), ['1', '2', '3'])
        self.assertEqual(next(reader), ['4', '5', u'ʤ'])
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None,
             blanks_as_nulls=True, zero_based=False, infer_types=True,
             no_header_row=False, **kwargs):
    """
    Creates a new Table from a file-like object containing CSV data.

    Note: the column_ids argument will cause only those columns with a
    matching identifier to be parsed, type inferred, etc. However, their
    order/index property will reflect the original data (e.g. column 8
    will still be "order" 7, even if it's the third column in the
    resulting Table).
    """
    # This bit of nonsense is to deal with "files" from stdin,
    # which are not seekable and thus must be buffered
    contents = f.read()

    # snifflimit == 0 means do not sniff
    if snifflimit is None:
        kwargs['dialect'] = sniffer.sniff_dialect(contents)
    elif snifflimit > 0:
        kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit])

    f = six.StringIO(contents)
    rows = reader(f, **kwargs)

    if no_header_row:
        # Peek at a row to infer column names from
        row = next(rows)

        headers = make_default_headers(len(row))
        column_ids = parse_column_identifiers(column_ids, headers, zero_based)
        headers = [headers[c] for c in column_ids]
        data_columns = [[] for c in headers]

        # Put row back on top
        rows = itertools.chain([row], rows)
    else:
        headers = next(rows)

        if column_ids:
            column_ids = parse_column_identifiers(column_ids, headers, zero_based)
            headers = [headers[c] for c in column_ids]
        else:
            column_ids = range(len(headers))

        data_columns = [[] for c in headers]

    width = len(data_columns)

    for i, row in enumerate(rows):
        j = 0

        for j, d in enumerate(row):
            try:
                data_columns[j].append(row[column_ids[j]].strip())
            except IndexError:
                # Non-rectangular data is truncated
                break

        j += 1

        # Populate remaining columns with None
        while j < width:
            data_columns[j].append(None)
            j += 1

    columns = []

    for i, c in enumerate(data_columns):
        columns.append(Column(column_ids[i], headers[i], c,
                              blanks_as_nulls=blanks_as_nulls,
                              infer_types=infer_types))

    return Table(columns, name=name)
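A short usage sketch for this variant, which reads from a file-like object and sniffs the dialect; the file name is a placeholder, and snifflimit bounds how many bytes the sniffer sees:

with open('data.csv') as f:
    # sniff the CSV dialect from the first 4096 bytes only
    table = Table.from_csv(f, name='data', snifflimit=4096)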
def test_reader_alias(self):
    with open('examples/test_utf8.csv', encoding='utf-8') as f:
        reader = csvkit.reader(f)
        self.assertEqual(next(reader), ['a', 'b', 'c'])
        self.assertEqual(next(reader), ['1', '2', '3'])
        self.assertEqual(next(reader), ['4', '5', u'ʤ'])
import csvkit, sys
from collections import defaultdict

writer = csvkit.writer(sys.stdout)

with open(sys.argv[1]) as csv_file:
    for i, row in enumerate(csvkit.reader(csv_file)):
        if i == 0:
            # header row: every column after the first gets a frequency counter
            col_count = len(row) - 1
            freqs = [defaultdict(int) for col in range(col_count)]
            continue

        for col in range(col_count):
            freqs[col][int(row[col + 1])] += 1

# concatenate every observed value across columns (list-valued .keys() assumes Python 2)
values = sum((freqs[col].keys() for col in range(col_count)), [])

for val in sorted(set(values)):
    val_freqs = [freqs[col][val] for col in range(col_count)]
    row = [val] + val_freqs
    writer.writerow(row)
import requests
import csvkit
import os
import json
import io

# fetch files
with open('./out6_file3_address_3_clean.csv', 'rb') as f:
    reader = csvkit.reader(f)
    your_list = list(reader)

print your_list[0][0]

# geocode
results = []
for i, val in enumerate(your_list):
    address = val[0]  # first column holds the address (was the redundant `[val][0][0]`)
    params = {'text': address}
    url = 'http://localhost:3100/v1/search?'
    r = requests.get(url + 'text=' + address)
    rjson = r.json()['features'][0]
    rjson['properties']['query'] = address
    results.append(rjson)

with open('./out6_file3_address_3_clean.json', 'wb') as fd:
    fd.write(json.dumps(results))

# from postal.parser import parse_address
# parse_address('The Book Club 100-106 Leonard St Shoreditch London EC2A 4RH, United Kingdom')
def handle(self, *args, **options):
    verbosity = options['verbosity']
    if verbosity == '0':
        self.logger.setLevel(logging.ERROR)
    elif verbosity == '1':
        self.logger.setLevel(logging.WARNING)
    elif verbosity == '2':
        self.logger.setLevel(logging.INFO)
    elif verbosity == '3':
        self.logger.setLevel(logging.DEBUG)

    csvfile = options['csv_file']
    encoding = options['encoding']

    csv_out = out = options['out']
    if type(out) == str:
        csv_out = open(out, 'wb')

    writer = csv.writer(csv_out, delimiter=';', quotechar='"', encoding=encoding)
    writer.writerow(['slug', 'url', 'attivo', 'tema', 'natura', 'cup',
                     'programma', 'classificazione_qsn', 'fondo_comunitario',
                     'fin_totale_pubblico', 'fin_totale_pubblico_netto',
                     'pagamento', 'stato_progetto', 'stato_finanziamenti'])

    locale.setlocale(locale.LC_ALL, 'it_IT.UTF-8')

    with open(csvfile, 'rb') as cfile:
        reader = csv.reader(cfile, delimiter=',', quotechar='"')
        for r in reader:
            slug = None
            url = '-'
            output_r = r

            if not r:
                continue

            url = r[0].strip()
            slug_search = re.search(
                '^(http://){0,1}(www\.){0,1}opencoesione.gov.it/progetti/(.*?)/?$',
                url, re.IGNORECASE
            )
            if slug_search:
                slug = slug_search.group(3)

            if slug and '/' not in slug:
                output_r = [slug, r[0]]
                try:
                    p = Progetto.fullobjects.get(slug=slug)
                    is_active = p.active_flag
                    tema = p.tema.tema_superiore.short_label
                    natura = p.classificazione_azione.classificazione_superiore.short_label
                    cup = p.cup
                    programma = ','.join([f.descrizione for f in p.fonti_fin])
                    class_qsn = p.classificazione_qsn.classificazione_superiore.classificazione_superiore.descrizione
                    fondo_com = p.get_fondo_comunitario_display()
                    fin_tot = locale.currency(p.fin_totale_pubblico).replace('Eu', u'€')
                    fin_tot_netto = locale.currency(p.fin_totale_pubblico_netto).replace('Eu', u'€')
                    pagamento = locale.currency(p.pagamento).replace('Eu', u'€')
                    stato_fin = p.get_stato_finanziario_display()
                    stato_prog = p.get_stato_progetto_display()
                    output_r.extend([is_active, tema, natura, cup, programma,
                                     class_qsn, fondo_com, fin_tot, fin_tot_netto,
                                     pagamento, stato_fin, stato_prog])
                except ObjectDoesNotExist:
                    pass

            self.logger.info(r[0])
            writer.writerow(output_r)
#!/usr/bin/env python
# Remove newline chars from CSV "cells"
# Input is taken from stdin and output spit to stdout

import csvkit
import sys

reader = csvkit.reader(sys.stdin)
writer = csvkit.writer(sys.stdout)

for row in reader:
    for i in range(0, len(row)):
        if isinstance(row[i], str):
            if "\n" in row[i]:
                row[i] = row[i].replace("\n", '')
    writer.writerow(row)
#! /usr/bin/env python
# from http://unix.stackexchange.com/questions/60590/is-there-a-command-line-utility-to-transpose-a-csv-file

import csvkit as csv
import sys

rows = list(csv.reader(sys.stdin))
writer = csv.writer(sys.stdout)

for col in xrange(0, len(rows[0])):
    writer.writerow([row[col] for row in rows])
#coding: utf8
import mysql.connector
import config
import csvkit

dbcon = mysql.connector.connect(database=config.db, user=config.user,
                                password=config.passwd, host=config.host)
dbcur = dbcon.cursor()

sql1 = "drop table if exists rating;"
dbcur.execute(sql1)
print "table dropped"

sql2 = "create table rating (userID text, placeID int, rating int, food_rating int, service_rating int);"
dbcur.execute(sql2)
print "table created"

csv_data = csvkit.reader(file('/Users/K/dropbox/RCdata/rating_final.csv'))
for row in csv_data:
    sql = "INSERT INTO `rating`(`userID`, `placeID`, `rating`, `food_rating`, `service_rating`) VALUES (%s,%s,%s,%s,%s)"
    dbcur.execute(sql, row)

# actually apply the changes to MySQL
dbcon.commit()

dbcur.close()
dbcon.close()
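A side note on the insert loop above: mysql.connector cursors also provide executemany, which batches the row inserts instead of issuing one statement per row. A sketch assuming the same open connection and CSV reader as in the script:

# Batch variant of the row-by-row INSERT loop (same table and reader as above)
sql = ("INSERT INTO `rating` (`userID`, `placeID`, `rating`, `food_rating`, `service_rating`) "
       "VALUES (%s, %s, %s, %s, %s)")
dbcur.executemany(sql, [tuple(row) for row in csv_data])
dbcon.commit()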
import codecs, csvkit

DATA = '../data/'

with open(DATA + 'explicacoes.csv', 'rb') as csvfile:
    arquivo = csvkit.reader(csvfile, delimiter=',', encoding='utf-8')
    explicacoes = []
    for linha in arquivo:
        explicacao = {
            '_id': linha[0],
            'sigla': linha[0],
            'nome': linha[1],
            'descricao': linha[2].strip()
        }
        explicacoes.append(explicacao)

def mongo_save(explicacoes, clear=False):
    from pymongo import MongoClient
    client = MongoClient()
    db = client.monitorlegislativo
    collection = db.explicacoes
    if (clear):
        collection.drop()
    for e in explicacoes:
        collection.update({'_id': e['_id']}, e, upsert=True)

mongo_save(explicacoes)
#!/usr/bin/env python
# Remove newline chars from CSV "cells"
# Input is taken from stdin and output spit to stdout

import csvkit
import sys

reader = csvkit.reader(sys.stdin)
writer = csvkit.writer(sys.stdout)

for row in reader:
    for i in range(0, len(row)):
        if isinstance(row[i], (str, unicode)):  # Python 2 variant: check both byte and unicode strings
            if "\n" in row[i]:
                row[i] = row[i].replace("\n", '')
    writer.writerow(row)
def openFile():
    with open('./out6_file3_address_3_clean.csv', 'rb') as f:
        reader = csvkit.reader(f)
        your_list = list(reader)
    print your_list[0][0]