def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse
    :ptype csv_path: str
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :ptype target_dataset: str

    :return: a batch of records for at most one organization
    :rtype: dict mapping at most one org-id to at most BATCH_SIZE (dict) records
    """
    dataset_types = get_dataset_types(target_dataset)
    # Use JSON schema to discover the dataset type to which the file corresponds
    schema_tables = dict((
        t,
        dict((f['label'], f['datastore_id'])
             for f in get_table(t)['fields']))
        for t in dataset_types)

    records = {}
    schema_cols = None
    cols = None
    csv_path = os.path.abspath(os.path.expandvars(os.path.expanduser(csv_path)))
    if os.path.islink(csv_path):
        csv_path = os.readlink(csv_path)

    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames
        for k, v in schema_tables.iteritems():
            if (len(set(v.keys()).intersection(set(cols))) == len(v.keys()) and
                    len(cols) == len(v.keys()) + 2):
                # columns represent all schema data fields + 'Org id', 'Org'
                schema_cols = [v[col] if col in v else col for col in cols]
                break

    assert schema_cols, '{0:s} does not match any dataset type {1}'.format(
        csv_path, dataset_types)

    with open(csv_path) as f:
        # use new dict, each col named for its corresponding JSON datastore_id
        csv_in = DictReader(f, fieldnames=schema_cols)
        csv_in.next()  # skip header row: no new info
        for row_dict in csv_in:
            org_id = row_dict.pop('Org id')
            org = row_dict.pop('Org')
            if org_id not in records:
                if len(records.keys()):
                    org_id_done = records.keys()[0]
                    yield {org_id_done: records.pop(org_id_done)}
                records[org_id] = []
            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records[org_id].append(row_dict)
            if len(records[org_id]) >= BATCH_SIZE:
                yield {org_id: records.pop(org_id)}
    yield records

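# Hedged usage sketch for csv_data_batch() above. The 'ati' dataset type comes
# from the docstring example; upsert_records() is a hypothetical persistence
# helper, not part of the original code.
def load_batches(csv_path):
    for batch in csv_data_batch(csv_path, 'ati'):
        # each yielded dict maps a single org id to at most BATCH_SIZE row dicts
        for org_id, rows in batch.items():
            upsert_records(org_id, rows)  # hypothetical helper
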
def get_csv_log_reader(csv_logs):
    if PY2:
        csv_stream = BytesIO(csv_logs)
        bom = csv_stream.read(len(BOM_UTF8))
        assert bom == BOM_UTF8, "Unexpected Procmon csv encoding"
        csv_reader = DictReader(csv_stream, encoding='utf-8')
    else:
        csv_stream = StringIO(csv_logs.decode('utf-8-sig'))
        csv_reader = DictReader(csv_stream)
    return csv_reader

def get_csv_log_reader(csv_logs):
    if PY2:
        # only found a csv library that works for UTF-8
        csv_logs_utf8 = csv_logs.encode('utf-8')
        csv_stream = BytesIO(csv_logs_utf8)
        csv_reader = DictReader(csv_stream, encoding='utf-8')
    else:
        csv_stream = StringIO(csv_logs)
        csv_reader = DictReader(csv_stream)
    return csv_reader

def get_log_readers(csv_logs, pml_logs):
    pml_stream = BytesIO(pml_logs)
    pml_reader = ProcmonLogsReader(pml_stream)
    if PY2:
        # only found a csv library that works for UTF-8
        csv_logs_utf8 = csv_logs.encode('utf-8')
        csv_stream = BytesIO(csv_logs_utf8)
        csv_reader = DictReader(csv_stream, encoding='utf-8')
    else:
        csv_stream = StringIO(csv_logs)
        csv_reader = DictReader(csv_stream)
    return csv_reader, pml_reader

def read_csv_logs(csv_path):
    if PY2:
        with io.open(csv_path, "rb") as f:
            bom = f.read(len(BOM_UTF8))
            assert bom == BOM_UTF8, "Unexpected Procmon csv encoding"
            csv_reader = DictReader(f, encoding='utf-8')
            for _ in csv_reader:
                pass
    else:
        with open(csv_path, "r", encoding="utf-8-sig") as f:
            csv_reader = DictReader(f)
            for _ in csv_reader:
                pass

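# Minimal sketch of exercising the first get_csv_log_reader() variant above
# (the one expecting raw bytes prefixed with a UTF-8 BOM). The sample content
# and column names are illustrative assumptions, not taken from the code.
def _demo_procmon_reader():
    sample = BOM_UTF8 + b"Time of Day,Process Name\n12:00:00,notepad.exe\n"
    for row in get_csv_log_reader(sample):
        print(row.get("Process Name"))
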
def load_public_schools(self):
    from unicodecsv import DictReader
    import dateutil.parser
    from ambry.util import lowercase_dict

    table_name = 'public_schools'
    p = self.partitions.new_partition(table=table_name)

    url = self.metadata.build.public_schools.url
    self.log("Downloading {}".format(url))
    file_name = self.filesystem.download(url)
    self.log("Downloading {} to {}".format(url, file_name))

    with open(file_name) as f:
        dr = DictReader(f, delimiter='\t', encoding='latin1')

        try:
            p.query("DELETE FROM {}".format(table_name))
        except:
            pass

        lr = self.init_log_rate(5000, table_name)
        with p.database.inserter(table_name, update_size=True) as ins:
            for i, row in enumerate(dr):
                row = lowercase_dict(row)
                row['id'] = None
                lr()
                ins.insert(row)

    return True

def _iter_csv(self, fp_raw):
    r = DictReader(fp_raw, delimiter=str(";"), encoding="cp1251")

    # Ukrainian CSV column headers mapped to internal field names
    mapping = {
        "Найменування": 'name',                     # full name
        "Скорочена назва": 'short_name',            # short name
        "Код ЄДРПОУ": 'edrpou',                     # EDRPOU registration code
        "Місцезнаходження": 'location',             # registered address
        "ПІБ керівника": 'head',                    # head's full name
        "Основний вид діяльності": 'company_profile',  # main line of business
        "Стан": 'status',                           # status
    }

    for i, chunk in enumerate(r):
        company = {}

        for k, v in chunk.items():
            if k.strip():
                if mapping[k] == "edrpou" and v:
                    company[mapping[k]] = int(v)
                else:
                    company[mapping[k]] = v

        company['founders'] = []
        company["last_update"] = self.timestamp
        company["file_revision"] = self.revision

        if i and i % 50000 == 0:
            logger.warning('Read {} companies from CSV feed'.format(i))

        yield company

def get_all_officer_fingerprints():
    officer_csv_path = os.path.join(DATA_PATH, 'npo/npo_officers.csv')

    # check cache
    cache_path = _get_cache_path(officer_csv_path)
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            return FingerprintStorage.from_dict(json.loads(f.read()))

    fingerprints = FingerprintStorage()
    total = sum(1 for line in open(officer_csv_path))
    sys.stderr.write("\nMaking officer fingerprints...\n")
    with open(officer_csv_path) as f:
        reader = DictReader(f)
        for i, data in enumerate(reader):
            officer_id = data['officer_id'].strip()
            officer_name = data['officer_name'].strip()
            if not (officer_id and officer_name):
                continue
            fingerprints.put(officer_id, make_fingerprint(officer_name))
            sys.stderr.write("\r%d of %d" % (i + 1, total))
            sys.stderr.flush()

    # write to cache
    with open(cache_path, 'w') as f:
        f.write(json.dumps(fingerprints.to_dict()))

    sys.stderr.write("\nDone\n")
    return fingerprints

def main():
    prs = argparse.ArgumentParser()
    prs.add_argument('--count', type=int, default=100)
    prs.add_argument('file', type=file)
    args = prs.parse_args()

    count = args.count
    assert count > 0

    path = os.path.abspath(args.file.name)
    root, ext = os.path.splitext(path)
    new_path = '%s_trimmed_%s%s' % (root, count, ext)

    reader = DictReader(open(path))
    new_entries = []
    for i in range(count):
        new_entries.append(next(reader))

    with open(new_path, 'w') as new_file:
        writer = DictWriter(new_file, reader.unicode_fieldnames)
        writer.writeheader()
        writer.writerows(new_entries)

    print open(new_path).read()

def validate_geodataset_upload(uploaded_file):
    """Validate an uploaded file containing geodataset data

    Because we're using exclusively `TemporaryFileUploadHandler`s we'll always
    have a local file we can open and inspect.
    """
    # Check the extension. We do this instead of calling the
    # FileExtensionValidator because we don't want a bad extension to go any
    # further in this validator.
    extension = os.path.splitext(uploaded_file.name)[-1].lower()
    if extension != '.csv':
        raise ValidationError(
            'Improper file extension "{}". You must upload a CSV'.format(
                extension))

    # Validate the file by opening and inspecting it.
    with open(uploaded_file.temporary_file_path(), 'rb') as file_obj:
        # Start the reader. Handle bad CSVs
        try:
            reader = DictReader(file_obj, encoding='utf-8-sig')
        except csv.Error:
            raise ValidationError(
                'Error processing file. File may not be a valid CSV.')

        if reader.fieldnames[0] != 'ocd_id':
            raise ValidationError('First column must be named \'ocd_id\'')

        seen = []
        for i, fieldname in enumerate(reader.fieldnames):
            clean_field = slugify_header(fieldname)
            if clean_field == '':
                raise ValidationError(
                    u'Column {} header is empty or decodes to empty'.format(i))
            if clean_field in seen:
                raise ValidationError(
                    u'One or more duplicate headers. {}'.format(clean_field))
            seen.append(clean_field)

        ocd_ids = []
        for row in reader:
            ocd_ids.append(row['ocd_id'])

        if not ocd_ids:
            raise ValidationError('File must have at least one entry.')

        # Get all the OCD IDs that match in the database, turn it into a list
        # we can compare to.
        db_ocd_ids = LegislativeDistrict.objects.filter(
            ocd_id__in=list(set(ocd_ids))).values_list('ocd_id', flat=True)

        # Iterate through user-provided OCD IDs and see if they're in the list
        # that came from the database. If they're not, return an error on the
        # first instance.
        for i, ocd_id in enumerate(ocd_ids):
            if ocd_id not in db_ocd_ids:
                raise ValidationError(u'One or more OCD IDs not found. First '
                                      'found: Row: {row} ID: {id}'.format(
                                          row=(i + 2), id=ocd_id))

def gdocs_persons():
    resp = requests.get(PERSONS_CSV_URL, stream=True)
    resp.raise_for_status()
    reader = DictReader(resp.raw)
    for data in reader:
        if not data['Full Name']:
            continue
        yield data

def read_csv(self, csv_url):
    try:
        res = requests.get(csv_url, stream=True)
        res.raise_for_status()
    except requests.exceptions.RequestException as exc:
        log.error('Failed to open CSV [%s]: %s', csv_url, exc)
        return
    if res.encoding is None:
        res.encoding = 'utf-8'
    for row in DictReader(res.iter_lines(decode_unicode=True)):
        yield row

def crawl(self):
    logging.warn('starting asx crawl')
    res = requests.get(CSV_URL)
    header, body = res.content.split('\r\n\r\n', 1)
    sio = StringIO(body)
    logging.warn('about to start processing asx')
    for row in list(DictReader(sio)):
        row['source_info'] = header.strip()
        try:
            self.scrape_company(row)
        except Exception as e:
            log.exception(e)

def load_countries(apps, schema_editor):
    country = apps.get_model("core", "Country")

    with open("core/dicts/countries.csv", "r") as fp:
        r = DictReader(fp)
        for l in r:
            country.objects.update_or_create(
                pk=l["Code"],
                iso2=l["Alpha 2"],
                iso3=l["Alpha 3"],
                name_ua=l["UA"],
                name_en=l["UK"],
            )

def scrape_csv(data):
    _, local_file = mkstemp()
    urllib.urlretrieve(data.get('source_url'), local_file)
    print 'CSV: %(source_url)s' % data
    rows = []
    with open(local_file, 'rb') as fh:
        for row in DictReader(fh):
            row.update(data)
            # row['person_id'] = row.pop('id', None)
            # pprint(row)
            rows.append(row)
    return rows

def simplerun(fl, stopwords=stopwords):
    results = {}
    fl = open(fl)
    rd = DictReader(fl, encoding='utf-8')
    result = [row for row in rd if row['AU'] != '']
    for item in result:
        item['TI'] = item['TI'].lower().replace('(book)', '')
    ks = classify(result, end=2020)
    f = freqdst(ks, stopwords=stopwords, leaveout=['book'])
    for k in ks:
        tf = termfreq(f, k)
        results[k] = tf
    return results

def iter_dataset(self, fp, filetype):
    if filetype == "json":
        for l in json.load(fp):
            yield l
    elif filetype == "jsonlines":
        for l in fp:
            yield json.loads(l)
    elif filetype == "csv":
        r = DictReader(fp)
        for l in r:
            yield l

def _build_templates(self):
    lc = LocalCKAN()
    output_files = {}
    next_row = {}
    output_counter = {}
    output_path = self.args[2:][-1]
    table = get_table(DATASET_TYPE)

    def close_write_file(org_id):
        book = output_files[org_id]
        if not book:
            return
        book.save(os.path.join(output_path,
            org_id + '-' + str(output_counter[org_id]) + '.xls'))
        output_files[org_id] = None

    def out_file(org_id):
        if org_id in output_files:
            next_row[org_id] += 1
            # need to start a new file?
            if next_row[org_id] > SPLIT_XLS_ROWS:
                close_write_file(org_id)
            else:
                return output_files[org_id], next_row[org_id]

        try:
            org = lc.action.organization_show(id=org_id, include_datasets=False)
        except NotFound:
            print 'org id', org_id, 'not found'
            output_files[org_id] = None
            next_row[org_id] = 0
            return None, None

        book = xls_template(DATASET_TYPE, org)
        output_files[org_id] = book
        output_counter[org_id] = output_counter.get(org_id, 0) + 1
        next_row[org_id] = len(book.get_sheet(0).get_rows())
        return book, next_row[org_id]

    def add_row(book, row, d):
        sheet = book.get_sheet(0)
        for i, f in enumerate(table['fields']):
            sheet.write(row, i, d[f['datastore_id']])

    for f in self.args[1:-1]:
        for d in DictReader(open(f, 'rb')):
            book, row = out_file(d['organization'])
            if not book:
                continue
            add_row(book, row, d)

    for org_id in output_files:
        close_write_file(org_id)

def iter_game(game):
    """Iterates through all phases of a game."""
    # raise appropriate exception if gamestate is empty
    if stat(game + ".gamestate").st_size == 0:
        raise ValueError("Game {} has empty gamestate, probably doesn't start "
                         "at the beginning.".format(game))

    if exists(game + ".press.tagged"):
        press_file = game + ".press.tagged"
    else:
        press_file = game + ".press"

    with open(game + ".gamestate", "rb") as f:
        game_state = list(DictReader(f))
    with open(press_file, "rb") as f:
        press = list(DictReader(f, encoding="latin1"))
    with open(game + ".results", "rb") as f:
        order_results = list(DictReader(f))

    nested_state = nested_by_phase(game_state, return_keys=False)
    nested_press = nested_by_phase(press, return_keys=False)
    nested_results, years, seasons, types = nested_by_phase(order_results)

    for year in years:
        for season in seasons:
            for phase_type in types:
                state = nested_state[year][season][phase_type]
                press = nested_press[year][season][phase_type]
                results = nested_results[year][season][phase_type]
                if any(len(k) for k in (state, press, results)):
                    yield (year, season, phase_type, state, press, results)

def __init__(self, fname):
    self.all = []
    self.full = {}
    self.groups = []
    self.lt2opencorpora = {}

    with open(fname, "r") as fp:
        r = DictReader(fp)

        for tag in r:
            # lemma form column represents the set of tags that a wordform
            # should have to be treated as a lemma.
            tag["lemma form"] = [
                _f for _f in map(str.strip, tag["lemma form"].split(","))
                if _f
            ]

            tag["divide by"] = [
                _f for _f in map(str.strip, tag["divide by"].split(","))
                if _f
            ]

            # opencorpora tags column maps LT tags to OpenCorpora tags
            # when possible
            tag["opencorpora tags"] = (tag["opencorpora tags"] or tag["name"])

            # Helper mapping
            self.lt2opencorpora[tag["name"]] = tag["opencorpora tags"]

            # Parent column links a tag to its group tag.
            # For example, the parent tag for noun is the POST tag;
            # the parent for m (masculine) is gndr (gender group)
            if not hasattr(self, tag["parent"]):
                setattr(self, tag["parent"], [])

            attr = getattr(self, tag["parent"])
            attr.append(tag["name"])

            # aux is our auxiliary tag to connect our group tags
            if tag["parent"] != "aux":
                self.all.append(tag["name"])

                # We store the order in which groups appear here to later
                # sort tags by their groups during export
                if tag["parent"] not in self.groups:
                    self.groups.append(tag["parent"])

            self.full[tag["name"]] = tag

def as_table(file, limit=None):
    try:
        sio = StringIO(file.data)
        reader = DictReader(sio)
        data = {'headers': None, 'rows': [], 'total': 0}
        for i, row in enumerate(reader):
            if data['headers'] is None:
                data['headers'] = row.keys()
            if limit is None or i < limit:
                rd = [row.get(k) for k in data['headers']]
                data['rows'].append(rd)
            data['total'] = i
        return data
    except CSVError as e:
        return {'status': 'error', 'error': unicode(e)}

def csv(fh):
    """Read a CSV file and return an iterator of normalised rows."""
    for row in DictReader(fh):
        data = {}
        for k, v in row.items():
            key = slugify(k, sep='_')
            if key is None:
                continue
            v = v.strip()
            if not len(v):
                v = None
            if key in data:
                log.warning("Duplicate column: %s", key)
            data[key] = v
        yield data

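# Illustrative call to the csv() normaliser above; 'input.csv' is a placeholder
# file name, not one referenced by the original code.
with open('input.csv') as fh:
    for record in csv(fh):
        # keys are slugified column names; empty cells become None
        print(record)
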
def mappings_import(file):
    """Load decided mappings from a CSV file."""
    for row in DictReader(file):
        left_uid = row.get('left')
        right_uid = row.get('right')
        judgement = parse_boolean(row.get('judgement'), default=None)
        score = None
        if judgement is None:
            left = Entity.get(left_uid)
            right = Entity.get(right_uid)
            score = left.compare(right)
        project.emit_judgement(left_uid, right_uid, judgement,
                               score=score, decided=True)

def scrape_csv(context, data):
    period = data.get("period")
    country = data.get("country")
    legislature = data.get("legislature")
    start_year = int(period.get('start_date')[:4])
    current_year = datetime.utcnow().year
    # Don't import the US 2nd continental congress (TM):
    if current_year - 10 > start_year:
        return
    res = context.http.get(period.get('csv_url'))
    with open(res.file_path, 'rb') as csvfile:
        for row in DictReader(csvfile):
            context.emit(data={
                "country": country,
                "legislature": legislature,
                "row": row,
            })

def simplecloud(fl, stopwords=stopwords):
    wordclouds = {}
    fl = open(fl)
    rd = DictReader(fl, encoding='utf-8')
    result = [row for row in rd if row['AU'] != '']
    for item in result:
        item['TI'] = item['TI'].lower().replace('(book)', '')
    ks = classify(result, end=2020)
    f = freqdst(ks, stopwords=stopwords)
    for k in ks:
        wordcloud = WordCloud(font_path='/Library/Fonts/Verdana.ttf',
                              relative_scaling=1.0,
                              stopwords=stopwords)
        tf = termfreq(f, k)
        wordcloud.generate_from_frequencies(tf)
        wordclouds[k] = wordcloud
    return wordclouds

def rows(self):
    try:
        with open(self.file_name, 'r') as fh:
            sample = fh.read(4096 * 10)
            encoding = guess_encoding(sample)
            if encoding != 'utf-8':
                log.info("Decode [%s]: %s", self.file_name, encoding)
                sample = sample.decode(encoding, 'replace')
            dialect = Sniffer().sniff(sample)
            fh.seek(0)
            for row in DictReader(
                    fh,
                    encoding=encoding,
                    delimiter=dialect.delimiter.encode(encoding)):
                yield row
    except Exception as exc:
        log.error('Failed reading file [%s]: %s', self.file_name, exc)

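# Hedged stand-in for the guess_encoding() helper used in rows() above. The
# real implementation is not shown here; a chardet-based guess is only an
# assumption for illustration.
import chardet

def guess_encoding(sample):
    # fall back to utf-8 when detection is inconclusive
    detected = chardet.detect(sample)
    return detected.get('encoding') or 'utf-8'
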
def unified_foreign_registry_import(self, request):
    if request.method == "GET":
        return render(
            request,
            "admin/core/company/unified_import.html",
            {"form": ForeignImportForm()},
        )

    if request.method == "POST":
        form = ForeignImportForm(request.POST, request.FILES)

        if not form.is_valid():
            return render(request,
                          "admin/core/company/unified_import.html",
                          {"form": form})

        created_records = 0
        updated_records = 0

        r = DictReader(request.FILES["csv"])
        importer = CompanyImporter(logger=MessagesLogger(request))
        conn_importer = Company2CountryImporter(
            logger=MessagesLogger(request))

        for entry in r:
            company, created = importer.get_or_create_from_unified_foreign_registry(
                entry)

            if not company:
                continue

            if created:
                created_records += 1
            else:
                updated_records += 1

            country_connection, _ = conn_importer.get_or_create(
                company, entry.get("country", "").strip(), "registered_in")

        # Ukrainian message: "Created %s companies, updated %s"
        self.message_user(
            request,
            "Створено %s компаній, оновлено %s"
            % (created_records, updated_records),
        )

        return redirect(reverse("admin:core_company_changelist"))

def carregar_regioes(filename):
    # carregar_regioes = "load regions": creates Regiao (region) and Bairro
    # (neighbourhood) records from a CSV file
    from cidadeiluminada.base import db
    from cidadeiluminada.protocolos.models import Bairro, Regiao

    with open(filename, 'r') as csvfile:
        csvreader = DictReader(csvfile)
        for row in csvreader:
            regiao_ = row['regiao']
            regiao = Regiao.query.filter_by(nome=regiao_).first()
            if not regiao:
                regiao = Regiao(nome=regiao_)
                db.session.add(regiao)
            bairro_ = row['bairro']
            bairro = Bairro.query.filter_by(nome=bairro_).first()
            if not bairro:
                bairro = Bairro(nome=bairro_)
                db.session.add(bairro)
            bairro.regiao = regiao
        db.session.commit()

def _get_csv_reader(self, *args, **kwargs):
    """Guess CSV dialect, and return CSV reader."""
    # Skip the first line, as csv headers are more likely to have weird
    # character distributions than the actual data.
    self.csvfile.readline()
    # Read a significant chunk of the data to improve the odds of
    # determining the dialect. MCM is often run on very wide csv files.
    dialect = Sniffer().sniff(self.csvfile.read(16384))
    self.csvfile.seek(0)

    if 'reader_type' not in kwargs:
        return DictReader(self.csvfile, errors='replace')
    else:
        reader_type = kwargs.pop('reader_type')
        return reader_type(self.csvfile, dialect, **kwargs)

def import_aliases(project, author, path):
    """ Import aliases from a CSV file. This will not create new entities,
    but re-name existing entities or merge two entities if one's name is
    given as an alias for the other. """
    with open(path, 'r') as fh:
        reader = DictReader(fh)
        for i, row in enumerate(reader):
            data = {}
            for k, v in row.items():
                k = k.lower().strip()
                data[k] = v
            assert 'canonical' in data, 'No "canonical" column!'
            assert 'alias' in data, 'No "alias" column!'
            entities.apply_alias(project, author, data.get('canonical'),
                                 data.get('alias'))
            if i % 1000 == 0:
                db.session.commit()
        db.session.commit()

def run(self, filename, state):
    faker = Faker()
    with open(filename) as csvfile:
        # id, title, description, length, need_finance,
        # one_day, type, experience, attendees, size
        reader = DictReader(csvfile)
        count = 0
        for row in reader:
            if Proposal.query.filter_by(title=row['title']).first():
                continue
            user = User('*****@*****.**' % count, faker.name())
            db.session.add(user)

            proposal = TalkProposal() if row['type'] == u'talk' else \
                WorkshopProposal() if row['type'] == u'workshop' else \
                InstallationProposal()

            proposal.state = state
            proposal.title = row['title']
            proposal.description = row['description']

            proposal.one_day = True if row.get('one_day') == 't' else False
            proposal.needs_money = True if row.get('need_finance') == 't' else False

            if row['type'] == 'talk':
                proposal.length = row['length']
            elif row['type'] == 'workshop':
                proposal.length = row['length']
                proposal.attendees = row['attendees']
            else:
                proposal.size = row['size']

            proposal.user = user
            db.session.add(proposal)

            db.session.commit()
            count += 1

    app.logger.info('Imported %s proposals' % count)

"screen_name", "sid", "statuses_count", "text", "time_zone", "uid", "user.name", "utc_offset", "verified", "trainingLabel", ] html_parser = HTMLParser.HTMLParser() with open(mturk_labeled_filename, 'rb') as mturk_labeled_file_handle: mturk_labeled_data_reader = DictReader( mturk_labeled_file_handle, fieldnames=header, encoding='utf-8') # skip first mturk_labeled_data_reader.next() # Dictionary to count flags flag_count_on_tweets = {} for hit in mturk_labeled_data_reader: if hit["AssignmentStatus"] != "Approved": continue tweet_id = hit['Input.id'] answer = hit['Answer.Q3Answer'] if tweet_id not in flag_count_on_tweets: flag_count_on_tweets[tweet_id] = 0 if answer != 'N/A': flag_count_on_tweets[tweet_id] += 1 counter = {0: 0, 1: 0, 2: 0, 3: 0} with codecs.open(line_separated_tweets_json_file_name, 'r', 'utf8') as line_separated_tweets_handle: