def handle_noargs(self, **options):
    # Import candidates from a CSV file: validate the inputs, verify every
    # referenced party exists, ensure positions exist, then create any
    # person not already present.  NOTE(review): Python 2 code.
    global YEAR, COMMIT
    YEAR = options['year']
    COMMIT = options['commit']
    if not options['candidates'] or not os.path.exists(options['candidates']):
        print >> sys.stderr, "The candidates file doesn't exist"
        sys.exit(1)
    if not YEAR:
        print >> sys.stderr, "You must specify a year"
        sys.exit(1)
    #check all the parties exist
    with open(options['candidates'], 'rb') as csvfile:
        candidiates = unicodecsv.reader(csvfile)  # sic: misspelled local
        missingparties = False
        lastmissingparty = ''
        for row in candidiates:
            if not get_party(row[0]):
                # report each missing party once for consecutive rows
                if row[0] != lastmissingparty:
                    print 'Missing party:', row[0]
                    lastmissingparty = row[0]
                missingparties = True
    if missingparties:
        sys.exit(1)
    #check whether the positions exist, otherwise create them
    check_or_create_positions()
    # second pass: create anybody the search cannot find
    with open(options['candidates'], 'rb') as csvfile:
        candidiates = unicodecsv.reader(csvfile)
        for row in candidiates:
            # NOTE(review): column order passed to search()/add_new_person()
            # differs from file order -- confirm against the csv schema.
            if not search(row[3], row[4], row[0], row[2], row[1]):
                add_new_person(row[0], row[2], row[1], row[3], row[4])
def _get_py(self, key):
    # Resolve *key* against the CSV contents.  A 2-tuple key means
    # (row selector, column selector): rows are fetched recursively, then
    # the column part is applied with itemgetter.  Scalar / list / slice
    # keys select whole rows.  NOTE(review): Python 2 era code
    # ("map" here is expected to return a list).
    if isinstance(key, tuple):
        assert len(key) == 2
        result = self._get_py(key[0])
        if isinstance(key[1], list):
            getter = itemgetter(*key[1])
        else:
            getter = itemgetter(key[1])
        if isinstance(key[0], (list, slice)):
            # many rows selected -> apply the column getter to each row
            return map(getter, result)
        else:
            return getter(result)
    f = self.open(self.path)
    if self.header:
        next(f)  # skip the header line
    if isinstance(key, compatibility._inttypes):
        line = nth(key, f)
        result = next(csv.reader([line], **self.dialect))
    elif isinstance(key, list):
        lines = nth_list(key, f)
        result = csv.reader(lines, **self.dialect)
    elif isinstance(key, slice):
        start, stop, step = key.start, key.stop, key.step
        result = csv.reader(it.islice(f, start, stop, step), **self.dialect)
    else:
        raise IndexError("key '%r' is not valid" % key)
    try:
        # Close eagerly only when the result is fully materialised; lazy
        # (iterator) results must keep the handle open until consumed.
        if not isinstance(result, Iterator):
            f.close()
    except AttributeError:
        pass
    return result
def open_file(infile, informat='raw', encoding="utf-8", **kwargs):
    """Open *infile* (a path or a file-like object) according to *informat*.

    Supported formats: Excel ('xls'/'vnd.ms-excel' via xlrd), 'xml'
    (etree), 'csv' (unicode-aware csv reader) and anything else as plain
    text.  Returns whatever handle/parse object the chosen backend
    produces.  NOTE(review): Python 2 code (``basestring``).
    """
    logger.debug('Opening file: {}'.format(infile))
    if isinstance(infile, basestring):
        # Treat the argument as a path on disk.
        if informat == "vnd.ms-excel" or informat == 'xls':
            import xlrd  # imported lazily: optional dependency
            logger.debug('An office file!')
            f = xlrd.open_workbook(infile, on_demand=True)
        elif informat == "xml":
            logger.debug('An XML file!')
            f = etree.parse(infile)
        elif informat == "csv":
            logger.debug('Opening as csv')
            f = csv.reader(open(infile, 'r'), encoding=encoding, **kwargs)
        else:
            f = codecs.open(infile, 'r', encoding)
    else:
        # Already an open file-like object.
        if informat == "vnd.ms-excel" or informat == 'xls':
            import xlrd
            logger.debug('An office file!')
            f = xlrd.open_workbook(file_contents=infile.read(), on_demand=True)
        elif informat == "xml":
            logger.debug('An XML file!')
            f = etree.fromstring(infile)
        elif informat == "csv":
            logger.debug("CSV file")
            f = csv.reader(infile, encoding=encoding, **kwargs)
        else:
            # decode the stream lazily, line by line
            f = codecs.iterdecode(iter(infile.readline, ""), encoding)
    return f
def __init__(self,file=None): filesniff = open(file) try: dialect = unicodecsv.Sniffer().sniff(filesniff.read(1024)) wb = unicodecsv.reader(open(file),dialect,encoding='utf-8') except Exception: wb = unicodecsv.reader(open(file),delimiter=',',encoding='utf-8') self.wb = wb reader = wb rows = [] columns = [] # # for rownum in range(sh1.nrows): # sh1.nrows -> number of rows (ncols -> num columns) # rows.append(sh1.row_values(rownum)) for row in reader: rows.append(row) print rows columns = self.columnsExtract(rows) print columns res = Generator().main(rows=rows,columns=columns) self.res = res
def getBarChartData():
    """Read artists.csv and albums.csv and return bar-chart data.

    Returns a tuple (x_values, y_values, artist_names): the decades
    (1900-2010 inclusive), the number of albums released in each decade,
    and the list of artist names.
    """
    f_artists = open('artists.csv')
    f_albums = open('albums.csv')
    artists_rows = csv.reader(f_artists)
    albums_rows = csv.reader(f_albums)
    # BUG FIX: reader.next() is Python-2 only; the next() builtin works on
    # both Python 2 and 3.  These two calls skip the header rows.
    artists_header = next(artists_rows)
    albums_header = next(albums_rows)
    artist_names = []
    decades = list(range(1900, 2020, 10))
    decade_dict = {decade: 0 for decade in decades}  # albums per decade
    for artist_row in artists_rows:
        if not artist_row:  # skip blank lines
            continue
        artist_id, name, followers, popularity = artist_row
        artist_names.append(name)
    for album_row in albums_rows:
        if not album_row:
            continue
        artist_id, album_id, album_name, year, popularity = album_row
        # count the album in the decade containing its release year
        for decade in decades:
            if int(year) >= decade and int(year) < decade + 10:
                decade_dict[decade] += 1
                break
    # BUG FIX: the original leaked both file handles.
    f_artists.close()
    f_albums.close()
    x_values = decades
    y_values = [decade_dict[d] for d in decades]
    return x_values, y_values, artist_names
def processData():
    """Filter MANUAL_RAW / YES_RAW csv files into *_PROCESSED outputs.

    Rows whose column 6 appears in the corresponding ignore list are
    dropped; the module-level counters track ignored/kept row counts.
    """
    global manualIgnoreRecords
    global yesIgnoreRecords
    global manualProcessedRecords
    global yesProcessedRecords
    dirpath = parentdir + "/R3_profiles_YNNM_raw/"
    with open(dirpath + 'MANUAL_RAW.csv', 'r') as infile, open(processeddir + 'MANUAL_PROCESSED.csv', 'ab') as outfile:
        rows = unicodecsv.reader(infile, delimiter=';', encoding='utf-8')
        writer = unicodecsv.writer(outfile, delimiter=';', encoding='utf-8')
        for row in rows:
            if row[6] in manual_ignore_list:
                manualIgnoreRecords += 1  # ignore it
                continue
            manualProcessedRecords += 1
            writer.writerow(row)
    with open(dirpath + 'YES_RAW.csv', 'r') as infile, open(processeddir + 'YES_PROCESSED.csv', 'ab') as outfile:
        rows = unicodecsv.reader(infile, delimiter=';', encoding='utf-8')
        writer = unicodecsv.writer(outfile, delimiter=';', encoding='utf-8')
        for row in rows:
            if row[6] in yes_ignore_list:
                yesIgnoreRecords += 1  # ignore it
                continue
            # BUG FIX: the original evaluated ``yesProcessedRecords`` as a
            # bare expression, so the processed counter never advanced.
            yesProcessedRecords += 1
            writer.writerow(row)
def _get_headers(self, resource): """ Get CSV file headers from the provided resource. """ # If the resource is a file we just open it up with the csv # reader (after being sure we're reading from the beginning # of the file if type(resource) == file: resource.seek(0) reader = csv.reader(resource) # If the resource is a basestring it is either a url or a file # location, so similarly to the specification mechanism we either # access it with an HTTP get request or by opening the file. elif isinstance(resource, basestring): result = six.moves.urllib.parse.urlparse(resource) if result.scheme in ['http', 'https']: with closing(requests.get(resource, stream=True)) as response: # Headers are alway the first row of a CSV file # so it's enought to just get the first line and # hopefully save bandwidth header_row = response.iter_lines().next() else: # It may seem weird to open up a csv file, read its header row # and then StringIO that into a new csv reader but this file # we want to close and we want the same interface for all with open(resource) as resource_file: reader = csv.reader(resource_file) header_row = reader.next() reader = csv.reader(cStringIO.StringIO(header_row)) else: raise IOError('Resource type not supported') return reader.next()
def output_lfc_diff_using_csv(path_to_csv_earlier, path_to_csv_later, output_path):
    """Write a CSV of per-account deltas between two snapshots.

    Each input row is (account_name, value).  Accounts present only in the
    later snapshot count with their full value; only positive diffs are
    written, sorted descending by delta.  NOTE(review): Python 2 code
    (``long``, list.sort on dict.items()); the ``errors='ignore'`` kwarg
    implies a unicode-aware csv reader -- confirm which csv is imported.
    """
    fleet_time_a = {}
    fleet_time_b = {}
    fleet_diff = {}
    with open(path_to_csv_earlier, 'rUb') as csvfile:
        reader = csv.reader(csvfile, errors='ignore')
        for row in reader:
            if len(row) >= 2:
                fleet_time_a[row[0].strip()] = long(row[1].strip())
    with open(path_to_csv_later, 'rUb') as csvfile:
        reader = csv.reader(csvfile, errors='ignore')
        for row in reader:
            if len(row) >= 2:
                fleet_time_b[row[0].strip()] = long(row[1].strip())
    for account_name in fleet_time_b.keys():
        if account_name in fleet_time_a:
            lfc_diff = fleet_time_b[account_name] - fleet_time_a[account_name]
        else:
            # account is new in the later snapshot: count the whole value
            lfc_diff = fleet_time_b[account_name]
        if lfc_diff > 0:
            fleet_diff[account_name] = lfc_diff
    fleet_diff_output = fleet_diff.items()
    fleet_diff_output.sort(key=lambda account_tuple: account_tuple[1], reverse=True)
    with open(output_path, 'wb') as csvfile:
        cwriter = csv.writer(csvfile)
        for account_tuple in fleet_diff_output:
            cwriter.writerow(account_tuple)
def validate_and_return_rows(csv_file_form=None, csv_file=None, has_header_row=True, required_fields=None):
    """
    Opens a CSV file and optionally checks for required fields in the
    header.  Returns the rows of the CSV and a list of the headers as a
    tuple: (csv_rows, headers)
    """
    # BUG FIX: ``required_fields=[]`` was a mutable default argument
    # shared across calls; use the None-sentinel idiom instead.
    if required_fields is None:
        required_fields = []
    if csv_file_form:
        csv_file = csv_file_form.cleaned_data['file']
        has_header_row = csv_file_form.cleaned_data['has_header_row']
    if not csv_file:
        raise Exception("Pass in CsvFileForm instance or csv_file=request.FILES.get('file')")
    column_headers = []
    if has_header_row:
        # Read and store the header row column names
        r = unicodecsv.reader(csv_file.read().splitlines(), encoding='utf-8')
        column_headers = list(next(r))  # next() builtin: py2/py3 safe
        # Check for required fields (optional check)
        for field in required_fields:
            if field not in column_headers:
                raise Exception("Invalid CSV file. Must contain %s." % field)
        csv_file.seek(0)  # rewind so the returned reader sees every row
    return (unicodecsv.reader(csv_file.read().splitlines(), encoding='utf-8'), column_headers)
def patchsql(sys_args):
    """Patch a database table from a daff-style patch CSV, or from the
    diff of two CSV files given with --follow."""
    parser = argparse.ArgumentParser(description='Patch a database.')
    parser.add_argument('url', help='Sqlalchemy-compatible database url')
    parser.add_argument('--patch', nargs=1, required=False, default=None,
                        help="A csv file describing the patch. In the "
                        "format output by daff.")
    parser.add_argument('--follow', nargs=2, required=False, default=None,
                        help="An alternative to --patch option. Specify"
                        "two csv files to compare, and patch from their diff.")
    parser.add_argument('--table', nargs=1, required=True, default=None,
                        help='Table to which patch should be applied.')
    parser.add_argument('--safe-null', required=False, action='store_true',
                        help='Decode nulls in a reversible way.')
    parser.add_argument('--quiet', required=False, action='store_true',
                        help='Do not show computed diff.')
    args = parser.parse_args(sys_args)
    url = args.url
    tables = args.table
    db = SqlAlchemyDatabase(url)
    st = daff.SqlTable(db, daff.SqlTableName(tables[0]))
    patch = None
    if args.patch:
        with open(args.patch[0], 'rt') as fin:
            patch = list(csv.reader(fin))
        patch = daff.Coopy.tablify(patch)
    if args.follow:
        with open(args.follow[0], 'rt') as fin:
            table0 = list(csv.reader(fin))
            fix_nulls(table0, args.safe_null)
        with open(args.follow[1], 'rt') as fin:
            table1 = list(csv.reader(fin))
            fix_nulls(table1, args.safe_null)
        patch = daff.Coopy.diff(table0, table1)
        ansi_patch = daff.Coopy.diffAsAnsi(table0, table1)
        if not args.quiet:
            print(ansi_patch, file=sys.stderr, end='')
    if not patch:
        raise KeyError('please specify either --patch or --follow')
    daff_patch = daff.HighlightPatch(st, patch)
    daff_patch.apply()
    if db.events['skips'] != 0:
        # BUG FIX: ``file=sys.stderr`` was inside str.format()'s argument
        # list, where format() silently swallowed it and the message went
        # to stdout; it now goes to print() so the output hits stderr.
        print(" * {}".format(json.dumps(db.events)), file=sys.stderr)
def compare_csvs(self, csv1, csv2):
    """Assert that the two given CSV files contain the same set of rows
    (ignoring row order)."""
    with open(csv1, "rb") as handle_one, open(csv2, "rb") as handle_two:
        reader_one = unicodecsv.reader(handle_one, delimiter=str(","), quotechar=str('"'))
        reader_two = unicodecsv.reader(handle_two, delimiter=str(","), quotechar=str('"'))
        # Re-join each parsed row so rows compare as single strings.
        joined_one = {",".join(row) for row in reader_one}
        joined_two = {",".join(row) for row in reader_two}
        self.assertEqual(len(joined_one), len(joined_two))
        self.assertEqual(joined_one, joined_two)
def main(args):
    """Merge crowd-sourcing results with the input file and emit the lines
    still requiring judgments, grouped by how many are needed.
    NOTE(review): Python 2 code (.next(), print statements, xrange).
    """
    finished = defaultdict(int)
    input_lines = []
    skipped = defaultdict(int)
    written = defaultdict(int)
    # load input data
    with args.input_file as fh:
        csvread = csv.reader(fh, delimiter=str(args.input_csv_delim),
                             quotechar=b'"', encoding="UTF-8")
        columns = DataLine.get_columns_from_header(csvread.next())
        for row in csvread:
            input_lines.append(DataLine.from_csv_line(row, columns))
    # load all results files provided
    for finished_file in args.finished_files:
        with finished_file as fh:
            csvread = csv.reader(fh, delimiter=str(args.finished_csv_delim),
                                 quotechar=b'"', encoding="UTF-8")
            header = csvread.next();
            columns = DataLine.get_columns_from_header(header)
            try:
                judgment_column = header.index('check_result')
            except ValueError:
                judgment_column = None
            for row in csvread:
                # treat rejected as unfinished
                if judgment_column is not None and row[judgment_column].startswith('N'):
                    continue
                # keep track of how many judgments are finished in the results
                finished_line = DataLine.from_csv_line(row, columns)
                finished[finished_line.signature] += 1
    print >> sys.stderr, "Loaded input: %d, Loaded finished: %d" % (len(input_lines), len(finished))
    with sys.stdout as fh:
        # starting with the header
        csvwrite = csv.writer(fh, delimiter=b"\t", lineterminator="\n", encoding="UTF-8")
        csvwrite.writerow(DataLine.get_headers())
        # write rows requiring different number of judgments,
        # starting from the most judgments
        for judg_req in xrange(args.num_judgments, 0, -1):
            csvwrite.writerow(("# Requiring %d judgments" % judg_req,))
            for line in input_lines:
                # a line needing judg_req more judgments has exactly
                # (num_judgments - judg_req) finished already
                if finished[line.signature] != args.num_judgments - judg_req:
                    skipped[judg_req] += 1
                    continue
                csvwrite.writerow(line.as_tuple())
                written[judg_req] += 1
            print >> sys.stderr, ("%d judgments -- written: %d" % (judg_req, written[judg_req]))
    print >> sys.stderr, "Skipped: %d" % (len(input_lines) - sum(written.values()))
def main():
    """Build melon.db: create the Customers and Orders tables and load
    them from customers.csv / orders.csv.  NOTE(review): Python 2 code."""
    #Connect to database file (Note: can also pass as a file using sys)
    CONN = sqlite3.connect('melon.db')
    #This cursor object passes commands from python and executes them in the sqlite3 melon.db
    DB = CONN.cursor()
    #Deletes tables if they exist. Good if I made a mistake creating them.
    DB.execute('''DROP TABLE IF EXISTS Customers;''')
    DB.execute('''DROP TABLE IF EXISTS Orders;''')
    #Create 2 tables of Customers & Orders
    DB.execute('''CREATE TABLE Customers (customer_id INTEGER PRIMARY KEY NOT NULL, first varchar(30), last varchar(30), email varchar(60), telephone varchar(30), called DATE);''')
    #Note: Foreign key & Reference needs to be stated at the end of the create table dialog.
    DB.execute('''CREATE TABLE Orders (order_id INTEGER PRIMARY KEY NOT NULL, order_date DATE,status varchar(30), customer_id INTEGER,email varchar(60),address varchar(30),city varchar(30),state varchar(30),postalcode varchar(30),num_watermelons INTEGER,num_othermelons INTEGER ,subtotal INTEGER ,tax INTEGER ,order_total INTEGER, FOREIGN KEY (customer_id) REFERENCES Customers(customer_id)); ''')
    #CSV reader reads each line and strips, splits each line into a list object. :)
    # unicodecsv handles the non-ascii characters that appear in the data.
    f1reader = unicodecsv.reader(open('customers.csv'), encoding='utf-8')
    f2reader = unicodecsv.reader(open('orders.csv'), encoding='utf-8')
    #Next skips collumn names // headers of csv file
    next(f1reader)
    next(f2reader)
    for row in f1reader:
        # NOTE(review): executemany() with a single parameter sequence;
        # a plain execute(..., row) would express the same thing.
        DB.executemany('''INSERT INTO customers (customer_id, first, last, email, telephone, called) VALUES(?, ?, ?, ?, ?, ?);''', (row, ))
    for row2 in f2reader:
        DB.executemany('''INSERT INTO orders (order_id, order_date, status, customer_id, email, address, city, state, postalcode, num_watermelons, num_othermelons, subtotal, tax, order_total) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);''', (row2, ))
    #Change empty string row values to Null.
    # DB.executemany('''UPDATE orders SET VALUES
    # ''')
    CONN.commit()
    CONN.close()
def _read_guru_csv(path):
    # Shared reader configuration for all SMS Guru csv files.
    with open(path, 'rb') as csvfile:
        return list(unicodecsv.reader(csvfile, delimiter=",",
                                      quoting=unicodecsv.QUOTE_ALL,
                                      escapechar="\\", encoding='utf-8'))


def read_files(qfile, qcatfile, catfile):
    """ read from .csv files
    qfile - .csv file containing the SMS Guru questions
    qcatfile - .csv file containing the relation between questions and category
    catfile - .csv file containing the categories

    Returns (question_train, question_category_train, category), each a
    list of parsed csv rows.
    """
    # The three files share one reader configuration -- read them through
    # a single helper instead of triplicating the with/open/reader block.
    question_train = _read_guru_csv(qfile)
    question_category_train = _read_guru_csv(qcatfile)
    category = _read_guru_csv(catfile)
    return question_train, question_category_train, category
def _read_sms_guru_csv(path):
    # Shared reader configuration for all SMS Guru csv files.
    with open(path, 'rb') as csvfile:
        return list(unicodecsv.reader(csvfile, delimiter=",",
                                      quoting=unicodecsv.QUOTE_ALL,
                                      escapechar="\\", encoding='utf-8'))


def read_files(qfile_train, qfile_test, catfile):
    """ read from .csv files
    qfile_train - .csv file containing the SMS Guru questions for the train set
    qfile_test - .csv file containing the SMS Guru questions for the test set
    catfile - .csv file containing the categories

    Returns (question_train, question_test, category), each a list of
    parsed csv rows.
    """
    # De-duplicated: all three files are read with the same configuration.
    question_train = _read_sms_guru_csv(qfile_train)
    question_test = _read_sms_guru_csv(qfile_test)
    category = _read_sms_guru_csv(catfile)
    return question_train, question_test, category
def main(wordlist1, wordlist2, dist_funcs):
    """Optimally align two (gloss, word) CSV lists by the given distance
    functions using the Hungarian algorithm, then print the score.
    NOTE(review): csv.reader(..., encoding=...) implies unicodecsv.
    """
    with open(wordlist1, 'rb') as file_a, open(wordlist2, 'rb') as file_b:
        reader_a = csv.reader(file_a, encoding='utf-8')
        reader_b = csv.reader(file_b, encoding='utf-8')
        print('Reading word lists...')
        # each csv row is (gloss, word); flip to (word, gloss) and pair the
        # two lists row-by-row
        words = list(zip([(w, g) for (g, w) in reader_a],
                         [(w, g) for (g, w) in reader_b]))
        # drop pairs where either side is empty/falsy
        words_a, words_b = list(zip(*[(a, b) for (a, b) in words if a and b]))
        print('Constructing cost matrix...')
        matrix = construct_cost_matrix(words_a, words_b, dist_funcs)
        m = munkres.Munkres()
        print('Computing matrix using Hungarian Algorithm...')
        indices = m.compute(matrix)
        print(score(indices))
        print('Done.')
def process(self):
    # Parse a PhishTank CSV feed report into phishing events and send
    # them downstream, then acknowledge the incoming message.
    report = self.receive_message()
    if report:
        event = Event()
        # NOTE(review): a single Event instance is created before the row
        # loop, so fields accumulate across rows -- confirm this is the
        # intended behaviour rather than one fresh Event per row.
        # positional mapping of feed columns onto event keys
        columns = ["__IGNORE__", "source_url", "description_url",
                   "source_time", "__IGNORE__", "__IGNORE__", "__IGNORE__",
                   "target"]
        for row in unicodecsv.reader(StringIO(report), encoding='utf-8'):
            if "phish_id" in row:
                continue  # header row
            for key, value in zip(columns, row):
                if key == "__IGNORE__":
                    continue
                event.add(key, value.strip())
            event.add('feed', 'phishtank')
            event.add('type', 'phishing')
            event = utils.parse_source_time(event, "source_time")
            event = utils.generate_observation_time(event, "observation_time")
            event = utils.generate_reported_fields(event)
            self.send_message(event)
        self.acknowledge_message()
def writeUniqueResults(clustered_dupes, input_file, output_file):
    # Write our original data back out to a CSV with a new column called
    # 'Cluster ID' which indicates which records refer to each other --
    # keeping only the FIRST row of each cluster (hence "unique").
    logging.info('saving unique results to: %s' % output_file)
    # Map record id -> index of the cluster that contains it.
    membership = {}
    for cluster_index, members in enumerate(clustered_dupes):
        for record_id in members:
            membership[record_id] = cluster_index
        next_singleton_id = cluster_index + 1
    writer = csv.writer(output_file)
    reader = csv.reader(StringIO(input_file))
    header = next(reader)
    writer.writerow([u'Cluster ID'] + header)
    emitted = set()
    for row_index, row in enumerate(reader):
        if row_index in membership:
            cid = membership[row_index]
            if cid in emitted:
                continue  # later members of a cluster are dropped
            emitted.add(cid)
        else:
            # unclustered record: give it its own fresh cluster id
            cid = next_singleton_id
            next_singleton_id += 1
        writer.writerow([cid] + row)
def from_csv(self,path=None,add=False): """Reads a previously constructed CITES<->NCBI CSV mapping database.""" # open the file and read the .csv logging.debug('Going to read csv file "%s"' % path) with open(path, 'rb') as csvfile: read = csv.reader(csvfile, encoding='utf-8', delimiter=',', quotechar='"') for line in read: # store the date if line[0] == u'Date': self.date = line[1] logging.debug('Stored date: "%s"' % self.date) # skip comment lines elif re.match('^#', line[0]): logging.debug('Skipping comment line: %s' % line) continue # ncbi taxid, name, description, ncbi name, CITES appendix else: taxid = line[0] name = line[1] desc = line[2] canon = line[3] app = line[4] self.taxa.append( Taxon( name=name, description=desc, appendix=app, ncbi={taxid:canon} ) ) logging.debug('Instantiated "%s" with {%s:%s}' % (name,taxid,canon))
def main():
    # Tally CrowdFlower pairwise-naturalness votes per system and run a
    # bootstrap significance test.  NOTE(review): Python 2 code
    # (.next(), iteritems, print statement).
    rnd.seed(1206)
    ap = ArgumentParser()
    # TODO use more files ?
    ap.add_argument('-b', '--bootstrap-iters', type=int, default=1000)
    ap.add_argument('cf_output', type=str, help='crowdflower results file')
    args = ap.parse_args()
    votes = defaultdict(int)
    res = []
    with open(args.cf_output, 'rb') as fh:
        csvread = csv.reader(fh, delimiter=b',', quotechar=b'"', encoding="UTF-8")
        headers = csvread.next()
        for row in csvread:
            row = Result(row, headers)
            if row._golden == 'true':
                # skip test questions
                continue
            # "A less than B" is a vote for the system behind B, and
            # "A more than B" a vote for the system behind A
            if row.more_natural == 'A less than B':
                votes[row.origin_b] += 1
                res.append(row.origin_b)
            elif row.more_natural == 'A more than B':
                votes[row.origin_a] += 1
                res.append(row.origin_a)
    for key, val in votes.iteritems():
        print '%s\t%d (%2.2f)' % (key, val, float(val) / len(res) * 100)
    pairwise_bootstrap(res, args.bootstrap_iters)
def test_csv_export(self):
    """Ensures exported CSV data matches source data"""
    # NOTE(review): Python 2 code (reader.next(), unicode()).
    qs_filter = {
        "pk__in": [x.pk for x in self.snapshots]
    }
    qs = BuildingSnapshot.objects.filter(**qs_filter)
    fields = list(_get_fields_from_queryset(qs))
    fields.append("canonical_building__id")
    export_filename = export_csv(qs, fields)
    export_file = open(export_filename)
    reader = csv.reader(export_file)
    header = reader.next()
    # the canonical_building__id column is exported under the label 'ID'
    self.assertEqual(header[len(fields) - 1], 'ID')
    for i in range(len(self.snapshots)):
        row = reader.next()
        for j in range(len(fields)):
            field = fields[j]
            # "a__b__c" means follow attributes a.b.c on the instance
            components = field.split("__")
            qs_val = qs[i]
            for component in components:
                qs_val = getattr(qs_val, component)
                if qs_val == None:
                    break
            # related managers / missing values export as empty string
            if isinstance(qs_val, Manager) or qs_val == None:
                qs_val = u''
            else:
                qs_val = unicode(qs_val)
            csv_val = row[j]
            self.assertEqual(qs_val, csv_val)
    export_file.close()
    os.remove(export_filename)
def handle_label(self, label, **options):
    # Sync party Organisations against a csv of (slug, name) rows:
    # rename changed parties, create missing ones.
    # NOTE(review): Python 2 code (print statements).
    verbosity = int(options['verbosity'])
    if not os.path.exists(label):
        print >> sys.stderr, "The parties file doesn't exist",
        sys.exit(1)
    #get the party kind object
    partykind = OrganisationKind.objects.get(slug='party')
    #check each party by checking against slug
    with open(label, 'rb') as csvfile:
        parties = unicodecsv.reader(csvfile)
        for slug, name in parties:
            try:
                party = Organisation.objects.get(slug=slug)
                if party.name != name:
                    if verbosity >= 1:
                        print 'Updating party %s from %s to %s' % (slug, party.name, name)
                    party.name = name
                    party.save()
            except Organisation.DoesNotExist:
                #we need to add the party
                if verbosity >= 1:
                    print 'Adding party %s' % name
                Organisation.objects.create(
                    name = name,
                    slug = slug,
                    kind = partykind)
def read_db_csvfile( file_name, delim = "|" ):
    """Parse a cp1252-encoded bank csv export.

    Returns (transactions, mc_transactions): regular entries and
    mastercard entries, each cleaned via _clean_transaction().
    NOTE(review): the open() handle is never closed.
    """
    # read the file
    db_reader = unicodecsv.reader( open(file_name), delimiter=delim.encode('utf-8'), encoding='cp1252')
    transactions = []
    mc_transactions = []
    for row in db_reader:
        LOGGER.debug("row: %r", row)
        # skip empty rows
        if not row:
            continue
        # drop the French "expense summary: credit card" banner line
        if row[1].startswith("D\xe9compte\xa0des\xa0d\xe9penses:\xa0carte\xa0de\xa0cr\xe9dit "):
            LOGGER.debug('"Décompte\xa0des\xa0dépenses" line dropped.')
        else:
            transaction, is_mastercard_transaction = _process_db_csv_entry( row )
            if is_mastercard_transaction:
                mc_transactions.append( _clean_transaction(transaction) )
                #LOGGER.debug("mastercard transaction: %r", transaction)
            else:
                transactions.append( _clean_transaction(transaction) )
                #LOGGER.debug("transaction: %r", transaction)
                #pprint(transaction)
    return transactions, mc_transactions
def create_fixtures():
    # Load the bundled Open News fixture data (schema + embedded csv) into
    # a fresh loader project: fellows, news organizations, and the
    # fellowship relations linking them.
    loader = Loader('opennews', project_label='Open News',
                    project_settings={}, source_url=DEFAULT_SOURCE_URL)
    import_schema(loader.project, StringIO(SCHEMATA))
    reader = unicodecsv.reader(StringIO(DATA))
    reader.next()  # skip the header row (py2-style iterator API)
    for record in reader:
        # columns: 0=fellow name, 1=twitter handle, 2=start date,
        # 3=end date, 4=org name, 5=org url
        fellow = loader.make_entity(['fellow'])
        fellow.set('name', record[0])
        fellow.set('twitter_handle', record[1])
        fellow.save()
        news_org = loader.make_entity(['news_organization'])
        news_org.set('name', record[4])
        news_org.set('url', record[5])
        news_org.save()
        fellowship = loader.make_relation('fellowship', fellow, news_org)
        fellowship.set('start_date', record[2])
        fellowship.set('end_date', record[3])
        fellowship.save()
    loader.persist()
def iter_rows(building_data):
    """
    Opens the given file-like object as a CSV file delimited by pipes,
    and yields a dict for each row containing the land plot number,
    building id, street address, post code, and longitude/latitude.
    (Doc fix: the original docstring said "list" but dicts are yielded.)
    """
    reader = unicodecsv.reader(building_data, delimiter="|", encoding="latin1")
    # BUG FIX: reader.next() is Python-2 only; the next() builtin works on
    # both Python 2 and 3.  Skips the header row.
    next(reader)
    for row in reader:
        land_plot_number = int(row[3])
        building_id = int(row[4])
        street_address = u" ".join(row[5].split())  # Normalise whitespace.
        try:
            post_code = int(row[7])
        except ValueError:
            post_code = None
        # coordinates use comma decimal separators; convert ISN93 -> WGS84
        isn93_x = float(row[22].replace(",", "."))
        isn93_y = float(row[23].replace(",", "."))
        longitude, latitude = isnet93_to_wgs84(isn93_x, isn93_y)
        yield {
            "landnr": land_plot_number,
            "heitinr": building_id,
            "street": street_address,
            "postcode": post_code,
            "ll": (longitude, latitude),
        }
def process_casen():
    # Load the CASEN 2014 survey csv and store one row per (comuna, dato)
    # pair into the scraperwiki sqlite table 'datos_comuna'.
    # NOTE(review): Python 2 code (r.next()); the string-interpolated SQL
    # select assumes trusted comuna names.
    f = open('casen_2014.csv', 'r')
    r = unicodecsv.reader(f, encoding='utf-8')
    comunas_names = r.next()  # header row; columns 3+ are comuna names
    datos_comuna = {}
    for i in range(3, len(comunas_names)):
        comuna = comunas_names[i].upper()
        # look up the comuna's id in the previously-scraped 'data' table
        result = scraperwiki.sqlite.select('id from data where muni="%s"' % (comuna, ))
        id_ = result[0].get('id')
        datos_comuna[i] = {'comuna_id': id_, 'comuna_name': comuna, 'dato': []}
    dato_counter = 0
    for datos in r:
        dato_counter += 1
        # column 1 names the indicator; columns 3+ hold per-comuna values
        for j in range(3, len(comunas_names)):
            datos_comuna[j]['dato'].append({
                'id': dato_counter,
                'dato_name': datos[1].strip(),
                'value': datos[j]
            })
    # flatten to one sqlite row per (comuna, dato), re-numbering ids
    dato_counter = 0
    for a in datos_comuna:
        for b in datos_comuna[a]['dato']:
            final = {'id': dato_counter,
                     'id_muni': datos_comuna[a]['comuna_id'],
                     'dato_name': b['dato_name'],
                     'value': b['value']
                     }
            dato_counter += 1
            scraperwiki.sqlite.save(unique_keys=['id'], data=final, table_name='datos_comuna')
def read_tsv(cls, path, encoding="utf-8"):
    """Read a gene set database from a tab-delimited text file.

    Parameters
    ----------
    path: str
        The path name of the file.
    encoding: str
        The encoding of the text file.

    Returns
    -------
    An instance of this class wrapping the parsed gene sets.
    """
    gene_sets = []
    with open(path, "rb") as handle:
        for record in csv.reader(handle, dialect="excel-tab", encoding=encoding):
            gene_sets.append(GeneSet.from_list(record))
    logger.debug("Read %d gene sets.", len(gene_sets))
    logger.debug("Size of gene set list: %d", len(gene_sets))
    return cls(gene_sets)
def from_dump(self,path): """Reads a downloaded CITES database dump.""" # open the file and read the .csv logging.info('Reading the CITES data dump.') header = {} with open(path, 'rb') as csvfile: read = csv.reader(csvfile, delimiter=',', quotechar='"', encoding='utf-8') for line in read: if not header: header = line continue else: record = {} for idx, val in enumerate(header): record[header[idx]] = line[idx] if record[u'CitesAccepted'] == u'true': app = { u'I':1, u'II':2, u'III':3 } for i in record[u'CurrentListing'].split(u'/'): if i in app: taxon = Taxon( name=record[u'FullName'], description=record[u'AnnotationEnglish'], appendix=unicode(app[i]) ) self.taxa.append(taxon)
def __init__(self, filename, header=None, tabs=False, encoding="utf-8", logger=None):
    """Open *filename* for unicode-csv reading.

    ``tabs`` selects the excel-tab dialect and a ".tsv" suffix; otherwise
    the excel dialect and ".csv" are used.  ``header`` maps column index
    -> canonical field name; when None it is derived from the first row.
    """
    if logger is None:
        self.logger = global_logger
    else:
        self.logger = logger
    dialect = unicodecsv.excel
    # BUG FIX: the original tested ``self.filename`` before it was ever
    # assigned (AttributeError), and never assigned it at all when the
    # suffix was already present.  Work on the parameter, assign once.
    if tabs:
        dialect = unicodecsv.excel_tab
        if not filename.endswith(".tsv"):
            filename = filename + ".tsv"
    else:
        if not filename.endswith(".csv"):
            filename = filename + ".csv"
    self.filename = filename
    self.__filehandle = open(self.filename, "rb")
    self._reader = unicodecsv.reader(self.__filehandle, encoding=encoding,
                                     dialect=dialect)
    if header is None:
        # derive the index -> canonical-name mapping from the first row
        self.header = {}
        first_row = next(self._reader)  # next(): py2/py3 safe
        for i, f in enumerate(first_row):
            cn = get_canonical_name(f)
            if cn[0] is not None:
                self.header[i] = cn[0]
    else:
        self.header = header
    self.line_length = len(self.header)
def itervoters(self):
    # Yield one dict per voter parsed from the uploaded voter file,
    # preferring the in-memory content over the on-disk path.
    if self.voter_file_content:
        voter_stream = StringIO.StringIO(self.voter_file_content)
    else:
        voter_stream = open(self.voter_file.path, "rU")
    #reader = unicode_csv_reader(voter_stream)
    reader = unicodecsv.reader(voter_stream, encoding='utf-8')
    for voter_fields in reader:
        # bad line
        if len(voter_fields) < 1:
            continue
        # columns: voter_id[, email[, name[, group]]] -- trailing optional
        return_dict = {'voter_id': voter_fields[0]}
        if len(voter_fields) > 1:
            return_dict['email'] = voter_fields[1]
        if len(voter_fields) > 2:
            return_dict['name'] = voter_fields[2]
        if len(voter_fields) > 3:
            return_dict['group'] = voter_fields[3]
        yield return_dict
def itervoters(self):
    """Yield a dict per row of the voter file: always 'voter_id', plus
    'email' and 'name' when those columns are present."""
    # Prefer the in-memory file content; fall back to the path on disk.
    if self.voter_file_content:
        source = StringIO.StringIO(self.voter_file_content)
    else:
        source = open(self.voter_file.path, "rU")
    for fields in unicodecsv.reader(source, encoding='utf-8'):
        if len(fields) < 1:
            continue  # bad/empty line
        voter = {'voter_id': fields[0]}
        if len(fields) > 1:
            voter['email'] = fields[1]
        if len(fields) > 2:
            voter['name'] = fields[2]
        yield voter
def train_stats(f_name, eou='__eou__', eot='__eot__'):
    """Collect utterance/turn/word counts per example from a labelled
    dialogue CSV (column 0 = text, column 2 = label 1/0).

    Returns six lists: (pos_utterances, pos_turns, pos_words,
    neg_utterances, neg_turns, neg_words).
    NOTE(review): Python 2 (print statement); the reader's file handle is
    never closed.
    """
    pos_utterances = []
    pos_turns = []
    pos_words = []
    neg_utterances = []
    neg_turns = []
    neg_words = []
    reader = unicodecsv.reader(open(f_name))
    next(reader)  # skip header
    for line in reader:
        if int(float(line[2])) == 1:
            # positive example: count end-of-utterance/-turn markers
            pos_utterances.append(line[0].count(eou))
            pos_turns.append(line[0].count(eot))
            pos_words.append(len(line[0].split()))
        elif int(float(line[2])) == 0:
            neg_utterances.append(line[0].count(eou))
            neg_turns.append(line[0].count(eot))
            neg_words.append(len(line[0].split()))
        else:
            # unexpected label -- dump it for debugging
            print line[2]
    return pos_utterances, pos_turns, pos_words, neg_utterances, neg_turns, neg_words
def post(self, request, *args, **kwargs):
    """Parse an uploaded CSV file and return the generated mapping rows.

    Expects 'file' plus optional 'delimiter', 'encoding' and a JSON
    'mapping' in the request data.
    """
    file = request.data.get(u'file', None)
    # BUG FIX: str(request.data.get('delimiter', None)) produced the
    # literal string "None" when no delimiter was supplied; fall back to
    # a comma instead.
    delimiter = request.data.get('delimiter', None)
    delimiter = str(delimiter) if delimiter is not None else ','
    encoding = request.data.get(u'encoding', None)
    if request.data.get(u'mapping'):
        mapping = json.loads(request.data.get(u'mapping'))
    else:
        mapping = dict()
    # first row holds the column names (next() builtin: py2/py3 safe)
    headers = next(csv.reader(file, encoding=encoding, delimiter=delimiter))
    reader = csv.DictReader(file, fieldnames=headers, encoding=encoding,
                            delimiter=delimiter)
    result = [v for v in self.generate(reader, mapping)]
    return response.Response(data=result, status=status.HTTP_200_OK)
def download_csv_filter_output(source, dataset_id, columns):
    """
    Download CSV resource from dataset and print the columns
    """
    # NOTE(review): despite the docstring, this populates the outer-scope
    # ``choices`` dict rather than printing the columns.
    ckan = ckanapi.RemoteCKAN(source)
    dataset = ckan.action.package_show(id=dataset_id)
    # assumes the first resource of the dataset is the wanted CSV
    url = dataset['resources'][0]['url']
    response = requests.get(url, stream=True)
    # local name shadows any imported csv module in this scope
    csv = unicodecsv.reader(response.iter_lines(), encoding='utf-8')
    # skip header row
    next(csv)
    for line in csv:
        # pick out just the requested columns ({key: column index})
        out = {}
        for k, col_num in columns.iteritems():
            out[k] = line[col_num]
        # skip blanks
        if not out['id']:
            continue
        if out['id'] in choices:
            sys.stderr.write('duplicate id: %r!\n' % out['id'])
            continue
        choices[out['id']] = {'en': out['en'], 'fr': out['fr']}
def read_twitter_csv(src):
    """
    Reads Twitter dataset in .csv format (tab-separated: tid, text).

    Parameters
    ----------
    src: str - path to csv file.

    Returns
    -------
    dict.
    {tid: "text": ... }
    """
    data = {}
    # BUG FIX: the original opened the file in binary mode, which the csv
    # module rejects on Python 3; text mode with newline="" is the
    # documented way to feed csv.reader.
    with open(src, "r", newline="") as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            if not row:
                continue  # tolerate blank lines
            tid, text = row
            data[tid] = text
    return data
def import_expenditure():
    '''
    Update the expenditure table deleting any data for the city/business_type
    '''
    # NOTE(review): Python 2 code (r.next(), print statement); the file
    # handle is never closed.
    f = open('expenditure.csv')
    r = unicodecsv.reader(f, encoding='utf-8')
    headers = r.next()  # first row supplies the column names
    for row in r:
        row_dict = dict(zip(headers, row))
        # NOTE(review): assert is stripped under ``python -O``; an explicit
        # raise would be safer for input validation.
        assert(row_dict['BUSINESS_TYPE'] in BUSINESS_TYPES)
        # replace any existing rows for this city/business type
        Expenditure.query.filter_by(
            city=row_dict['CITY'],
            type=row_dict['BUSINESS_TYPE'],
        ).delete()
        exp = Expenditure(
            city=row_dict['CITY'],
            type=row_dict['BUSINESS_TYPE'],
            spend=row_dict['SPEND_PER_CAPITA'],
        )
        db.session.add(exp)
    db.session.commit()
    print 'Expenditure imported'
def load_model_item_mapping(item_csv_path=config.MATCHING_MODEL_DRESS_DETAILS_PATH):
    '''
    This is a hack to map from the pre-sqlalchemy IDs to the primary key
    IDs in the database. At startup, we load a mapping between the old and
    new IDs (joined on detail_url) and use it when querying for
    Items/ItemImages.
    '''
    with open(item_csv_path, 'rb') as csv_handle:
        raw_rows = list(csv.reader(csv_handle))
    header_row = raw_rows[0]
    data_rows = raw_rows[1:]
    # Index the legacy CSV records by their detail URL for O(1) lookup.
    legacy_by_url = {}
    for data_row in data_rows:
        record = dict(zip(header_row, data_row))
        legacy_by_url[record['detail_url']] = record
    mapping = {}
    session = models.Session()
    for item in session.query(models.Item).all():
        legacy_record = legacy_by_url.get(item.detail_url)
        if not legacy_record:
            print("NO MAPPING FOUND: {}".format(item.detail_url))
            continue
        # old dress_id -> new primary key id
        mapping[int(legacy_record['dress_id'])] = item.id
    return mapping
def restart_harvest(args):
    """Resume a previously interrupted harvest.

    Counts the rows already written to results.csv to work out the restart
    offset, drops the final row (it may have been written mid-failure), and
    re-launches the harvest with the stored metadata.
    """
    harvest = get_harvest(args)
    data_dir = os.path.join(os.getcwd(), 'data', harvest)
    meta = get_metadata(data_dir)
    if meta:
        results_path = os.path.join(data_dir, 'results.csv')
        try:
            with open(results_path, 'rb') as csv_file:
                rows = list(csv.reader(csv_file, delimiter=',', encoding='utf-8'))
            if len(rows) > 1:
                start = len(rows) - 2
                # Rewrite the file without the last row, just in case there
                # was a problem while it was being written.
                trimmed = rows[:-1]
                with open(results_path, 'wb') as csv_file:
                    writer = csv.writer(csv_file, delimiter=',', encoding='utf-8')
                    writer.writerows(trimmed)
            else:
                start = 0
        except IOError:
            # Nothing's been harvested yet.
            start = 0
        start_harvest(data_dir=data_dir, key=meta['key'], query=meta['query'],
                      pdf=meta['pdf'], text=meta['text'], start=start,
                      max=meta['max'])
def main():
    """Score every row of full_text_mapped.csv for sentiment in parallel and
    write the augmented rows to full_text_mapped_sentiment.csv."""
    with open('full_text_mapped.csv', 'rb') as source:
        records = list(csv.reader(source, delimiter='|', quotechar='@'))
    header, body = records[0], records[1:]
    output_rows = [header + ['polarity', 'subjectivity', 'readability']]
    # Fan the scoring work out over four worker processes.
    pool = multiprocessing.Pool(processes=4)
    output_rows += pool.map(get_sentiment, body)
    with open('full_text_mapped_sentiment.csv', 'wb') as sink:
        csv.writer(sink, delimiter='|', quotechar='@',
                   quoting=csv.QUOTE_ALL).writerows(output_rows)
def readCsvAndCountPercentPerFormItemFromGoogleForms(fileName):
    """Count occurrences of each value in column 1 of a Google Forms CSV
    export and return them as percentages via calculateDictionaryAsPercent().

    Python 2 only (`.next()`; csv takes an encoding kwarg, so presumably
    unicodecsv).
    """
    times = {}
    totalRows = 0
    with open(fileName, 'r') as csvfile:
        csvReader = csv.reader(csvfile, encoding='utf-8')
        csvReader.next()  # skip the first line
        for row in csvReader:
            value = row[1]
            '''
            Because I did some mistakes and fixed later on google forms
            I replace the 6 with 7 and 27 with 28 because I mistoon the dates of the sundays.
            '''
            # NOTE(review): these substring replaces also hit compound
            # values — e.g. "16"->"17", and "26"->"27"->"28" via the second
            # replace. Confirm the data can never contain such values.
            value = value.replace("6", "7").replace("27", "28")
            if (value in times.keys()):
                times[value] += 1
            else:
                times[value] = 1
            totalRows += 1
    return calculateDictionaryAsPercent(times, totalRows)
def _dl_csv(refresh=False):
    """Download and parse the remote projects CSV, caching the result on the
    function object itself.

    :param refresh: force a re-download even if a cached copy exists.
    :return: dict mapping lower-cased project name -> metadata dict.
    Python 2 only (`.next()` on the unicodecsv reader).
    """
    # Cache lives as an attribute on this function; refetch only on demand.
    if refresh or not hasattr(_dl_csv, 'projects'):
        response = requests.get(CSV_URL)
        # Drop non-ASCII characters before parsing.
        response_text = requests.utils.get_unicode_from_response(
            response).encode('ascii', 'ignore')
        csvfile = csv.reader(response_text.splitlines(), encoding='utf-8')
        projects = {}
        # skip header rows
        csvfile.next()
        csvfile.next()
        for row in csvfile:
            metadata = {
                'name': row[0],
                'search_name': row[7] or row[0],  # replacement name
                'country': row[2],
            }
            # Boolean rule flags: a 'y' cell means the rule applies.
            for rulename, col in RULE_COLUMNS.items():
                metadata[rulename] = row[col] == 'y'
            projects[row[0].lower()] = metadata
        _dl_csv.projects = projects
    else:
        print('getting from cache')
    return _dl_csv.projects
def get_table(self, spreadsheet_url):
    """Fetch a published Google Sheet as CSV and return it as a list of rows
    (each row a list of unicode cell values).

    :param spreadsheet_url: a https://docs.google.com/spreadsheets/d/... URL.
    :raises ValidationError: if the URL has no sheet key, the download fails,
        or the sheet comes back empty.
    """
    # Extract the sheet key from the URL.
    # not sure this is doing anything? URLValidator picking this type of
    # issue up already?
    match = re.match(r'^https://docs.google.com/spreadsheets/d/(\S+)/',
                     spreadsheet_url)
    if not match:
        raise ValidationError("Unable to extract key from Google Sheets URL")
    try:
        url = ('https://docs.google.com/spreadsheets/d/%s/export?format=csv'
               % match.group(1))
        response = requests.get(url, timeout=5)
        response.raise_for_status()
    except requests.RequestException as e:
        # Fixed: e.message is deprecated since Python 2.6 and removed in
        # Python 3; str(e) is the portable spelling.
        raise ValidationError("Error talking to Google Sheets: %s" % e)
    rows = csv.reader(io.BytesIO(response.content), encoding='utf-8')
    rows = list(rows)
    if not rows or not rows[0]:
        raise ValidationError("Your sheet did not import successfully; please check that it is 'Published to the web' and shared with 'Anyone with the link'")
    return rows
def start_char_addition(file_name, end_char, start_char):
    """Copy a one-token-per-row CSV, inserting a start_char row at the top and
    after every end_char row, and write [row_count, sentence_count] to a
    companion *_param.csv file.

    :param file_name: path of the source CSV (utf-8).
    :param end_char: token that marks the end of a sentence.
    :param start_char: token to insert before each sentence.
    """
    # Fixed: compute the base name once, merge the three nested with-blocks,
    # and drop the redundant explicit close() calls — the with statement
    # already closes all three files.
    base_name = file_name.split('.')[0]
    with open(file_name, 'rb') as fin, \
            open(base_name + '_startchar.csv', 'w+') as fout, \
            open(base_name + '_startchar_param.csv', 'w+') as paramout:
        reader = csvu.reader(fin, encoding='utf-8')
        writer = csvu.writer(fout, encoding='utf-8')
        writer.writerow([start_char])
        sentences = 0
        words = 0
        for row in reader:
            writer.writerow(row)
            # NOTE(review): this counts every row, including end_char rows,
            # as a "word" — confirm that is intended.
            words += 1
            if row == [end_char]:
                writer.writerow([start_char])
                sentences += 1
        # NOTE(review): plain csv here vs csvu above — presumably deliberate
        # since the params are ASCII numbers; confirm.
        par_writer = csv.writer(paramout)
        par_writer.writerow([words, sentences])
def test_converts_partition_content_to_csv(self):
    """_convert_partition should expose the dataset vid as package_id and
    serialise the partition's rows (header + values) into an in-memory CSV
    under 'upload'."""
    # prepare partition mock: a Partition with a known dataset vid, two
    # columns, and two rows of data.
    fake_partition = MagicMock(spec=Partition)
    fake_partition.dataset.vid = 'ds1vid'
    fake_partition.datafile.headers = ['col1', 'col2']
    # side_effect is a factory so each iteration of the mock yields a fresh
    # iterator over the same two rows.
    fake_iter = lambda: iter([{'col1': '1', 'col2': '1'}, {'col1': '2', 'col2': '2'}])
    fake_partition.__iter__.side_effect = fake_iter
    # run.
    ret = _convert_partition(fake_partition)
    # check converted partition.
    self.assertIn('package_id', ret)
    self.assertEqual(ret['package_id'], 'ds1vid')
    self.assertIn('upload', ret)
    self.assertTrue(isinstance(ret['upload'], six.StringIO))
    # Re-read the generated CSV: header row first, then the data rows.
    rows = []
    reader = unicodecsv.reader(ret['upload'])
    for row in reader:
        rows.append(row)
    self.assertEqual(rows[0], ['col1', 'col2'])
    self.assertEqual(rows[1], ['1', '1'])
    self.assertEqual(rows[2], ['2', '2'])
def process_data_csv(self, crowdsource):
    """Create the crowdsource data from the uploaded CSV.

    Each CSV row must carry a 'url' column (plus arbitrary metadata columns).
    Rows are dispatched by URL type: DocumentCloud document URLs go to the
    datum_per_page task (when doccloud_each_page is set), DocumentCloud
    project URLs to import_doccloud_proj, and any other valid URL becomes a
    plain data record. Invalid URLs are silently skipped.
    """
    url_validator = URLValidator()
    data_csv = self.cleaned_data['data_csv']
    doccloud_each_page = self.cleaned_data['doccloud_each_page']
    if data_csv:
        reader = csv.reader(data_csv)
        # Header row supplies the metadata keys, normalised to lower case.
        headers = [h.lower() for h in next(reader)]
        for line in reader:
            data = dict(zip(headers, line))
            # 'url' is routing information, not metadata — pop it out.
            url = data.pop('url', '')
            doc_match = DOCUMENT_URL_RE.match(url)
            proj_match = PROJECT_URL_RE.match(url)
            if doccloud_each_page and doc_match:
                # One datum per page of the DocumentCloud document.
                datum_per_page.delay(
                    crowdsource.pk,
                    doc_match.group('doc_id'),
                    data,
                )
            elif proj_match:
                # Import every document in the DocumentCloud project.
                import_doccloud_proj.delay(
                    crowdsource.pk,
                    proj_match.group('proj_id'),
                    data,
                    doccloud_each_page,
                )
            elif url:
                # skip invalid URLs
                try:
                    url_validator(url)
                except forms.ValidationError:
                    pass
                else:
                    crowdsource.data.create(
                        url=url,
                        metadata=data,
                    )
def create(**kwargs):
    """Import bank-statement rows from a CSV file.

    Keyword args:
        file: path of the CSV file to read.
        user: owner of the imported records.
        dialect: optional csv dialect; defaults to excel with ';' delimiter.
        template: 'bradescopj' or 'itaupf' — selects the persistence routine.
        account: account the rows belong to.
    Returns a list of dicts, one per row, keyed 'field1', 'field2', ...
    """
    csv_file = kwargs.get('file')
    user = kwargs.get('user')
    dialect = kwargs.get('dialect', None)
    result = []
    # Fixed: open before the try so `f` is always bound when finally runs
    # (the original could raise NameError in finally if the open failed).
    f = codecs.open(csv_file, 'rU')
    try:
        if not dialect:
            dialect = csv.excel()
            dialect.delimiter = ';'
            dialect.skipinitialspace = True
        csv_content = csv.reader(format_lines(f), dialect)
        template = kwargs.get('template')
        account = kwargs.get('account')
        # Fixed: iterate the reader directly instead of the manual
        # while/.next()/StopIteration loop (also Python 3 compatible).
        for row in csv_content:
            data = {}
            for i, item in enumerate(row, start=1):
                data['field%s' % i] = item.strip()
            if template == 'bradescopj':
                _persist_bradescopj(data, account, user)
            elif template == 'itaupf':
                _persist_itaupf(data, account, user)
            result.append(data)
    finally:
        f.close()
    return result
def run_import(self):
    """Download the geonames zip-code dump for the wizard's country, create
    res.better.zip records from it, and delete stale records that no longer
    appear in the dump."""
    zip_model = self.env['res.better.zip']
    country_code = self.country_id.code
    # Download URL is configurable; %s is replaced by the ISO country code.
    config_url = self.env['ir.config_parameter'].get_param(
        'geonames.url',
        default='http://download.geonames.org/export/zip/%s.zip')
    url = config_url % country_code
    logger.info('Starting to download %s' % url)
    res_request = requests.get(url)
    if res_request.status_code != requests.codes.ok:
        raise Warning(
            _('Got an error %d when trying to download the file %s.')
            % (res_request.status_code, url))
    # Store current record list
    zips_to_delete = zip_model.search(
        [('country_id', '=', self.country_id.id)])
    f_geonames = zipfile.ZipFile(StringIO.StringIO(res_request.content))
    tempdir = tempfile.mkdtemp(prefix='openerp')
    f_geonames.extract('%s.txt' % country_code, tempdir)
    logger.info('The geonames zipfile has been decompressed')
    data_file = open(os.path.join(tempdir, '%s.txt' % country_code), 'r')
    data_file.seek(0)
    logger.info('Starting to create the better zip entries')
    # NOTE(review): the delimiter below appears as whitespace in this copy;
    # geonames dumps are tab-separated, so confirm it is a literal tab.
    # NOTE(review): `zip` shadows the builtin; harmless here but confusing.
    for row in unicodecsv.reader(
            data_file, encoding='utf-8', delimiter='	'):
        zip = self.create_better_zip(row, self.country_id)
        # Any record re-created from the dump is still valid: keep it.
        if zip in zips_to_delete:
            zips_to_delete -= zip
    data_file.close()
    if zips_to_delete:
        # Whatever remains was not in the dump — remove it.
        zips_to_delete.unlink()
        logger.info('%d better zip entries deleted for country %s'
                    % (len(zips_to_delete), self.country_id.name))
    logger.info(
        'The wizard to create better zip entries from geonames '
        'has been successfully completed.')
    return True
def test_csv_export_extra_data(self):
    """Ensures exported CSV data matches source data.

    Exports the snapshot queryset (including a canonical-building field and
    an extra_data-only field) and verifies every cell of the CSV against the
    value resolved from the model. Python 2 only (`.next()`, `unicode`).
    """
    qs_filter = {"pk__in": [x.pk for x in self.snapshots]}
    qs = BuildingSnapshot.objects.filter(**qs_filter)
    fields = list(_get_fields_from_queryset(qs))
    fields.append("canonical_building__id")
    fields.append('my new field')
    export_filename = export_csv(qs, fields)
    export_file = open(export_filename)
    reader = csv.reader(export_file)
    header = reader.next()
    # The extra_data field must appear as the last header column.
    self.assertEqual(header[len(fields) - 1], 'my new field')
    for i in range(len(self.snapshots)):
        row = reader.next()
        for j in range(len(fields)):
            field = fields[j]
            # "a__b" means follow attribute a, then b (Django-style).
            components = field.split("__")
            qs_val = qs[i]
            for component in components:
                try:
                    qs_val = getattr(qs_val, component)
                except AttributeError:
                    # Not a model attribute: fall back to extra_data.
                    qs_val = qs_val.extra_data.get(component)
                    if qs_val == None:
                        break
            # Related managers and missing values export as empty strings.
            if isinstance(qs_val, Manager) or qs_val == None:
                qs_val = u''
            else:
                qs_val = unicode(qs_val)
            csv_val = row[j]
            self.assertEqual(qs_val, csv_val)
    export_file.close()
    os.remove(export_filename)
def simplify_sensor_log(sensor_log, readable=True):
    """
    Translate the given sensor log into a symbol sequence, so that sequence
    classification techniques can be applied.

    For readability we allow only as many distinct symbols as SYMBOLS
    provides (enough for the scope of this project); in that case a mapping
    between sensor ids and symbols is computed automatically and written to
    a companion *_simplified_dict.txt file.

    :type sensor_log: file
    :param sensor_log: the tab-separated file containing the sensor log.
    :param readable: whether to map sensor ids to one-letter symbols.
    """
    file_basename = os.path.splitext(sensor_log.name)[0]
    dest = file_basename + '_simplified.txt'
    dest_dict = file_basename + '_simplified_dict.txt'
    src_reader = csv.reader(sensor_log, delimiter=LOG_ENTRY_DELIMITER)
    sensor_id_dict = {}
    with open(dest, 'w') as simplified_log:
        # Fixed: iterate the reader directly instead of the manual
        # next(..., None) while-loop — identical traversal, clearer code.
        for entry in src_reader:
            sensor_id = entry[SENSOR_ID_POS]
            if readable:
                try:
                    translation = sensor_id_dict[sensor_id]
                except KeyError:
                    # First time we see this sensor: assign the next symbol.
                    translation = SYMBOLS[len(sensor_id_dict)]
                    sensor_id_dict[sensor_id] = translation
            else:
                translation = sensor_id
            simplified_log.write(translation + '\n')
    with open(dest_dict, 'w') as simplified_log_dict:
        for k, v in sensor_id_dict.items():
            simplified_log_dict.write('%s \t\t %s\n' % (v, k))
def format_feature_tests(csv_filepath):
    """Read csv data, write formatted feature tests.

    The CSV groups example rows under filename rows (a non-empty first cell
    starts a new section). Each section is written to the output feature
    file as an examples table with fitted column widths. Python 2 only
    (`.next()`, print statements, StandardError).
    """
    csvfile = open(csv_filepath)
    reader = csv.reader(csvfile)
    feature_filepath = csv_filepath.replace('.csv', '')
    feature_file = open(feature_filepath, 'w')
    feature_file.write(feature_file_header)
    reader.next()  # discard header
    done = False
    col_widths = []
    data = []
    # Prime the loop with the first section's filename row.
    row = reader.next()
    try:
        while not done:
            filename = row[0]
            feature_file.write(examples_prefix + filename + '\n')
            # Collect this section's example rows (empty first cell).
            rows = []
            for row in reader:
                if row[0]:  # next filename
                    break
                rows.append(row)
            else:  # no break, end of file
                done = True
            col_widths = fit_column_widths(rows)
            # Columns to the left of the fitted ones are skipped.
            first_col = len(rows[0]) - len(col_widths)
            data = fieldnames[first_col:]
            # section heading
            feature_file.write(expand(data, col_widths))
            for row_ in rows:
                data = row_[first_col:]
                line = expand(data, col_widths)
                feature_file.write(line.encode('utf8'))
            feature_file.write('\n')
    except StandardError as exc:
        # Report which section/data we were formatting before re-raising.
        print '*** Error formatting data: {}'.format(data)
        print '    using widths: {}'.format(col_widths)
        print '    in test for {}'.format(filename)
        raise
def get_csv(infile):
    """Sniff the dialect and header of an open CSV file.

    Returns a tuple (reader, colnames): a csv reader positioned after the
    header (or rewound to the start when there is no header) and a list of
    column names — sniffed from the first row, or synthesised as COLUMN1,
    COLUMN2, ... when absent/blank.
    """
    SNIFF_BYTES = 4096
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(infile.read(SNIFF_BYTES), delimiters=DELIMITERS)
    infile.seek(0)
    # Sniff for a header row over the same sample window.
    has_header = sniffer.has_header(infile.read(SNIFF_BYTES))
    infile.seek(0)
    reader = csv.reader(infile, dialect)
    first_row = next(reader)
    colnames = [
        cell if (len(cell) > 0 and has_header) else 'COLUMN{}'.format(idx + 1)
        for idx, cell in enumerate(first_row)
    ]
    if not has_header:
        # No header: the first row is data, so rewind to re-read it.
        infile.seek(0)
    return (reader, colnames)
def translate_dialog_to_lists(dialog_filename):
    """
    Translates the dialog to a list of lists of utterances. In the first
    list each item holds subsequent utterances from the same user. The
    second level list holds the individual utterances.

    The file is a tab-separated log where column 1 is the user and column 3
    is the utterance; a final single-item list containing dialog_end_symbol
    is appended.

    :param dialog_filename: path of the tab-separated dialog file.
    :return: list of lists of utterances.
    """
    dialog = []
    same_user_utterances = []
    dialog.append(same_user_utterances)
    first_turn = True
    # Fixed: use a with-statement — the original leaked the file handle.
    with open(dialog_filename, 'r') as dialog_file:
        dialog_reader = unicodecsv.reader(dialog_file, delimiter='\t',
                                          quoting=csv.QUOTE_NONE)
        # go through the dialog
        for dialog_line in dialog_reader:
            if first_turn:
                last_user = dialog_line[1]
                first_turn = False
            if last_user != dialog_line[1]:
                # user has changed: start a new run of utterances
                same_user_utterances = []
                dialog.append(same_user_utterances)
            same_user_utterances.append(dialog_line[3])
            last_user = dialog_line[1]
    dialog.append([dialog_end_symbol])
    return dialog
def _build_tcm(self, sensor_log, sensor_id_pos):
    """
    Build the topological compatibility matrix associated with the given
    sensor log.

    Slides a two-event window over the log, counting per-sensor occurrences
    and direct successions (s0 -> s1) into self.prob_matrix, then normalises
    each row by the predecessor's total occurrences.

    :type sensor_log: file
    :param sensor_log: the tab-separated file containing the sensor log.
    :param sensor_id_pos: the position of the sensor id in the log entry.
    NOTE(review): an empty log makes s0 None and the subscription below
    raises TypeError — confirm callers guarantee a non-empty log.
    NOTE(review): the final `/=` is true division only under Python 3 or
    `from __future__ import division`, or if the matrix holds floats —
    confirm, otherwise counts floor-divide to 0/1.
    """
    sensor_log_reader = csv.reader(sensor_log, delimiter=LOG_ENTRY_DELIMITER)
    s0 = next(sensor_log_reader, None)
    # consider a sliding window of two events per step
    s1 = next(sensor_log_reader, None)
    # The very first event is counted here; the loop counts only s1's.
    self.sensors_occurrences[s0[sensor_id_pos]] = 1
    while s0 is not None and s1 is not None:
        s0_id = s0[sensor_id_pos]
        s1_id = s1[sensor_id_pos]
        # increase sensor occurrences
        try:
            self.sensors_occurrences[s1_id] += 1
        except KeyError:
            self.sensors_occurrences[s1_id] = 1
        # add sensors ids to matrix and update succession counter
        self._add_sensor(s0_id)
        self._add_sensor(s1_id)
        self.prob_matrix[s0_id][s1_id] += 1
        # prepare next step (slide the window by one position)
        s0 = s1
        s1 = next(sensor_log_reader, None)
    for s_row in self.prob_matrix:
        for s_col in self.prob_matrix[s_row]:
            if self.prob_matrix[s_row][s_col] != 0:
                # normalize cell value with respect to predecessor total occurrences
                self.prob_matrix[s_row][s_col] /= self.sensors_occurrences[s_row]
def test_writerow():
    """Round-trip a schema row through iform_json.writerow and verify the
    column order/stringification in the produced CSV. Python 2 only
    (`.next()`; csv takes encoding kwargs, so presumably unicodecsv)."""
    import os
    # Build a representative row covering strings, booleans and an int.
    row = {}
    row['schema_name'] = u'test_schema_name'
    row['schema_title'] = u'test_schema_title'
    row['publish_date'] = u'2015-01-01'
    row['variable'] = u'test_var'
    row['title'] = u'test_title'
    row['description'] = u'test_desc'
    row['is_required'] = False
    row['is_system'] = False
    row['is_collection'] = False
    row['is_private'] = False
    row['field_type'] = False
    row['choices_string'] = u'0=test1;1=test2'
    row['order'] = 7
    output_csv = open('output.csv', 'w')
    writer = csv.writer(output_csv, encoding='utf-8')
    iform_json.writerow(writer, row)
    output_csv.close()
    # Re-read what was written and check selected columns by position.
    output_csv = open('output.csv', 'rb')
    reader = csv.reader(output_csv, encoding='utf-8')
    test_row = reader.next()
    assert test_row[0] == u'test_schema_name'
    assert test_row[3] == u'test_var'
    # Non-string values come back as their string representations.
    assert test_row[7] == u'False'
    assert test_row[12] == u'7'
    output_csv.close()
    os.remove('output.csv')
def import_food_items_constraints_csv_as_dict(in_filepath=None, schema=None):
    """
    Import food-item constraints from CSV and transform to dict objects.

    Args:
        in_filepath: Input filepath containing the constraints CSV
            (defaults to 'data/food_items_constraints.csv').
        schema: Metadata describing the input CSV layout; must provide
            schema['fields'][name]['index'] for 'nbd_no' and 'max_qty',
            and may set schema['hasHeader'] to skip a header row.

    Returns:
        Dict mapping nbd_no -> {'max_qty': float}.
    """
    schema = schema or FOOD_ITEMS_CONSTRAINTS_CSV_SCHEMA
    in_filepath = in_filepath or 'data/food_items_constraints.csv'
    result = {}
    # Fixed: with-statement closes the file (the original leaked the handle).
    with open(in_filepath, "r") as fp:
        csv_reader = csv.reader(fp)
        # schema.get covers both "key absent" and "key falsy".
        if schema.get('hasHeader'):
            next(csv_reader)  # skip header row
        for data in csv_reader:
            result[data[schema['fields']['nbd_no']['index']]] = {
                'max_qty': float(data[schema['fields']['max_qty']['index']])
            }
    return result
def load_csv_dataset(filename):
    """
    Loads a csv filename as a dataset.

    Rows are ';'-separated: id;text;gene;variation[;class]. A missing or
    non-numeric class column yields real_class=None (unlabeled sample).

    :param str filename: name of the file (relative to DIR_GENERATED_DATA)
    :return List[DataSample]: a list of DataSample
    """
    dataset = []
    # Renamed locals that shadowed the builtins `file` and `id`.
    with open(os.path.join(DIR_GENERATED_DATA, filename), 'rb') as csv_file:
        reader = csv.reader(csv_file, delimiter=';', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL, errors='ignore')
        for row in reader:
            sample_id = int(row[0])
            text = row[1]
            gene = row[2]
            variation = row[3]
            # Fixed: narrowed the bare except to the errors int(row[4]) can
            # actually raise (short row, bad number, None cell).
            try:
                real_class = int(row[4])
            except (IndexError, ValueError, TypeError):
                real_class = None
            dataset.append(DataSample(sample_id, text, gene, variation,
                                      real_class))
    return dataset
def test_sponsored_grants_csv(self):
    """ Verify that sponsored grant fields can be fetched in csv format
    Setup:
        No filters selected
        All fields selected
        Format = browse
    Asserts:
        Basic success: able to iterate through response with reader
        Number of rows in results matches number of awards (gp + sponsored) in db
    """
    form = SponsoredAwardReportForm()
    post_dict = fill_report_form(form, select_fields=True, fmt='csv')
    # Name the submit button so the view runs the sponsored-award report.
    post_dict['run-sponsored-award'] = ''
    response = self.client.post(self.url, post_dict)
    # The streaming response body itself is iterated as CSV lines.
    reader = unicodecsv.reader(response, encoding='utf8')
    row_count = sum(1 for row in reader)
    # Two non-data rows (presumably header/title) are excluded from the
    # count — confirm against the report template.
    self.assertEqual(row_count - 2,
                     models.SponsoredProgramGrant.objects.count())
def main():
    """Enrich a CSV of package ids with titles from a remote CKAN portal.

    Reads connection settings from args.configfile, copies the input CSV's
    header, then for each row replaces columns 1 and 2 with the package's
    translated title and publication-time org title in args.lang. Rows whose
    package is not found on the portal are silently dropped. Python 2 only
    (`.next()`, `except NotFound, e` syntax).
    """
    ini_config = ConfigParser()
    ini_config.read(args.configfile)
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    remote_apikey = ini_config.get('ckan', 'ckan.apikey')
    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(remote_ckan_url, apikey=remote_apikey)
    fi = open(args.csvfile, 'r')
    fo = open(args.outfile, 'w')
    csv_in = unicodecsv.reader(fi, encoding='utf-8')
    csv_out = unicodecsv.writer(fo, encoding='utf-8')
    # Copy the header row straight through.
    csv_out.writerow(csv_in.next())
    for row in csv_in:
        # Look up the package in CKAN
        try:
            pkg = ckan_portal.action.package_show(id=row[0])
            # If the record does not exist, then a NotFound exception will be thrown
            row[2] = pkg['org_title_at_publication'][args.lang]
            row[1] = pkg['title_translated'][args.lang]
            csv_out.writerow(row)
        except NotFound, e:
            pass
def _find_segments_old(self, sensor_log):
    """
    Find segments in the given sensor log (old version).

    Slides a two-event window over the log; consecutive events whose direct
    succession probability (from self.top_compat_matrix) is at or above
    self.compat_threshold are grouped into one segment. A completed segment
    is kept only if it has at least self.noise_threshold entries.

    :type sensor_log: file
    :param sensor_log: the tab-separated file containing the sensor log.
    NOTE(review): the segment still open when the log ends is never appended
    to self.segments — the trailing segment is silently dropped. Possibly
    intentional in this legacy version; confirm before reuse.
    """
    sensor_log_reader = csv.reader(sensor_log, delimiter=LOG_ENTRY_DELIMITER)
    s0 = next(sensor_log_reader, None)
    # consider a sliding window of two events per step
    s1 = next(sensor_log_reader, None)
    segment = [list(s0)]
    while s0 is not None and s1 is not None:
        s0_id = s0[self.sensor_id_pos]
        s1_id = s1[self.sensor_id_pos]
        if self.top_compat_matrix.prob_matrix[s0_id][
                s1_id] >= self.compat_threshold:
            # the direct succession value is above the threshold
            segment.append(list(s1))  # continue the segment
        else:
            # the direct succession value is under the threshold
            if len(
                    segment
            ) >= self.noise_threshold:  # only segments longer than a threshold are considered
                self.segments.append(
                    list(segment))  # store a copy of the segment so far
            segment = [
                list(s1)
            ]  # start the new segment from the second item in the window
        # prepare next step (slide the window by one position)
        s0 = s1
        s1 = next(sensor_log_reader, None)
def parse_prev_consumption(_filename, _path):
    """Sum consumption per meter from a CSV export.

    Column 0 is the meter id, column 2 the consumption reading (blank cells
    count as 0). Returns a dict mapping meter id -> total consumption.
    """
    meter_totals = {}
    with open(path.join(_path, _filename), 'r') as source:
        rows = csv.reader(source, encoding="utf-8")
        # Discard the header row before accumulating.
        next(rows, None)
        for record in rows:
            meter_id = record[0]
            if not meter_id:
                continue
            raw_value = record[2]
            amount = float(raw_value) if raw_value else 0.0
            if meter_id in meter_totals:
                meter_totals[meter_id] += amount
            else:
                meter_totals[meter_id] = amount
    return meter_totals