def __init__(self, refGene_fn):
    refGene_f = open(refGene_fn)
    # check for header
    first_line = refGene_f.next()
    if not first_line.strip().startswith('#'):
        refGene_f.seek(0)  # first line not header, reset the file pointer
    DictReader.__init__(self, refGene_f, delimiter='\t',
                        fieldnames=RefGeneOutput.FIELD_NAMES)
def action(args):
    def newname(leaf, newname):
        leaf.name = newname
        return leaf

    tree = Phylo.parse(args.tree, args.tree_type).next()
    leafs = (leaf for leaf in tree.get_terminals())

    if args.info:
        info = DictReader(args.info, fieldnames=['seqname', 'newname'])
        info = {i['seqname']: i['newname'] for i in info}
        # for newick trees :s will be replaced by |s
        if args.tree_type == 'newick':
            info = {s.replace(':', '|'): n for s, n in info.items()}
        leafs = (l for l in leafs if l.name in info)
        leafs = (newname(l, info[l.name]) for l in leafs)

    if args.remove_word:
        leafs = (newname(l, re.sub(args.remove_word, '', l.name)) for l in leafs)

    leafs = (newname(l, l.name.strip()) for l in leafs)
    leafs = (newname(l, args.add_prefix + l.name) for l in leafs)
    leafs = (newname(l, l.name + args.add_suffix) for l in leafs)

    # do this last
    if args.tree_type == 'newick':
        leafs = (newname(l, l.name.replace(' ', '_')) for l in leafs)

    # execute changes and write tree
    list(leafs)
    Phylo.write(tree, args.out, args.tree_type)
def test_subset_with_shapefile_no_ugid(self):
    """Test a subset operation using a shapefile without a UGID attribute."""
    output_format = [constants.OUTPUT_FORMAT_NUMPY, constants.OUTPUT_FORMAT_CSV_SHAPEFILE]
    geom = self.get_shapefile_path_with_no_ugid()
    geom_select_uid = [8, 11]
    geom_uid = 'ID'
    rd = self.test_data.get_rd('cancm4_tas')
    for of in output_format:
        ops = OcgOperations(dataset=rd, geom=geom, geom_select_uid=geom_select_uid,
                            geom_uid=geom_uid, snippet=True, output_format=of)
        self.assertEqual(len(ops.geom), 2)
        ret = ops.execute()
        if of == constants.OUTPUT_FORMAT_NUMPY:
            for element in geom_select_uid:
                self.assertIn(element, ret)
            self.assertEqual(ret.properties[8].dtype.names,
                             ('STATE_FIPS', 'ID', 'STATE_NAME', 'STATE_ABBR'))
        else:
            with open(ret) as f:
                reader = DictReader(f)
                row = reader.next()
                self.assertIn(geom_uid, row.keys())
                self.assertNotIn(env.DEFAULT_GEOM_UID, row.keys())
            shp_path = os.path.split(ret)[0]
            shp_path = os.path.join(shp_path, 'shp', '{0}_gid.shp'.format(ops.prefix))
            with fiona.open(shp_path) as source:
                record = source.next()
                self.assertIn(geom_uid, record['properties'])
                self.assertNotIn(env.DEFAULT_GEOM_UID, record['properties'])
def load_data(uri, dateFormat):
    logging.info('loading data; uri: {0}'.format(uri))
    from urllib2 import urlopen
    from csv import DictReader

    reader = DictReader(urlopen(uri).readlines())
    encodedFieldNames = []
    for fieldname in reader.fieldnames:
        encodedFieldNames.append(fieldname.decode("utf-8-sig").encode("utf-8"))
    reader.fieldnames = encodedFieldNames

    data = []
    from time import strptime
    for row in reader:
        data.append({
            'date': strptime(row['Date'], dateFormat),
            'open': float(row['Open']),
            'close': float(row['Close']),
            'high': float(row['High']),
            'low': float(row['Low']),
            'volume': float(row['Volume'])
        })
    return data
def __init__(self, fid, commentchar='#', *args, **kwds):
    if issubclass(DictReader, object):
        super(DictReader, self).__init__(fid, *args, **kwds)
    else:
        DictReader.__init__(self, fid, *args, **kwds)
    self.commentchar = commentchar
    self.leadingfield = self.commentchar + 'label'
def __init__(self, f, fieldnames=None, restkey=None, restval=None, dialect="excel", *args, **kw): DictReader.__init__(self, f, fieldnames=fieldnames, restkey=restkey, restval=restval, dialect=dialect, *args, **kw) # Replace the reader with our unicode-enabled reader. self.reader = UnicodeReader(f, dialect=dialect, *args, **kw)
def upload_resources(filename, skip=0, limit=None): """Upload from a CSV file.""" # Use sys.stdout.write so resources can be printed nicely and succinctly import sys date_converter = lambda s: datetime.strptime(s, '%Y-%m-%d') bool_converter = lambda s: s == "true" resource_schema = facility_schema['fields'] convert_map = { 'integer': int, 'float': float, 'datetime': date_converter, 'boolean': bool_converter } convert = {} for k, v in resource_schema.items(): field_type = v.get('type') if convert_map.has_key(field_type): convert[k] = convert_map[field_type] def print_flush(msg): sys.stdout.write(msg) sys.stdout.flush() facility_code = facility_schema['facility_code'] print_every = 1000 print_flush("Adding resources. Please be patient.") with open(filename) as f: reader = DictReader(f) for i in range(skip): reader.next() for i, d in enumerate(reader): actual_index = i + skip + 2 do_print = actual_index % print_every == 0 try: d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v) coords = [d.pop('longitude', None), d.pop('latitude', None)] if coords[0] and coords[1]: d['location'] = {'type': 'Point', 'coordinates': coords} d['facility_code'] = facility_code if not check(add_document(facility_schema['endpoint'], d), 201, False): raise Exception() if do_print: print_flush(".") except Exception as e: print "Error adding resource", e pprint(d) exit() if limit and i >= limit: break # Create a 2dsphere index on the location field for geospatial queries app.data.driver.db['resources'].create_index([('location', '2dsphere')]) print "Resources uploaded!"
def upload_waterpoints(filename, skip=0, limit=None): """Upload waterpoints from a CSV file.""" date_converter = lambda s: datetime.strptime(s, '%Y-%m-%d') bool_converter = lambda s: s == "true" status_map = { "non functional": "not functional", "functional needs repair": "needs repair" } status_converter = lambda s: status_map.get(s.lower(), s.lower()) convert = { 'gid': int, 'object_id': int, 'valid_from': date_converter, 'valid_to': date_converter, 'amount_tsh': float, 'breakdown_year': int, 'date_recorded': date_converter, 'gps_height': float, 'latitude': float, 'longitude': float, 'num_private': int, 'region_code': int, 'district_code': int, 'population': int, 'public_meeting': bool_converter, 'construction_year': int, 'status_group': status_converter } facility_code = "wpf001" with open(filename) as f: reader = DictReader(f) for i in range(skip): reader.next() for i, d in enumerate(reader): print "Adding line", i + skip + 2 try: d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v) coords = [d.pop('longitude'), d.pop('latitude')] d['location'] = {'type': 'Point', 'coordinates': coords} d['facility_code'] = facility_code if not check(add_document('waterpoints', d)): raise Exception() except Exception as e: print "Error adding waterpoint", e pprint(d) exit() if limit and i >= limit: break # Create a 2dsphere index on the location field for geospatial queries app.data.driver.db['facilities'].create_index([('location', '2dsphere')])
def __init__(self, f, fieldnames=None, restkey=None, restval=None, dialect="excel", encoding='utf-8', *args, **kw): DictReader.__init__(self, f, fieldnames=fieldnames, restkey=restkey, restval=restval, dialect=dialect, *args, **kw) if not encoding is None: f = Utf8Recoder(f, encoding=encoding) # Replace the reader with our unicode-enabled reader. self.reader = reader(f, dialect=dialect, *args, **kw)
def extractThresholdValues(fname): # parse csv file and add threshold values as dict # this method might be called multiple times for one item # There are various formats: # combined.modelEvaluation: Threshold Name, Testing.data, Cutoff, # Sensitivity, Specificity # biomod2.modelEvaluation: Threshold Name, Testing.data, Cutoff.*, # Sensitivity.*, Specificity.* # maxentResults.csv: Species,<various columns with interesting values> # <threshold name><space><cumulative threshold, # logistic threshold,area,training omission> # FIXME: this is really ugly and csv format detection should be done # differently thresholds = {} if fname.endswith("maxentResults.csv"): csvfile = open(fname, "r") dictreader = DictReader(csvfile) row = dictreader.next() # There is only one row in maxentResults namelist = ( "Fixed cumulative value 1", "Fixed cumulative value 5", "Fixed cumulative value 10", "Minimum training presence", "10 percentile training presence", "10 percentile training presence", "Equal training sensitivity and specificity", "Maximum training sensitivity plus specificity", "Balance training omission, predicted area and threshold value", "Equate entropy of thresholded and original distributions", ) for name in namelist: # We extract only 'cumulative threshold'' values threshold = "{} cumulative threshold".format(name) thresholds[threshold] = Decimal(row[threshold]) else: # assume it's one of our biomod/dismo results csvfile = open(fname, "r") dictreader = DictReader(csvfile) # search the field with Cutoff name = "Cutoff" for fieldname in dictreader.fieldnames: if fieldname.startswith("Cutoff."): name = fieldname break try: for row in dictreader: try: thresholds[row[""]] = Decimal(row[name]) except (TypeError, InvalidOperation) as e: LOG.warn( "Couldn't parse threshold value '%s' (%s) from" "file '%s': %s", name, row[name], fname, repr(e) ) except KeyError: LOG.warn("Couldn't extract Threshold '%s' from file '%s'", name, fname) return thresholds
def locations(rack_locations_path=RACKS_LOCATION_CSV):
    with open(rack_locations_path, 'r') as file:
        csv_file = DictReader(file, ["latitude", "longitude", "icon", "desc",
                                     "racks_count", "parking_places"])
        acc = []
        next(csv_file)  # Skip the header row, since fieldnames are given explicitly
        for attributes in csv_file:
            acc.append(RacksLocation(attributes))

        return acc
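# Why the explicit skip above is needed, shown as a small self-contained sketch
# (the sample data here is made up): when fieldnames= is passed, DictReader does
# NOT treat the first line as a header, so the header line comes back as a data row.
from csv import DictReader
from io import StringIO

sample = StringIO("latitude,longitude\n59.33,18.06\n")
reader = DictReader(sample, fieldnames=["latitude", "longitude"])
print(next(reader))   # {'latitude': 'latitude', 'longitude': 'longitude'} -- the header row
print(next(reader))   # {'latitude': '59.33', 'longitude': '18.06'}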
def __init__(self, f, fieldnames=None, restkey=None, restval=None, dialect="excel", encoding=None, *args, **kwds): BaseDictReader.__init__(self, f=f, fieldnames=fieldnames, restkey=restkey, restval=restval, dialect=dialect, *args, **kwds) from .csv import reader self.reader = reader(f, dialect=dialect, encoding=encoding, **kwds)
def __init__(self, csv):
    self.bag = Counter()
    reader = DictReader(open(csv, 'r'), fieldnames=[
        "TileFile", "Borders", "Quantity", "Features", "Notes"])
    reader.next()  # skip header, we've defined our own
    for tile_dict in reader:
        tile = Tile.from_csv(tile_dict)
        quantity = int(tile_dict["Quantity"].strip())
        self.bag[tile] = quantity
        if "B" in tile_dict["Features"]:
            self.first_tile = tile
def number1():
    filename = '/home/apt9online/src/bslcks/jtest.csv'
    cong = DictReader(open(filename))

    while True:
        p = cong.next()
        print cong.line_num
        if p['Include on directory'] == 'Yes':
            if p['Family relation'] != 'Duplicate':
                try:
                    Person.objects.get(bslc_individual=p['Indiv #'])
                    print "%s %s already exists in the DB" % (p['First name'], p['Last name'])
                except:
                    record_person(p)
def csvInput(file, options, dialect='excel'):
    header = options['header']
    from csv import DictReader
    with open(file, 'r') as f:
        if not header:
            reader = DictReader(f, dialect=dialect)
        else:
            reader = DictReader(f, dialect=dialect, fieldnames=header.split(','))
        reader.fieldnames = [options['alias'](name) for name in reader.fieldnames]
        entries = [line for line in reader]
    for entry in entries:
        entry.update({"file": file, "format": fileType(file)})
    return entries
def test_write_csv(self):
    """TestBase: Base::write_csv() creates a valid csv"""
    from csv import DictReader

    fname = "thermal.csv"
    trappy.Run().thermal.write_csv(fname)

    with open(fname) as fin:
        csv_reader = DictReader(fin)

        self.assertTrue("Time" in csv_reader.fieldnames)
        self.assertTrue("temp" in csv_reader.fieldnames)

        first_data = csv_reader.next()
        self.assertEquals(first_data["Time"], "0.0")
        self.assertEquals(first_data["temp"], "68786")
def __init__(self, csvfile, fields=None, silent=False, **kwargs):
    self.csvfile = csvfile
    self.rows_imported = 0
    self.errors = []
    self.silent = silent
    if fields:
        if isinstance(fields[0], (list, tuple)):
            kwargs['fieldnames'] = [field[0] for field in fields]
            self.field_types = dict(fields)
        else:
            kwargs['fieldnames'] = fields
            self.field_types = dict.fromkeys(fields, None)
        DictReader.__init__(self, csvfile, **kwargs)
    else:
        DictReader.__init__(self, csvfile, **kwargs)
        self.field_types = dict.fromkeys(self.fieldnames, None)
def graceful_read_csv(filename):
    from csv import DictReader

    data = []
    try:
        f = open(filename, 'rb')
    except IOError as e:
        print("ERROR:", e.strerror)
        exit()

    csvreader = DictReader(f)
    while True:
        try:
            row = csvreader.next()
        except:
            break
        data.append(row)

    return data
def __init__(self, filename, container=None, dialect='simplecsv'):
    self._container = None
    if isinstance(container, ObjectContainer):
        self._container = container
        self._reader = DictReader(filename, fieldnames=None, restkey="restkey",
                                  restval="restval", dialect=dialect)
    elif isinstance(container, TupleContainer) or isinstance(container, ListContainer):
        self._container = container
        self._reader = csv.reader(filename, dialect=dialect)
    else:
        raise Exception("Given container is not valid")
class CSVUnicodeReader(object):
    def __init__(self, stream):
        self.reader = DictReader(UTF8Encoder(stream))

    def __iter__(self):
        return self

    def next(self):
        entry = self.reader.next()
        return dict([(unicode(k, "utf-8"), unicode(v, "utf-8"))
                     for (k, v) in entry.items()])
def test_append_field_err(form_config, form_data, log_path):
    """ Checks that error logs are correctly written and appended

    Submits three forms, the second two have different fields to the first
    and should be added to the same log file as each other, and be identical
    """
    formmail.log_formdata(form_data, log_path)
    del form_data['email']

    # submit two forms with fields that dont match the config
    # this should append the second form to the error log file
    with pytest.raises(Exception):
        formmail.log_formdata(form_data, log_path)
    with pytest.raises(Exception):
        formmail.log_formdata(form_data, log_path)

    with open(log_path + '_error') as error_log:
        reader = DictReader(error_log)
        assert reader.next() == form_data
        assert reader.next() == form_data
def clean_csv(self):
    """Checks whether a valid CSV file was uploaded."""
    # first test: is the content type valid?
    csv = self.cleaned_data['csv']
    if csv.content_type != 'text/csv':
        self._errors['csv'] = self.error_class(['Nur CSV-Dateien sind als Eingabe erlaubt!'])
        return csv

    # second test: does the file have the right number of columns?
    reader = DictReader(csv)
    try:
        entry = reader.next()
        if len(entry) != 12:
            msg = 'Ungültiges Format der CSV-Datei (falsche Anzahl Spalten)!'
            self._errors['csv'] = self.error_class([msg])
    except StopIteration:
        msg = 'Ungültiges Format der CSV-Datei (keine Bestellungen vorhanden)!'
        self._errors['csv'] = self.error_class([msg])

    orders = [entry] + [row for row in reader]
    return orders
def upload_waterpoints(filename, skip=0, limit=None): """Upload waterpoints from a gzipped CSV file.""" convert = { 'date_recorded': lambda s: datetime.strptime(s, '%m/%d/%Y'), 'population': int, 'construction_year': lambda s: datetime.strptime(s, '%Y'), 'breakdown_year': lambda s: datetime.strptime(s, '%Y'), 'amount_tsh': float, 'gps_height': float, 'latitude': float, 'longitude': float, } with gzip.open(filename) as f: reader = DictReader(f) for i in range(skip): reader.next() for i, d in enumerate(reader): d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v) d['facility_code'] = 'wpf001' check(add_document('waterpoints', d)) if limit and i >= limit: break
def test_success(field, expected, log_path, response_for, form_data, sm_mock):
    key, value = field
    form_data[key] = value
    assert response_for(form_data, log=False) == expected
    assert sm_mock.call_count == 1

    params = sm_mock.call_args[0][1]['fields']
    assert set(params.keys()) == set(form_data.keys())
    for key, value in form_data.items():
        assert params[key] == value.decode('utf8')

    assert response_for(form_data, log=True) == expected
    assert sm_mock.call_count == 2

    assert response_for(form_data, log=True) == expected
    assert sm_mock.call_count == 3

    with open(log_path) as log_file:
        reader = DictReader(log_file)
        row = reader.next()
        # rows should not be equal because the time field
        # is added by the logging function.
        assert row != reader.next()
def next(self):
    row = DictReader.next(self)
    try:
        processed_row = dict(
            (key, convert(value, self.field_types[key]))
            for key, value in row.iteritems()
        )
    except ValueError as e:
        self.errors.append((e, row))
        if not self.silent:
            raise e
    else:
        self.rows_imported += 1
        return processed_row
class StructuredReader(object):
    def __init__(self, filename, container=None, dialect='simplecsv'):
        self._container = None
        if isinstance(container, ObjectContainer):
            self._container = container
            self._reader = DictReader(filename, fieldnames=None, restkey="restkey",
                                      restval="restval", dialect=dialect)
        elif isinstance(container, TupleContainer) or isinstance(container, ListContainer):
            self._container = container
            self._reader = csv.reader(filename, dialect=dialect)
        else:
            raise Exception("Given container is not valid")

    def next(self):
        # do not treat the header row
        if self._reader.line_num == 0:
            self._reader.next()

        row = self._reader.next()
        return self._container.fetch(row)

    def __iter__(self):
        return self
def dictcsv(csvname, fieldnames=None, arrays=False):
    """Reading csv files into a dictionary.

    Arguments:
        csvname: string filename

    Keyword Arguments:
        fieldnames: list of csv column names. If None, the first row of the
            file being read will be used.
        arrays: Whether or not to return csv contents as a dict of arrays

    Returns:
        dictionary of columns as numpy arrays, keys are fieldnames
    """
    fileobj = open(csvname, 'rU')
    DR = DictReader(fileobj, fieldnames=fieldnames)
    fields = DR.fieldnames
    l = DR.next()
    dicty = {}
    for f in fields:
        try:
            dicty[f] = [float(l[f])]
        except (TypeError, ValueError):
            dicty[f] = [l[f]]
    for row in DR:
        for f in fields:
            try:
                dicty[f].append(float(row[f]))
            except (TypeError, ValueError):
                dicty[f].append(row[f])
    if arrays:
        for key in dicty:
            dicty[key] = np.array(dicty[key])
    return dicty
def __init__(self, csvfile, casts, fieldnames=None, restkey=None, restval=None,
             dialect='excel', *args, **kwds):
    """Arguments:
    - f: An iterable object such as a file. Passed on to csv.DictReader
    - casts: A dict mapping from attribute names to functions to apply to
      these names, e.g., {'id':int, 'salary':float}
    - fieldnames: Passed on to csv.DictReader
    - restkey: Passed on to csv.DictReader
    - restval: Passed on to csv.DictReader
    - dialect: Passed on to csv.DictReader
    - *args: Passed on to csv.DictReader
    - **kwds: Passed on to csv.DictReader
    """
    DictReader.__init__(self, csvfile, fieldnames=fieldnames, restkey=restkey,
                        restval=restval, dialect=dialect, *args, **kwds)
    if not type(casts) == dict:
        raise TypeError("The casts argument must be a dict")
    for v in casts.values():
        if not callable(v):
            raise TypeError("The values in casts must be callable")
    self._casts = casts
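# The casting idea used by the reader above, shown as a stand-alone, hedged sketch
# with made-up data: apply per-column converter functions to each row produced by a
# plain DictReader, falling back to str for columns without a cast.
from csv import DictReader
from io import StringIO

casts = {'id': int, 'salary': float}
sample = StringIO("id,name,salary\n1,alice,1000.5\n2,bob,1200.0\n")

for row in DictReader(sample):
    typed = {k: casts.get(k, str)(v) for k, v in row.items()}
    print(typed)   # e.g. {'id': 1, 'name': 'alice', 'salary': 1000.5}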
def __init__(self, fnm_in='input.csv', fnm_out='output.csv', restkey=None,
             restval=None, dialect_in="excel", dialect_out="excel"):
    self.f_in = open(fnm_in)
    self.csv_dict_reader = DictReader(self.f_in, restkey=restkey,
                                      restval=restval, dialect=dialect_in)
    field_names = self.csv_dict_reader.fieldnames
    if len(field_names) != len(self.defined_input_field_names):
        raise ValueError(
            "incorrect number of columns in the file %s, it should have %d columns"
            % (fnm_in, len(self.defined_input_field_names)))
    if [1 for x in zip(field_names, self.defined_input_field_names) if x[0] != x[1]]:
        raise ValueError(
            "incorrect names of columns in the file %s, they should be %s"
            % (fnm_in, '"{0}"'.format('","'.join(x for x in self.defined_input_field_names))))
    self.f_out = open(fnm_out, 'w')
    self.csv_dict_writer = DictWriter(self.f_out, self.defined_output_field_names,
                                      dialect=dialect_out)
class BaseCSVHandler(object): defined_input_field_names = ['date','customer','money'] defined_output_field_names = ['date','customer','money'] result = [] def __init__(self, fnm_in='input.csv', fnm_out='output.csv', restkey=None, restval=None, dialect_in="excel", dialect_out="excel"): self.f_in = open(fnm_in) self.csv_dict_reader = DictReader(self.f_in, restkey=restkey, restval=restval, dialect=dialect_in) field_names = self.csv_dict_reader.fieldnames if len(field_names) <> len(self.defined_input_field_names): raise ValueError,\ ("incorrect number of columns in the file %s, it should have %d columns" % (fnm_in, len(self.defined_input_field_names))) if [1 for x in zip(field_names,self.defined_input_field_names) if x[0] != x[1]]: raise ValueError,\ ("incorrect names of columns in the file %s, they should be %s" % (fnm_in,'"{0}"'.format('","'.join(x for x in self.defined_input_field_names)))) self.f_out = open(fnm_out, 'w') self.csv_dict_writer = DictWriter(self.f_out, self.defined_output_field_names, dialect=dialect_out) def __iter__(self): return self def one_string_handler(self,s): if s: self.result.append (s) def next(self): return self.csv_dict_reader.next() def calc_result(self): pass def write_result(self): self.csv_dict_writer.writeheader() self.csv_dict_writer.writerows(self.result) def close_all_files(self): self.f_in.close() self.f_out.close() self.csv_dict_writer = None self.csv_dict_reader = None def process_all(self): for i in self: self.one_string_handler(i) self.calc_result() self.write_result() self.close_all_files()
type=argparse.FileType('w'), default=sys.stdout, help="output file") parser.add_argument('--subsample', type=float, default=1.0, help='subsample this fraction of total') args = parser.parse_args() trainfile = prepfile(args.trainfile, 'r') if args.testfile is not None: testfile = prepfile(args.testfile, 'r') else: testfile = None outfile = prepfile(args.outfile, 'w') # Create feature extractor (you may want to modify this) trainn = DictReader(trainfile, delimiter='\t') fe = FeatureExtractor() # Read in training data train = DictReader(trainfile, delimiter='\t') pss = PorterStemmer() # Split off dev section dev_train = [] dev_test = [] full_train = [] p = 0 cc = 0 s1, s2, s3, s4, s5 = 0, 0, 0, 0, 0 b1, b2, b4, b3, b5 = 0, 0, 0, 0, 0
def combine_split_chained_results(output_prefixes, final_prefix, ref_gff, ref_group, ref_name, ref_fq, addon_gff, addon_group, addon_name, addon_fq): """ Each <output_prefix> will have .gff, .group.txt, .mega_info.txt. There should be NO overlap between the split files, so clean merge should be possible! 1. read the .gff files, record the group and mega (id-map) info 2. sort the total records so can properly put on a unified superPBID 3. write out the unified result 4. delete the split files """ # sanity check files are all there split_files = [] # tuple of (gff, group, mega) for ref_name, o in output_prefixes: gff_file = 'tmp_' + o + '.gff' mega_file = 'tmp_' + o + '.mega_info.txt' group_file = 'tmp_' + o + '.group.txt' if not os.path.exists(gff_file) or not os.path.exists( mega_file) or not os.path.exists(group_file): print( "Expects to see {0},{1},{2} but one or more files are missing! Abort!" .format(gff_file, mega_file, group_file), file=sys.stderr) sys.exit(-1) split_files.append((ref_name, o, gff_file, group_file, mega_file)) use_fq = False if ref_fq is not None and addon_fq is not None: use_fq = True ref_fq_dict = dict((r.id.split('|')[0], r) for r in SeqIO.parse(open(ref_fq), 'fastq')) addon_fq_dict = dict((r.id.split('|')[0], r) for r in SeqIO.parse(open(addon_fq), 'fastq')) mega_info = {} # ref id -> list of matching query_id, or empty list split_unmatched = set() for (ref_name, split_name, gff_file, group_file, mega_file) in split_files: for r in DictReader(open(mega_file), delimiter='\t'): if r[ref_name] != 'NA': if r[ref_name] not in mega_info: mega_info[r[ref_name]] = [] if r[split_name] != 'NA': mega_info[r[ref_name]].append(r[split_name]) else: # ref is NA, non-ref is not NA split_unmatched.add(r[split_name]) # make a rec list of matches of (ref_id, addon_id, representative record, combined group info) where rec_ref or ref_addon could be None, but not both rec_list = [] d_ref = dict((r.seqid, r) for r in GFF.collapseGFFReader(ref_gff)) d_addon = dict((r.seqid, r) for r in GFF.collapseGFFReader(addon_gff)) ref_group_info = sp.MegaPBTree.read_group(ref_group, None) addon_group_info = sp.MegaPBTree.read_group(addon_group, None) for ref_id, matches in mega_info.items(): if len(matches) == 0: rec_list.append( sp.MatchRecord(ref_id=ref_id, addon_id='NA', rec=d_ref[ref_id], members=ref_group_info[ref_id], seqrec=ref_fq_dict[ref_id] if use_fq else None)) else: for addon_id in matches: r1 = d_ref[ref_id] r2 = d_addon[addon_id] if (r1.end - r1.start) > (r2.end - r2.start): rec_list.append( sp.MatchRecord( ref_id=ref_id, addon_id=addon_id, rec=r1, members=ref_group_info[ref_id] + addon_group_info[addon_id], seqrec=ref_fq_dict[ref_id] if use_fq else None)) else: rec_list.append( sp.MatchRecord(ref_id=ref_id, addon_id=addon_id, rec=r2, members=ref_group_info[ref_id] + addon_group_info[addon_id], seqrec=addon_fq_dict[addon_id] if use_fq else None)) for addon_id in split_unmatched: rec_list.append( sp.MatchRecord(ref_id='NA', addon_id=addon_id, rec=d_addon[addon_id], members=addon_group_info[addon_id], seqrec=addon_fq_dict[addon_id] if use_fq else None)) sp.write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name, addon_name, use_fq) for (ref_name, split_name, gff_file, group_file, mega_file) in split_files: os.remove(gff_file) os.remove(group_file) os.remove(mega_file)
from glob import glob
import re
from csv import DictReader
from collections import defaultdict


def inner_dict_init():
    return defaultdict(float)


protocols = defaultdict(inner_dict_init)

for infile in glob('TLS1_bank/*'):
    with open(infile, 'r') as of:
        reader = DictReader(of)
        for row in reader:
            for p in row:
                if not p:
                    continue
                item = row[p]
                cipher, value = item.split()
                cipher = cipher.translate(None, "{'}:")
                value = value.translate(None, "{}, ''")
                protocols[p][cipher] += float(value)

# print protocols
res = {}
for p in protocols:
    a = sorted(protocols[p], key=lambda x: protocols[p][x], reverse=True)
    res[p] = [(i, protocols[p][i]) for i in a][:5]
    res[p].append(('other', sum([protocols[p][i] for i in a][5:])))
def read_raven_selection_table(cls, tbl_path): ''' Given the path to a Raven selection table .csv file, return a list of dicts. Each dict contains the information in one selection table row, plus two additional entries: time_interval, and freq_interval; these are added for convenience: Selection row number <int> View not used Channel not used Begin Time (s) begin of vocalization in fractional seconds <float> End Time (s) end of vocalization in fractional seconds <float> Low Freq (Hz) lowest frequency within the lassoed vocalization <float> High Freq (Hz) highest frequency within the lassoed vocalization <float> species four-letter species name <str> type {song, call, call-1, call-trill} <str> number not used mix comma separated list of other audible species [<str>] time_interval Inteval instance from start and end times freq_interval Inteval instance from start and end frequencies Values are converted to appropriate types as above. Output is suitable for SnippetSelectionTableMapper.match_snippets() The list will be sorted by ascending the 'Begin Time (s)' value. :param tbl_path: full path to Raven selection table :type tbl_path: src :return: list of dicts, each dict reflecting the content of one selection table row :rtype [str : Any] ''' with open(tbl_path, 'r') as sel_tbl_fd: reader = DictReader(sel_tbl_fd, delimiter='\t') sel_dict_list = [row_dict for row_dict in reader] # Coerce types: for sel_dict in sel_dict_list: sel_dict['Selection'] = str(sel_dict['Selection']) sel_dict['Begin Time (s)'] = float(sel_dict['Begin Time (s)']) sel_dict['End Time (s)'] = float(sel_dict['End Time (s)']) sel_dict['Low Freq (Hz)'] = float(sel_dict['Low Freq (Hz)']) sel_dict['High Freq (Hz)'] = float(sel_dict['High Freq (Hz)']) # Turn the comma-separated list of # overlapping vocalizations into # a (possibly empty) list of strings: sel_dict['mix'] = [] if len( sel_dict['mix']) == 0 else sel_dict['mix'].split(',') # Clean out spurious white space: sel_dict['mix'] = [ mix_list_entry.strip() for mix_list_entry in sel_dict['mix'] ] # Remove empty mixes: #**** sel_dict['time_interval'] = Interval(sel_dict['Begin Time (s)'], sel_dict['End Time (s)']) sel_dict['freq_interval'] = Interval(sel_dict['Low Freq (Hz)'], sel_dict['High Freq (Hz)']) # Make sure the list is sorted by # ascending start time: sel_dict_list_sorted = natsorted( sel_dict_list, key=lambda row_dict: row_dict['Begin Time (s)']) return sel_dict_list_sorted
def chain_samples_multithread(dirs, names, group_filename, gff_filename, count_filename, field_to_use='count_fl', fuzzy_junction=0, allow_5merge=False, max_3_diff=100, fastq_filename=None, cpus=4): for d in dirs.values(): sample_sanity_check(os.path.join(d, group_filename),\ os.path.join(d, gff_filename),\ os.path.join(d, count_filename),\ os.path.join(d, fastq_filename) if fastq_filename is not None else None) count_header, count_info = read_count_info(count_filename, dirs, field_to_use) # some names may already start with "tmp_" which means they are intermediate results that have already been chained # find the first non "tmp_" and start from there if names[0].startswith('tmp_'): chain = [] for start_i, name in enumerate(names): if name.startswith('tmp_'): chain.append(name[4:]) else: break # start_i, name now points at the first "non-tmp" sample # we want to go to the last tmp_ sample and read it name = names[start_i - 1][4:] # this is the last tmp_ sample, let's read it first_add = False else: # everything is new, start fresh name = names[0] chain = [name] start_i = 1 first_add = True for addon_name in names[start_i:]: assert not addon_name.startswith('tmp_') ref_name = chain[-1] ref_d = dirs[ref_name] if first_add: ref_gff = os.path.join(ref_d, gff_filename) ref_group = os.path.join(ref_d, group_filename) ref_fq = os.path.join( ref_d, fastq_filename) if fastq_filename is not None else None else: ref_name = 'tmp_' + ref_name ref_gff = ref_name + '.gff' ref_group = ref_name + '.group.txt' ref_fq = ref_name + '.rep.fq' if fastq_filename is not None else None addon_d = dirs[addon_name] addon_gff = os.path.join(addon_d, gff_filename) addon_group = os.path.join(addon_d, group_filename) addon_fq = os.path.join( addon_d, fastq_filename) if fastq_filename is not None else None split_outs, split_ins = chain_split_file(ref_gff=ref_gff, ref_group=ref_group, ref_name=ref_name, addon_gff=addon_gff, addon_group=addon_group, addon_name=addon_name, fuzzy_junction=fuzzy_junction, allow_5merge=allow_5merge, max_3_diff=max_3_diff, n_chunks=cpus) combine_split_chained_results(split_outs, final_prefix='tmp_' + addon_name, ref_gff=ref_gff, ref_group=ref_group, ref_name=ref_name, ref_fq=ref_fq, addon_gff=addon_gff, addon_group=addon_group, addon_name=addon_name, addon_fq=addon_fq) chain.append(addon_name) for in_gff_split, in_group_split in split_ins: os.remove(in_gff_split) # remove the split gff os.remove(in_group_split) first_add = False # now recursively chain back by looking at mega_info.txt!!! 
d = {} # ex: (tmp_sample1, PB.1.1) --> mega info dict for c in chain[1:]: for r in DictReader(open('tmp_' + c + '.mega_info.txt'), delimiter='\t'): d['tmp_' + c, r['superPBID']] = r f1 = open('all_samples.chained_ids.txt', 'w') writer1 = DictWriter(f1, fieldnames=['superPBID'] + chain, delimiter='\t') writer1.writeheader() f2 = open('all_samples.chained_count.txt', 'w') writer2 = DictWriter(f2, fieldnames=['superPBID'] + chain, delimiter='\t') writer2.writeheader() reader = DictReader(open('tmp_' + chain[-1] + '.mega_info.txt'), delimiter='\t') for r in reader: saw_NA = False r0 = r answer = defaultdict(lambda: 'NA') # ex: 1009 --> PB.1.1 answer2 = defaultdict(lambda: 'NA') # ex: 1009 --> count answer[chain[-1]] = r[chain[-1]] if r[chain[-1]] != 'NA': answer2[chain[-1]] = count_info[chain[-1], answer[chain[-1]]] for c in chain[::-1][ 1: -1]: # the first sample does not have tmp_, because it's not a chain if r['tmp_' + c] == 'NA': saw_NA = True break else: r2 = d['tmp_' + c, r['tmp_' + c]] answer[c] = r2[c] if answer[c] != 'NA': answer2[c] = count_info[c, answer[c]] r = r2 if not saw_NA: answer[chain[0]] = r[chain[0]] if answer[chain[0]] != 'NA': answer2[chain[0]] = count_info[chain[0], answer[chain[0]]] rec1 = {'superPBID': r0['superPBID']} rec2 = {'superPBID': r0['superPBID']} for c in chain: rec1[c] = answer[c] rec2[c] = str(answer2[c]) writer1.writerow(rec1) writer2.writerow(rec2) f1.close() f2.close() shutil.copyfile('tmp_' + chain[-1] + '.gff', 'all_samples.chained.gff') if fastq_filename is not None: shutil.copyfile('tmp_' + chain[-1] + '.rep.fq', 'all_samples.chained.rep.fq') print("Chained output written to:", file=sys.stdout) print("all_samples.chained.gff", file=sys.stdout) print(f1.name, file=sys.stdout) print(f2.name, file=sys.stdout) if fastq_filename is not None: print("all_samples.chained.rep.fq", file=sys.stdout)
def readCSV(filename):
    with open(filename) as inFile:
        drObject = DictReader(inFile)
        return list(drObject)
def main(): parser = argparse.ArgumentParser() parser.add_argument('performance_files', type=Path, nargs='+') parser.add_argument('--output-dir', type=Path) args = parser.parse_args() performance = [] for f in args.performance_files: with open(f) as fp: performance.extend(list(DictReader(fp))) cols = set() for p in performance: cols.update(p.keys()) data_dicts = defaultdict(list) for p in performance: for c in cols: v = p.get(c, None) data_dicts[c].append(v) df = pd.DataFrame(data_dicts) nwp_models = df['nwp_model'].unique() mae = df['mean_absolute_error'] df['mean_absolute_error'] = pd.to_numeric(df['mean_absolute_error'], errors='coerce') df = df.dropna(axis=0, how='any', subset=['mean_absolute_error']) mae_min = df['mean_absolute_error'].min() mae_max = df['mean_absolute_error'].max() models = df['model'].unique() fig, axes = plt.subplots(len(nwp_models), len(models), sharex='col', squeeze=False, sharey='all') for i, nwp_model in enumerate(sorted(nwp_models)): for j, model in enumerate(sorted(models)): ax = axes[i, j] model_df = df.loc[np.logical_and(df['nwp_model'] == nwp_model, df['model'] == model)] try: sns.boxplot(data=model_df, x='fold_id', y='mean_absolute_error', ax=ax) except ValueError: continue ax.set_title(nwp_model) if i == len(nwp_models) - 1: ax.set_xlabel('Test fold id') else: ax.set_xlabel('') if i == 0: ax.text(0.5, 1.3, model, horizontalalignment='center', verticalalignment='center', transform=ax.transAxes, fontdict=dict(size=18)) if j == 0: ax.set_ylabel('Mean absolute error') else: ax.set_ylabel('') if args.output_dir is not None: save_path = args.output_dir / f'nwp_compare.png' plt.savefig(save_path) plt.show()
def read_content(filename):
    with open(filename, newline='') as fp:
        reader = DictReader(fp)
        return list(reader)
'ces': 'Czech', 'deu': 'German', 'fra': 'French', 'fre': 'French', 'esn': 'Spanish', 'fin': 'Finnish', 'rus': 'Russian', 'hin': 'Hindi', 'eng': 'English', 'ron': 'Romanian', 'tur': 'Turkish', 'Portguese': 'Portuguese' } if __name__ == "__main__": args = PARSER.parse_args() if not args.inter_annotator_agreement and \ not args.intra_annotator_agreement: print("Defaulting to --inter mode.") args.inter_annotator_agreement = True results_data = defaultdict(lambda: defaultdict(list)) for i, row in enumerate(DictReader(args.results_file)): src_lang = row.get('srclang') if src_lang in list(LANGUAGE_CODE_TO_NAME.keys()): src_lang = LANGUAGE_CODE_TO_NAME[src_lang] trg_lang = row.get('trglang') if trg_lang in list(LANGUAGE_CODE_TO_NAME.keys()): trg_lang = LANGUAGE_CODE_TO_NAME[trg_lang] language_pair = '{0}-{1}'.format(src_lang, trg_lang) segment_id = int(row.get('srcIndex')) judge_id = row.get('judgeId') if not judge_id: judge_id = row.get('judgeID') # Filter out results where a user decided to "skip" ranking.
a = sorted(lst, key=lambda x: x[1]) return a[1:16] dct = { 'AS': 'Asia', 'EU': 'Europe', 'AF': 'Africa', 'OC': 'Oceania', 'NA': 'North America', 'AN': 'Antarctica', 'SA': 'South America' } with open('countries.csv') as f: reader = list(DictReader(f)) for i in reader: with open('%s.html' % i['short_name'], 'w') as myFile: myFile.write('<html>\n') myFile.write('<head>\n') myFile.write('\t<title>%s</title>\n' % i['name']) myFile.write('</head>\n') a = i['short_name'] myFile.write( '<img src="http://www.crwflags.com/fotw/images/{}/{}.gif">\n'. format(a[0].lower(), a.lower())) myFile.write('<h1>%s</h1>\n' % i['name']) myFile.write('<dl>\n') myFile.write('\t<dt>Capital</dt>\n')
from csv import DictReader

data_rdr = DictReader(open('../data-wrangling/data/unicef/mn.csv', 'rt'))
header_rdr = DictReader(
    open('../data-wrangling/data/unicef/mn_headers.csv', 'rt'))

data_rows = [d for d in data_rdr]
header_rows = [h for h in header_rdr]

new_rows = []
for data_dict in data_rows:
    new_row = {}
    for dkey, dval in data_dict.items():
        for header_dict in header_rows:
            if dkey in header_dict.values():
                new_row[header_dict.get('Label')] = dval
    new_rows.append(new_row)

print(new_rows[0])
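# A hedged sketch of the same remapping done with a precomputed lookup table instead
# of the nested loop above. It assumes (not confirmed by the source) that each row of
# mn_headers.csv has a column holding the short key -- guessed here to be 'Name' --
# alongside the human-readable 'Label' column.
label_for = {h['Name']: h['Label'] for h in header_rows}

new_rows = [
    {label_for[dkey]: dval for dkey, dval in data_dict.items() if dkey in label_for}
    for data_dict in data_rows
]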
or implied. ''' import requests, yaml, json from flask import Flask, request, redirect, url_for, render_template from csv import DictReader cred = yaml.safe_load(open("credentials.yml")) WT_ADMIN_TOKEN = cred['WT_ADMIN_TOKEN'] MERAKI_KEY = cred['MERAKI_KEY'] MERAKI_NETWORK_ID = cred['MERAKI_NETWORK_ID'] # Room Data, from csv file MerakiCamera_to_WebexRoomKitMini = [] with open('MerakiCameras_to_WebexRoomKitMini_Pairing.csv', 'r') as read_obj: csv_dict_reader = DictReader(read_obj) for row in csv_dict_reader: row['Room_Name'] = row.pop('Room_Name') MerakiCamera_to_WebexRoomKitMini.append(row) # note the use case is developed with only one patient room available, requires iterations if multiple rooms are listed in MerakiCameras_to_WebexRoomKitMini_Pairing.csv MERAKI_SN = MerakiCamera_to_WebexRoomKitMini[0]['Meraki_SN'] ROOM_NAME = MerakiCamera_to_WebexRoomKitMini[0]['Room_Name'] ROOMKIT_ID = MerakiCamera_to_WebexRoomKitMini[0]['Webex_RoomKitMini_ID'] SIP_URL = MerakiCamera_to_WebexRoomKitMini[0]['Webex_RoomKitMini_SIP'] app = Flask(__name__) @app.route('/') def pop_up(): snapshot_url = "https://api.meraki.com/api/v0/networks/{1}/cameras/{0}/snapshot".format(
from csv import DictReader
import pickle

data = list(DictReader(open("diabetes.csv", "rt")))

data1 = open("diabetes.pkl", "wb")
pickle.dump(data, data1)
data1.close()

data1 = open("diabetes.pkl", "rb")
pickle.load(data1)

for i in data:
    print(i)
#!/usr/bin/python
import model
from model import db
from io import open
from csv import DictReader

db.drop_all()
db.create_all()

with open('data/movies.csv', 'r', encoding='utf-8-sig') as movies_file:
    reader = DictReader(movies_file)
    for row in reader:
        new_movie = model.Movie(name=row['name'], year=row['year'])
        actors = row['actors'].split(';')
        for actor in actors:
            print(actor)
            existing_actor = model.Actor.query.filter_by(name=actor).first()
            if (existing_actor):
                existing_actor.movies.append(new_movie)
                new_movie.actors.append(existing_actor)
            else:
                new_actor = model.Actor(name=actor)
                new_actor.movies.append(new_movie)
                new_movie.actors.append(new_actor)
                db.session.add(new_actor)
        db.session.add(new_movie)

with open('data/songs.csv', 'r', encoding='utf-8-sig') as songs_file:
def _detect_header(self):
    with open(self._file) as csv_file:
        reader = DictReader(csv_file)
        self.header = reader.fieldnames
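# A minimal, self-contained sketch of the same header-detection idea using only the
# standard library; 'example' columns and data below are made up for illustration.
from csv import DictReader
from io import StringIO

sample = StringIO("id,name,score\n1,alice,10\n2,bob,12\n")
reader = DictReader(sample)
print(reader.fieldnames)   # ['id', 'name', 'score'] -- reading fieldnames only consumes the header row
print(next(reader))        # the first data row is still available afterwards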
from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split import os import re import nltk import numpy as np from sklearn import feature_extraction from tqdm import tqdm from nltk.tokenize import word_tokenize, sent_tokenize from sklearn.feature_extraction.text import CountVectorizer # In[3]: #load train data f_train_bodies = open(data_path + '/' + 'x_train.csv', 'r', encoding='utf-8') X_train_csv = DictReader(f_train_bodies) X_train_data = list(X_train_csv) col_name = ['Headline', 'Body', 'Stance'] X_train_headline = [x[col_name[0]] for x in X_train_data] X_train_body = [x[col_name[1]] for x in X_train_data] Y_train = [y[col_name[2]] for y in X_train_data] y_train = Y_train + Y_train # In[4]: # Read the text files of fnc data X_train_df = pd.read_csv(data_path + '/' + 'x_train.csv') X_test_df = pd.read_csv(data_path + '/' + 'x_test.csv') # In[5]:
def prepare_distinct(path, out, embedder): print path c = 0 start = datetime.now() with open(out, 'w') as outfile: columns = [ 'w2v_sim_mean', 'w2v_sim_max', 'w2v_sim_min', 'w2v_sim_std', 'w2v_dist_mean', 'w2v_dist_max', 'w2v_dist_min', 'w2v_dist_std', ] columns = ','.join(columns) outfile.write(columns + '\n') for t, row in enumerate(DictReader(open(path), delimiter=',')): if c % 100000 == 0: print 'finished', c q1 = remove_punctuation(str(row['question1']).lower()) q2 = remove_punctuation(str(row['question2']).lower()) # print q1,q2 q1, q2 = distinct_terms(q1, q2) # print q1,"_______",q2 a2 = [x for x in q1.split(' ') if x in embedder.vocab] b2 = [x for x in q2.split(' ') if x in embedder.vocab] # print a2,b2 sims = [] dists = [] if len(a2) == 0 or len(b2) == 0: sims = [0.0] dists = [0.0] else: for i in range(len(a2)): for j in range(len(b2)): try: worda = a2[i] wordb = b2[j] if worda == "" or wordb == "": continue sim = embedder.n_similarity(worda, wordb) vector_diff = embedder[worda] - embedder[wordb] dist = np.sqrt(np.sum(vector_diff**2)) sims.append(sim) dists.append(dist) except Exception, e: # print e continue if len(sims) == 0 or len(dists) == 0: sims = [0.0] dists = [0.0] w2v_sim_mean = np.mean(sims) w2v_sim_max = np.max(sims) w2v_sim_min = np.min(sims) w2v_sim_std = np.std(sims) w2v_dist_mean = np.mean(dists) w2v_dist_max = np.max(dists) w2v_dist_min = np.min(dists) w2v_dist_std = np.std(dists) features = ( w2v_sim_mean, w2v_sim_max, w2v_sim_min, w2v_sim_std, w2v_dist_mean, w2v_dist_max, w2v_dist_min, w2v_dist_std, ) outfile.write('%s,%s,%s,%s,%s,%s,%s,%s\n' % features) c += 1 end = datetime.now()
def load_label(path, label):
    result = []
    for row in DictReader(open(path)):
        if int(row['Class']) == label:
            result.append(row['Id'])
    return result
    print('day{0} completed!'.format(this))
    # print(line)
    train.write(','.join('{0}'.format(line[i]) for i in TRAIN) + '\n')
    test.write(','.join('{0}'.format(line[i]) for i in TEST) + '\n')
    validation.write('{0},{1}\n'.format(line['Id'], line['Label']))
    #t+=1


#day_split()
day_split2()

last = 0
dataset = open('day1.vw')
validation = open('day1_val.csv')
for e, row in enumerate(DictReader(open(input))):
    categorical_features = []
    for k, v in row.items():
        if k not in ['Label', 'Id']:
            if len(str(v)) > 0:
                categorical_features.append('{0}.{1}'.format(k, v))
                # joining with '-' here made the test fail, oddly; ignore it
    if (type == '-train'):
        if row['Label'] == '1':
            label = 1
        else:
            label = -1
    else:
        if validation.readline().strip().split(',')[1] == '1':
            label = 1
        else:
# writer
# DictWriter
from csv import writer

with open('file2.csv', 'w', newline='') as f:
    csv_writer = writer(f)
    # csv_writer.writerow(['name','country'])
    # csv_writer.writerow(['nateq','india'])
    # csv_writer.writerow(['ahmed','india'])
    csv_writer.writerows([['name', 'country'], ['nateq', 'india'],
                          ['ahmed', 'india']])

from csv import DictReader

with open('file2.csv', 'r') as rf:
    csv_reader = DictReader(rf)
    for row in csv_reader:
        print(row)
from csv import DictReader train = 'data/train_df_app_smooth.csv' # path to training file test = 'data/test_df_app_smooth.csv' #train = 'data/train_df_site_smooth.csv' # path to training file #test = 'data/test_df_site_smooth.csv' # -- train data -- # # list(test_df.columns.values) start = datetime.now() with open('data/train_df_app_smooth_ex_id.csv', "wb") as outfile: outfile.write( 'id,click,hour,C1,banner_pos,app_domain,app_category,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21\n' ) for t, row in enumerate(DictReader(open(train))): # turn hour really into hour, it was originally YYMMDDHH ID = row['id'] click = row['click'] hour = row['hour'] C1 = row['C1'] banner_pos = row['banner_pos'] app_domain = row['app_domain'] app_category = row['app_category'] #device_id = row['device_id'] device_ip = row['device_ip'] device_model = row['device_model'] device_type = row['device_type'] device_conn_type = row['device_conn_type'] C14 = row['C14']
def filter_by_count(input_prefix, output_prefix, min_count): group_filename = input_prefix + '.group.txt' count_filename = input_prefix + '.abundance.txt' gff_filename = input_prefix + '.gff' rep_filename = input_prefix + '.rep.fq' # read group group_max_count_fl = {} group_max_count_p = {} f = open(group_filename) for line in f: #ex: PB.1.1 i0HQ_54b0ca|c58773/f30p16/700 pbid, members = line.strip().split('\t') group_max_count_fl[pbid] = 0 group_max_count_p[pbid] = 0 members = members.split(',') for m in members: i = m.find('|') if i > 0: tmp = m.split('|')[1].split('/')[1] #ex: tmp = f30p16 else: tmp = m.split('/')[1] fl_count, p_count = tmp.split('p') fl_count = int(fl_count[1:]) p_count = int(p_count) group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count) group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count) f.close() # read abundance first f = open(count_filename) count_header = '' while True: cur_pos = f.tell() line = f.readline() if not line.startswith('#'): f.seek(cur_pos) break else: count_header += line d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t')) for k, v in d.iteritems(): print k, v f.close() # group_max_count_p NOT used for now good = filter( lambda x: int(d[x]['count_fl']) >= min_count and group_max_count_fl[x] >= min_count and group_max_count_p >= 0, d) # write output GFF f = open(output_prefix + '.gff', 'w') for r in GFF.collapseGFFReader(gff_filename): if r.seqid in good: GFF.write_collapseGFF_format(f, r) f.close() # write output rep.fq f = open(output_prefix + '.rep.fq', 'w') for r in SeqIO.parse(open(rep_filename), 'fastq'): if r.name.split('|')[0] in good: SeqIO.write(r, f, 'fastq') f.close() # write output to .abundance.txt f = open(output_prefix + '.abundance.txt', 'w') f.write(count_header) writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \ delimiter='\t', lineterminator='\n') writer.writeheader() for k in good: r = d[k] writer.writerow(r) f.close() print >> sys.stderr, "Output written to:", output_prefix + '.gff' print >> sys.stderr, "Output written to:", output_prefix + '.rep.fq' print >> sys.stderr, "Output written to:", output_prefix + '.abundance.txt'
def read_quotes(filename):
    with open(filename, 'r', encoding='utf-16') as file:
        csv_reader = DictReader(file)
        return list(csv_reader)
def test_export( self, es_with_collector, request_sortby, orm_ordering, requests_mock, accepts_dit_email_marketing, ): """Test export of contact search results.""" ArchivedContactFactory() ContactWithOwnAddressFactory() ContactFactory() # These are to test date of and team of latest interaction a bit more thoroughly CompanyInteractionFactory.create_batch(2) CompanyInteractionFactory(contacts=ContactFactory.create_batch(2)) interaction_with_multiple_teams = CompanyInteractionFactory() InteractionDITParticipantFactory.create_batch( 2, interaction=interaction_with_multiple_teams, ) es_with_collector.flush_and_refresh() data = {} if request_sortby: data['sortby'] = request_sortby url = reverse('api-v3:search:contact-export') with freeze_time('2018-01-01 11:12:13'): response = self.api_client.post(url, data=data) assert response.status_code == status.HTTP_200_OK assert parse_header(response.get('Content-Type')) == ('text/csv', { 'charset': 'utf-8' }) assert parse_header(response.get('Content-Disposition')) == ( 'attachment', { 'filename': 'Data Hub - Contacts - 2018-01-01-11-12-13.csv' }, ) sorted_contacts = Contact.objects.annotate( computed_address_country_name=Coalesce( 'address_country__name', 'company__address_country__name', ), ).order_by( orm_ordering, 'pk', ) matcher = requests_mock.post( f'{settings.CONSENT_SERVICE_BASE_URL}' f'{CONSENT_SERVICE_PERSON_PATH_LOOKUP}', text=generate_hawk_response({ 'results': [{ 'email': contact.email, 'consents': [ CONSENT_SERVICE_EMAIL_CONSENT_TYPE, ] if accepts_dit_email_marketing else [], } for contact in sorted_contacts], }), status_code=status.HTTP_200_OK, ) reader = DictReader(StringIO(response.getvalue().decode('utf-8-sig'))) assert reader.fieldnames == list( SearchContactExportAPIView.field_titles.values()) expected_row_data = [{ 'Name': contact.name, 'Job title': contact.job_title, 'Date created': contact.created_on, 'Archived': contact.archived, 'Link': f'{settings.DATAHUB_FRONTEND_URL_PREFIXES["contact"]}/{contact.pk}', 'Company': get_attr_or_none(contact, 'company.name'), 'Company sector': get_attr_or_none(contact, 'company.sector.name'), 'Company link': f'{settings.DATAHUB_FRONTEND_URL_PREFIXES["company"]}/{contact.company.pk}', 'Company UK region': get_attr_or_none(contact, 'company.uk_region.name'), 'Country': contact.company.address_country.name if contact.address_same_as_company else contact.address_country.name, 'Postcode': contact.company.address_postcode if contact.address_same_as_company else contact.address_postcode, 'Phone number': ' '.join( (contact.telephone_countrycode, contact.telephone_number)), 'Email address': contact.email, 'Accepts DIT email marketing': accepts_dit_email_marketing, 'Date of latest interaction': max(contact.interactions.all(), key=attrgetter('date')).date if contact.interactions.all() else None, 'Teams of latest interaction': _format_interaction_team_names( max(contact.interactions.all(), key=attrgetter('date')), ) if contact.interactions.exists() else None, 'Created by team': get_attr_or_none(contact, 'created_by.dit_team.name'), } for contact in sorted_contacts] actual_row_data = [dict(row) for row in reader] assert actual_row_data == format_csv_data(expected_row_data) assert matcher.call_count == 1 assert matcher.last_request.json() == { 'emails': [contact.email for contact in sorted_contacts], }
def gather_experiment_data(experiment, lead_time_interval=None): experiment_data = dict() metadata_path = experiment / 'metadata.json' if metadata_path.exists(): with open(metadata_path) as fp: metadata = json.load(fp) site_dataset_path = Path(metadata['experiment_config']['site_dataset_path']) site_id = get_site_id(site_dataset_path) nwp_model = get_nwp_model_from_path(site_dataset_path) experiment_data['site_id'] = site_id experiment_data['nwp_model'] = nwp_model.identifier try: experiment_data['model'] = metadata['model_metadata']['model'] model_kwargs = metadata['model_metadata']['kwargs'] for kwarg, value in model_kwargs.items(): experiment_data[kwarg] = str(value) except KeyError: pass try: model_config = metadata['hp_settings']['model_config'] experiment_data['model'] = model_config['model'] model_kwargs = model_config['model_kwargs'] for kwarg, value in model_kwargs.items(): experiment_data[f'model_kwarg_{kwarg}'] = str(value) except KeyError: pass best_model_path = experiment / 'best_model' if best_model_path.exists(): try: best_model = load_model(best_model_path) if isinstance(best_model, windpower.mltrain.train.BaseModel): model_metadata = best_model.get_metadata() for k, v in model_metadata.items(): if k == 'args': for i, arg in enumerate(v): experiment_data[f'args_{i}'] = arg elif k == 'kwargs': for kwarg_name, kwarg in v.items(): experiment_data[f'kwarg_{kwarg_name}'] = kwarg else: experiment_data[k] = v try: best_iteration = best_model.best_iteration_ experiment_data['best_iteration'] = best_iteration except AttributeError: pass except ValueError: pass ## best_performance.csv is the best performance on the validation set, if there's a test_predictions.npz, we should use it test_predictions_path = experiment / 'test_predictions.npz' if test_predictions_path.exists(): print("Using test predictions to derive performance data") predictions = np.load(test_predictions_path) x = predictions['x'] with open(experiment / 'artifacts' / 'settings.pkl', 'rb') as fp: settings = pickle.load(fp) model_variables_path = experiment / 'test_variable_definitions.json' with open(model_variables_path) as fp: model_variables = json.load(fp) lead_time_column = model_variables['lead_time'][0] time_of_day_column = model_variables['time_of_day'][0] if time_of_day_column > x.shape[1]: print("Warning, time of day column is higher than number of columns") time_of_day_column = lead_time_column + 1 y = predictions['y'] y_hat = predictions['y_hat'] mae = np.abs(y - y_hat) production_offset = settings.dataset_config.production_offset time_of_day = x[:, time_of_day_column] lead_times = x[:, lead_time_column] + production_offset for lead_time in np.unique(lead_times.astype(np.int)): experiment_data[f'mae_{lead_time:02}'] = mae[lead_times == lead_time].mean() experiment_data['mae'] = mae.mean() else: best_performance_path = experiment / 'best_performance.csv' if best_performance_path.exists(): with open(best_performance_path) as in_fp: best_performance = next( iter(DictReader(in_fp))) # take the first row, it's the only one for k, v in best_performance.items(): experiment_data[k] = v else: raise FileNotFoundError("No performance data found") fold_reference_times_path = experiment.parent / 'fold_reference_times.npz' if fold_reference_times_path.exists(): fold_reference_times = np.load(fold_reference_times_path) training_reference_times = fold_reference_times['train'] test_reference_times = fold_reference_times['test'] experiment_data['n_train_forecasts'] = len(training_reference_times) 
        experiment_data['n_test_forecasts'] = len(test_reference_times)

    return experiment_data
def __init__(self, env_params, sim_params, network, simulator='traci'): """See parent class.""" for p in OPEN_ENV_PARAMS.keys(): if p not in env_params.additional_params: raise KeyError('Env parameter "{}" not supplied'.format(p)) assert not (env_params.additional_params["warmup_path"] is not None and env_params.additional_params["inflows"] is not None), \ "Cannot assign a value to both \"warmup_paths\" and \"inflows\"" # this is stored to be reused during the reset procedure self._network_cls = network.__class__ self._network_name = deepcopy(network.orig_name) self._network_net_params = deepcopy(network.net_params) self._network_initial_config = deepcopy(network.initial_config) self._network_traffic_lights = deepcopy(network.traffic_lights) self._network_vehicles = deepcopy(network.vehicles) super(AVOpenEnv, self).__init__( env_params=env_params, sim_params=sim_params, network=network, simulator=simulator, ) # Get the paths to all the initial state xml files warmup_path = env_params.additional_params["warmup_path"] if warmup_path is not None: self.warmup_paths = [ f for f in os.listdir(warmup_path) if f.endswith(".xml") ] self.warmup_description = defaultdict(list) for record in DictReader( open(os.path.join(warmup_path, 'description.csv'))): for key, val in record.items(): # or iteritems in Python 2 self.warmup_description[key].append(float(val)) else: self.warmup_paths = None self.warmup_description = None # maximum number of controlled vehicles self.num_rl = env_params.additional_params["num_rl"] # queue of rl vehicles waiting to be controlled self.rl_queue = collections.deque() # names of the rl vehicles controlled at any step self.rl_veh = [] # names of the rl vehicles past the control range self.removed_veh = [] # used for visualization: the vehicles behind and after RL vehicles # (ie the observed vehicles) will have a different color self.leader = [] self.follower = [] # control range, updated to be entire network if not specified self._control_range = \ self.env_params.additional_params["control_range"] or \ [0, self.k.network.length()] # dynamics controller for uncontrolled RL vehicles (mimics humans) controller = self.k.vehicle.type_parameters["human"][ "acceleration_controller"] self._rl_controller = controller[0]( veh_id="rl", car_following_params=self.k.vehicle.type_parameters["human"][ "car_following_params"], **controller[1] ) if isinstance(network, I210SubNetwork): # the name of the final edge, whose speed limit may be updated self._final_edge = "119257908#3" # maximum number of lanes to add vehicles across self._num_lanes = 5 else: # the name of the final edge, whose speed limit may be updated self._final_edge = "highway_end" # maximum number of lanes to add vehicles across self._num_lanes = 1
def chain_samples(dirs, names, group_filename, gff_filename, count_filename, field_to_use='count_fl', fuzzy_junction=0, allow_5merge=False, max_3_diff=100, fastq_filename=None): for d in dirs.values(): sample_sanity_check(os.path.join(d, group_filename),\ os.path.join(d, gff_filename),\ os.path.join(d, count_filename),\ os.path.join(d, fastq_filename) if fastq_filename is not None else None) count_header, count_info = read_count_info(count_filename, dirs, field_to_use) # some names may already start with "tmp_" which means they are intermediate results that have already been chained # find the first non "tmp_" and start from there if names[0].startswith('tmp_'): chain = [] for start_i, name in enumerate(names): if name.startswith('tmp_'): chain.append(name[4:]) else: break # start_i, name now points at the first "non-tmp" sample # we want to go to the last tmp_ sample and read it name = names[start_i - 1][4:] # this is the last tmp_ sample, let's read it o = sp.MegaPBTree('tmp_'+name+'.gff', 'tmp_'+name+'.group.txt', self_prefix='tmp_'+name, \ internal_fuzzy_max_dist=fuzzy_junction, \ allow_5merge=allow_5merge, \ max_3_diff=max_3_diff, \ fastq_filename='tmp_'+name+'.rep.fq' if fastq_filename is not None else None) #chain.append(name) # no need, already done above else: # everything is new, start fresh name = names[0] d = dirs[name] chain = [name] o = sp.MegaPBTree(os.path.join(d, gff_filename), os.path.join(d, group_filename), \ self_prefix=name, internal_fuzzy_max_dist=fuzzy_junction, \ allow_5merge=allow_5merge, \ max_3_diff=max_3_diff, \ fastq_filename=os.path.join(d, fastq_filename) if fastq_filename is not None else None) start_i = 1 for name in names[start_i:]: assert not name.startswith('tmp_') d = dirs[name] o.add_sample(os.path.join(d, gff_filename), os.path.join(d, group_filename), \ sample_prefix=name, output_prefix='tmp_'+name, \ fastq_filename=os.path.join(d, fastq_filename) if fastq_filename is not None else None) o = sp.MegaPBTree('tmp_'+name+'.gff', 'tmp_'+name+'.group.txt', self_prefix='tmp_'+name, \ internal_fuzzy_max_dist=fuzzy_junction, \ allow_5merge=allow_5merge, \ max_3_diff=max_3_diff, \ fastq_filename='tmp_'+name+'.rep.fq' if fastq_filename is not None else None) chain.append(name) # now recursively chain back by looking at mega_info.txt!!! 
d = {} # ex: (tmp_1009, PB.1.1) --> mega info dict for c in chain[1:]: for r in DictReader(open('tmp_' + c + '.mega_info.txt'), delimiter='\t'): d['tmp_' + c, r['superPBID']] = r f1 = open('all_samples.chained_ids.txt', 'w') writer1 = DictWriter(f1, fieldnames=['superPBID'] + chain, delimiter='\t') writer1.writeheader() f2 = open('all_samples.chained_count.txt', 'w') writer2 = DictWriter(f2, fieldnames=['superPBID'] + chain, delimiter='\t') writer2.writeheader() reader = DictReader(open('tmp_' + chain[-1] + '.mega_info.txt'), delimiter='\t') for r in reader: saw_NA = False r0 = r answer = defaultdict(lambda: 'NA') # ex: 1009 --> PB.1.1 answer2 = defaultdict(lambda: 'NA') # ex: 1009 --> count answer[chain[-1]] = r[chain[-1]] if r[chain[-1]] != 'NA': answer2[chain[-1]] = count_info[chain[-1], answer[chain[-1]]] for c in chain[::-1][ 1: -1]: # the first sample does not have tmp_, because it's not a chain if r['tmp_' + c] == 'NA': saw_NA = True break else: r2 = d['tmp_' + c, r['tmp_' + c]] answer[c] = r2[c] if answer[c] != 'NA': answer2[c] = count_info[c, answer[c]] r = r2 if not saw_NA: answer[chain[0]] = r[chain[0]] if answer[chain[0]] != 'NA': answer2[chain[0]] = count_info[chain[0], answer[chain[0]]] rec1 = {'superPBID': r0['superPBID']} rec2 = {'superPBID': r0['superPBID']} for c in chain: rec1[c] = answer[c] rec2[c] = str(answer2[c]) writer1.writerow(rec1) writer2.writerow(rec2) f1.close() f2.close() shutil.copyfile('tmp_' + chain[-1] + '.gff', 'all_samples.chained.gff') if fastq_filename is not None: shutil.copyfile('tmp_' + chain[-1] + '.rep.fq', 'all_samples.chained.rep.fq') print("Chained output written to:", file=sys.stdout) print("all_samples.chained.gff", file=sys.stdout) print(f1.name, file=sys.stdout) print(f2.name, file=sys.stdout) if fastq_filename is not None: print("all_samples.chained.rep.fq", file=sys.stdout)
from csv import DictWriter, DictReader

with open('test_csv', 'w') as f:
    csv_writer = DictWriter(f, fieldnames=['name', 'contact'])
    csv_writer.writerows([{
        'name': 'omkar',
        'contact': '123'
    }, {
        'name': 'surve',
        'contact': '456'
    }])

with open('test_csv', 'r') as t:
    csv_reader = DictReader(t, fieldnames=['name', 'contact'])
    for k in csv_reader:
        print(k['contact'])
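# A variant of the round trip above using the same made-up data: writing the header
# row with writeheader() lets DictReader infer the field names on its own, so they
# do not have to be repeated on the reading side.
from csv import DictWriter, DictReader

with open('test_csv_with_header', 'w', newline='') as f:
    csv_writer = DictWriter(f, fieldnames=['name', 'contact'])
    csv_writer.writeheader()                      # header row: name,contact
    csv_writer.writerows([{'name': 'omkar', 'contact': '123'},
                          {'name': 'surve', 'contact': '456'}])

with open('test_csv_with_header', 'r', newline='') as t:
    for row in DictReader(t):                     # fieldnames come from the header row
        print(row['name'], row['contact'])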
plt.show() # This needs to be commented out of user code ''' import sys def report( name, shortd, longd): d = {'Name': name, 'Short': shortd, 'Long': longd} print(str(d)) #Mock data goes first from csv import DictReader # helps with handling csv formatted data from urllib2 import urlopen # helps with pulling data off the web url = 'https://docs.google.com/spreadsheets/d/1_artlzgoj6pDBCBfdt9-Jmc9RT9yLsZ0vTnk3zJmt_E/pub?gid=1291197392&single=true&output=csv' response = urlopen(url) loan_table = [row for row in DictReader(response)] # a mapping function using identity xloan_table = loan_table # in case user screws with loan_table int_scores = [int(row['Credit_History']) for row in xloan_table if row['Credit_History'] != ''] # image dirs target_url1 = '/home/smccumsey/waggle-classroom/waggle/media/course_1/module_9/image_3/challenge3.png' target_url2 = '/home/smccumsey/waggle-classroom/waggle/media/tmp/attempt_6_3.png' import matplotlib.pyplot as plt plt.close() plt.cla() plt.clf()
            if len(existing_password) < 8:
                print("Password too short")
                continue
            elif len(existing_password) > 35:
                print("Password too long")
                continue
            elif not existing_password.isalnum():
                print("Password must be alphanumerical")
                continue
            # check if password/account exist in the system
            else:
                with open("user_data.csv", "r") as read_obj:
                    csv_reader = DictReader(read_obj)
                    for row in csv_reader:
                        if row["username"] == existing_user_id and row[
                                "password"] == existing_password:
                            print("Valid password")
                            print("Welcome back %s, it's been a while" %
                                  existing_user_id)
                            break
                        else:
                            print("Password or username is incorrect try again")
                            break
                break

# if user does not have an account they should create an account by giving their
# name, email, username and password. user must receive a greeting