def __repr__(self):
    """Return a printable representation of the stored value(s).

    A continuous range is shown as the raw value list; a single value is
    shown bare; multiple values are rendered as "[a, b, ...]" with numeric
    entries rounded to 2 decimal places for readability.
    """
    if self.is_continous:
        return str(self.value_list)
    if len(self.value_list) == 1:
        return str(self.value_list[0])
    # Format each entry once, then join — avoids the quadratic string
    # concatenation and manual last-element separator bookkeeping.
    parts = [
        str(round(float(val), 2)) if tools.is_numeric(val) else str(val)
        for val in self.value_list
    ]
    return "[" + ", ".join(parts) + "]"
def generate_sql(self): """Generate a SQL file from GTFS feed.""" if self.logfile: out = open(self.logfile, "a") else: out = sys.stdout sqlfile = "" if zipfile.is_zipfile(self.source): # create temp file for SQL output fd, sqlfile = tempfile.mkstemp() tmpfile = os.fdopen(fd, "w") # begin a transaction in SQL file tmpfile.write("SET CLIENT_ENCODING TO %s;\n" % self.encoding) tmpfile.write("SET STANDARD_CONFORMING_STRINGS TO ON;\n") tmpfile.write("BEGIN;\n") # open zip file with zipfile.ZipFile(self.source) as zipf: # map of text file => (mandatory, zip_path) gFiles = {} for f, mandatory in GTFSImporter.GTFSFILES: gFiles[f] = (mandatory, '') for zfile in zipf.namelist(): bn = os.path.basename( zfile ) for f, m in GTFSImporter.GTFSFILES: if f + '.txt' == bn: mandatory, p = gFiles[f] gFiles[f] = ( mandatory, zfile ) for f, v in gFiles.iteritems(): mandatory, f = v if mandatory and f == '': raise ValueError, "Missing file in GTFS archive : %s" % f for f, v in gFiles.iteritems(): mandatory, zpath = v if zpath == '': continue out.write( "== Loading %s\n" % zpath ) # get rid of Unicode BOM (U+FEFF) def csv_cleaner( f ): for line in f: yield line.replace('\xef\xbb\xbf', '') reader = csv.reader(csv_cleaner(zipf.open( zpath )), delimiter = ',', quotechar = '"') # Write SQL for each beginning of table tmpfile.write("-- Inserting values for table %s\n\n" % f) # first row is field names fieldnames = reader.next() if self.copymode: tmpfile.write('COPY "%s"."%s" (%s) FROM stdin;\n' % (IMPORTSCHEMA, f, ",".join(fieldnames))) # read the rows values # deduce value type by testing for row in reader: insert_row = [] for value in row: if value == '': if self.copymode: insert_row.append('\N') else: insert_row.append('NULL') elif not self.copymode and not is_numeric(value): insert_row.append("'%s'" % value.replace("'", "''")) else: insert_row.append(value) # write SQL statement if self.copymode: tmpfile.write("%s\n" % '\t'.join(insert_row)) else: tmpfile.write("INSERT INTO %s.%s (%s) 
VALUES (%s);\n" %\ (IMPORTSCHEMA, f, ",".join(fieldnames), ','.join(insert_row))) # Write SQL at end of processed table if self.copymode: tmpfile.write("\.\n") tmpfile.write("\n-- Processed table %s.\n\n" % f) tmpfile.write("COMMIT;\n") tmpfile.write("-- Processed all data \n\n") tmpfile.close() return sqlfile
# max iterations to get lasso to converge
MAX_ITERATIONS = 10**6

# load the data
train_data, test_data = get_train_test_data()

# add in dummy variable columns for the zipcodes
train_data_dummies = pd.get_dummies(train_data['zipcode'])
train_data = pd.concat([train_data, train_data_dummies], axis=1)
test_data_dummies = pd.get_dummies(test_data['zipcode'])
test_data = pd.concat([test_data, test_data_dummies], axis=1)

# Numeric columns that must not be used as model inputs: the target
# ("price"), identifiers, and raw location columns ("zipcode" is already
# one-hot encoded above).  Add "yr_built" here to drop it as well.
EXCLUDED_FEATURES = {"price", "id", "zipcode", "long", "lat"}

# keep every numeric feature that is not explicitly excluded
# (single pass instead of building the list and removing one by one)
numeric_features = [
    feature for feature in train_data.columns
    if tools.is_numeric(train_data[feature]) and feature not in EXCLUDED_FEATURES
]
features = numeric_features

"""
find the best l2 penalty
done by finding r squared values for different l2 penalties
then select the l2 penalty that gives the highest r squared value
"""
def __normalizeNumeric(self, value):
    """Return *value* unchanged when numeric; map anything else to 0."""
    # Unexpected text additions are coerced to a neutral zero.
    return value if tools.is_numeric(value) else 0
def generate_sql(self): """Generate a SQL file from GTFS feed.""" if self.logfile: out = open(self.logfile, "a") else: out = sys.stdout sqlfile = "" if zipfile.is_zipfile(self.source): # create temp file for SQL output fd, sqlfile = tempfile.mkstemp() tmpfile = os.fdopen(fd, "w") # begin a transaction in SQL file tmpfile.write("SET CLIENT_ENCODING TO %s;\n" % self.encoding) tmpfile.write("SET STANDARD_CONFORMING_STRINGS TO ON;\n") tmpfile.write("BEGIN;\n") # open zip file with zipfile.ZipFile(self.source) as zipf: # map of text file => (mandatory, zip_path) gFiles = {} for f, mandatory in GTFSImporter.GTFSFILES: gFiles[f] = (mandatory, '') for zfile in zipf.namelist(): bn = os.path.basename(zfile) for f, m in GTFSImporter.GTFSFILES: if f + '.txt' == bn: mandatory, p = gFiles[f] gFiles[f] = (mandatory, zfile) for f, v in gFiles.iteritems(): mandatory, f = v if mandatory and f == '': raise ValueError, "Missing file in GTFS archive : %s" % f for f, v in gFiles.iteritems(): mandatory, zpath = v if zpath == '': continue out.write("== Loading %s\n" % zpath) # get rid of Unicode BOM (U+FEFF) def csv_cleaner(f): for line in f: yield line.replace('\xef\xbb\xbf', '') reader = csv.reader(csv_cleaner(zipf.open(zpath)), delimiter=',', quotechar='"') # Write SQL for each beginning of table tmpfile.write("-- Inserting values for table %s\n\n" % f) # first row is field names fieldnames = reader.next() if self.copymode: tmpfile.write('COPY "%s"."%s" (%s) FROM stdin;\n' % (IMPORTSCHEMA, f, ",".join(fieldnames))) # read the rows values # deduce value type by testing for row in reader: insert_row = [] for value in row: if value == '': if self.copymode: insert_row.append('\N') else: insert_row.append('NULL') elif not self.copymode and not is_numeric(value): insert_row.append("'%s'" % value.replace("'", "''")) else: insert_row.append(value) # write SQL statement if self.copymode: tmpfile.write("%s\n" % '\t'.join(insert_row)) else: tmpfile.write("INSERT INTO %s.%s (%s) VALUES (%s);\n" %\ 
(IMPORTSCHEMA, f, ",".join(fieldnames), ','.join(insert_row))) # Write SQL at end of processed table if self.copymode: tmpfile.write("\.\n") tmpfile.write("\n-- Processed table %s.\n\n" % f) tmpfile.write("COMMIT;\n") tmpfile.write("-- Processed all data \n\n") tmpfile.close() return sqlfile