Example #1
0
 def __repr__(self):
     rep = ''
     if self.is_continous:
         rep = str(self.value_list)
     elif len(self.value_list) == 1:
         rep = str(self.value_list[0])
     else:
         rep = "["
         for idx, val in enumerate(self.value_list):
             aux = ', ' if idx < len(self.value_list) - 1 else ''
             if tools.is_numeric(val):
                 rep += str(round(float(val), 2)) + aux
             else:
                 rep += str(val) + aux
         rep += "]"
     return rep
Example #2
0
    def generate_sql(self):
        """Generate a SQL file from GTFS feed."""

        if self.logfile:
            out = open(self.logfile, "a")
        else:
            out = sys.stdout

        sqlfile = ""
        if zipfile.is_zipfile(self.source):
            # create temp file for SQL output
            fd, sqlfile = tempfile.mkstemp()
            tmpfile = os.fdopen(fd, "w")
            # begin a transaction in SQL file
            tmpfile.write("SET CLIENT_ENCODING TO %s;\n" % self.encoding)
            tmpfile.write("SET STANDARD_CONFORMING_STRINGS TO ON;\n")
            tmpfile.write("BEGIN;\n")


            # open zip file
            with zipfile.ZipFile(self.source) as zipf:

                # map of text file => (mandatory, zip_path)
                gFiles = {}
                for f, mandatory in GTFSImporter.GTFSFILES:
                    gFiles[f] = (mandatory, '')

                for zfile in zipf.namelist():
                    bn = os.path.basename( zfile )
                    for f, m in GTFSImporter.GTFSFILES:
                        if f + '.txt' == bn:
                            mandatory, p = gFiles[f]
                            gFiles[f] = ( mandatory, zfile )

                for f, v in gFiles.iteritems():
                    mandatory, f = v
                    if mandatory and f == '':
                        raise ValueError, "Missing file in GTFS archive : %s" % f

                for f, v in gFiles.iteritems():
                    mandatory, zpath = v
                    if zpath == '':
                        continue

                    out.write( "== Loading %s\n" % zpath )

                    # get rid of Unicode BOM (U+FEFF)
                    def csv_cleaner( f ):
                        for line in f:
                            yield line.replace('\xef\xbb\xbf', '')

                    reader = csv.reader(csv_cleaner(zipf.open( zpath )),
                                        delimiter = ',',
                                        quotechar = '"')

                    # Write SQL for each beginning of table
                    tmpfile.write("-- Inserting values for table %s\n\n" % f)
                    # first row is field names
                    fieldnames = reader.next()
                    if self.copymode:
                        tmpfile.write('COPY "%s"."%s" (%s) FROM stdin;\n' % (IMPORTSCHEMA, f, ",".join(fieldnames)))
                    # read the rows values
                    # deduce value type by testing
                    for row in reader:
                        insert_row = []
                        for value in row:
                            if value == '':
                                if self.copymode:
                                    insert_row.append('\N')
                                else:
                                    insert_row.append('NULL')
                            elif not self.copymode and not is_numeric(value):
                                insert_row.append("'%s'" % value.replace("'", "''"))
                            else:
                                insert_row.append(value)
                        # write SQL statement
                        if self.copymode:
                            tmpfile.write("%s\n" % '\t'.join(insert_row))
                        else:
                            tmpfile.write("INSERT INTO %s.%s (%s) VALUES (%s);\n" %\
                                    (IMPORTSCHEMA, f, ",".join(fieldnames), ','.join(insert_row)))
                    # Write SQL at end of processed table
                    if self.copymode:
                        tmpfile.write("\.\n")
                    tmpfile.write("\n-- Processed table %s.\n\n" % f)

            tmpfile.write("COMMIT;\n")
            tmpfile.write("-- Processed all data \n\n")
            tmpfile.close()
        return sqlfile
Example #3
0
# max iterations to get lasso to converge
MAX_ITERATIONS = 10**6

# load the train/test split
train_data, test_data = get_train_test_data()
# one-hot encode the zipcode column for both splits
train_data = pd.concat([train_data, pd.get_dummies(train_data['zipcode'])], axis=1)
test_data = pd.concat([test_data, pd.get_dummies(test_data['zipcode'])], axis=1)

# collect the numeric feature columns
numeric_features = [col for col in train_data.columns
                    if tools.is_numeric(train_data[col])]
# drop the target and the identifier/location columns
for irrelevant in ("price", "id", "zipcode", "long", "lat"):
    numeric_features.remove(irrelevant)
#numeric_features.remove("yr_built")

features = numeric_features
"""
find the best l2 penalty
done by finding r squared values for different l2 penalties
then select the l2 penalty that gives the highest r squared value
"""
Example #4
0
 def __normalizeNumeric(self, value):
     """Return *value* if it is numeric, otherwise 0."""
     # Unexpected text input is coerced to zero rather than propagated.
     return value if tools.is_numeric(value) else 0
Example #5
0
    def generate_sql(self):
        """Generate a SQL file from GTFS feed."""

        if self.logfile:
            out = open(self.logfile, "a")
        else:
            out = sys.stdout

        sqlfile = ""
        if zipfile.is_zipfile(self.source):
            # create temp file for SQL output
            fd, sqlfile = tempfile.mkstemp()
            tmpfile = os.fdopen(fd, "w")
            # begin a transaction in SQL file
            tmpfile.write("SET CLIENT_ENCODING TO %s;\n" % self.encoding)
            tmpfile.write("SET STANDARD_CONFORMING_STRINGS TO ON;\n")
            tmpfile.write("BEGIN;\n")

            # open zip file
            with zipfile.ZipFile(self.source) as zipf:

                # map of text file => (mandatory, zip_path)
                gFiles = {}
                for f, mandatory in GTFSImporter.GTFSFILES:
                    gFiles[f] = (mandatory, '')

                for zfile in zipf.namelist():
                    bn = os.path.basename(zfile)
                    for f, m in GTFSImporter.GTFSFILES:
                        if f + '.txt' == bn:
                            mandatory, p = gFiles[f]
                            gFiles[f] = (mandatory, zfile)

                for f, v in gFiles.iteritems():
                    mandatory, f = v
                    if mandatory and f == '':
                        raise ValueError, "Missing file in GTFS archive : %s" % f

                for f, v in gFiles.iteritems():
                    mandatory, zpath = v
                    if zpath == '':
                        continue

                    out.write("== Loading %s\n" % zpath)

                    # get rid of Unicode BOM (U+FEFF)
                    def csv_cleaner(f):
                        for line in f:
                            yield line.replace('\xef\xbb\xbf', '')

                    reader = csv.reader(csv_cleaner(zipf.open(zpath)),
                                        delimiter=',',
                                        quotechar='"')

                    # Write SQL for each beginning of table
                    tmpfile.write("-- Inserting values for table %s\n\n" % f)
                    # first row is field names
                    fieldnames = reader.next()
                    if self.copymode:
                        tmpfile.write('COPY "%s"."%s" (%s) FROM stdin;\n' %
                                      (IMPORTSCHEMA, f, ",".join(fieldnames)))
                    # read the rows values
                    # deduce value type by testing
                    for row in reader:
                        insert_row = []
                        for value in row:
                            if value == '':
                                if self.copymode:
                                    insert_row.append('\N')
                                else:
                                    insert_row.append('NULL')
                            elif not self.copymode and not is_numeric(value):
                                insert_row.append("'%s'" %
                                                  value.replace("'", "''"))
                            else:
                                insert_row.append(value)
                        # write SQL statement
                        if self.copymode:
                            tmpfile.write("%s\n" % '\t'.join(insert_row))
                        else:
                            tmpfile.write("INSERT INTO %s.%s (%s) VALUES (%s);\n" %\
                                    (IMPORTSCHEMA, f, ",".join(fieldnames), ','.join(insert_row)))
                    # Write SQL at end of processed table
                    if self.copymode:
                        tmpfile.write("\.\n")
                    tmpfile.write("\n-- Processed table %s.\n\n" % f)

            tmpfile.write("COMMIT;\n")
            tmpfile.write("-- Processed all data \n\n")
            tmpfile.close()
        return sqlfile