def clean(value):
    """
    Remove characters that some operating systems reject in file names.

    The value is encoded as ISO-8859-1, passed through asciiDammit, and then
    every occurrence of * | / : " < > ? is deleted from the result.

    Note: the backslash in the pattern only escapes the forward slash, so a
    literal backslash in the value is left untouched.
    """
    latin1_value = value.encode('iso-8859-1')
    ascii_value = asciiDammit(latin1_value)
    forbidden_chars = r'[*|\/:"<>?]'
    return re.sub(forbidden_chars, '', ascii_value)
def reportHelper(localAppDatadir, run_id, app_name):
    """Render the HTML report page for one run of an application.

    Lists the files directly inside ``<localAppDatadir>/<run_id>``, embeds
    every ``.png`` file as a base64 ``<img>`` tag, and reads the tables of
    every file whose name ends in ``waypoint.sqlite`` so that the template
    can display them.

    Args:
        localAppDatadir: Directory that contains one sub-folder per run.
        run_id: Name of the run sub-folder to report on.
        app_name: Application name. Used to look up an optional
            ``tableOrder`` list in ``configDict['applications']`` and passed
            through to the template.

    Returns:
        Whatever ``render_template('report.html', ...)`` returns.

    Raises:
        StopIteration: If the run folder does not exist, because
            ``os.walk`` then yields nothing.
    """
    # Imported locally so this block does not depend on edits to the
    # module-level import list.
    import base64

    # Only the top level of the run folder is inspected.
    dirname, dirnames, filenames = next(os.walk(localAppDatadir+'/'+run_id))
    filepaths = ["file://localhost/"+dirname+"/"+z for z in filenames]

    # Inline every png as a data URI so the report is self-contained.
    imagetags = []
    for filename in filenames:
        if not str(filename).endswith('.png'):
            continue
        with open(dirname+"/"+filename, 'rb') as image_file:
            raw_bytes = image_file.read()
        # b64encode emits no line breaks; decode so the tag is built from
        # text rather than a bytes repr on Python 3.
        data_uri = base64.b64encode(raw_bytes).decode('ascii')
        imagetags.append(
            '<img src="data:image/png;base64,{0}">'.format(data_uri))

    # Identify waypoint databases in the folder.
    databases = [dirname+'/'+name for name in filenames
                 if str(name).endswith('waypoint.sqlite')]
    dbTables = collections.OrderedDict()
    colnames = {}
    # Must exist even when there is no database, otherwise the
    # render_template call below fails with an unbound local.
    tblTags = []

    for db in databases:
        conn = sqlite3.connect(db)
        try:
            c = conn.cursor()
            c.execute("SELECT name FROM sqlite_master WHERE type='table';")
            tblNms = sorted(found[0] for found in c.fetchall())

            # Put the tables named in the app's tableOrder first, in that
            # order; the remaining tables follow alphabetically. An app
            # without a config entry simply keeps the alphabetical order.
            app_config = next(
                (d for d in configDict['applications']
                 if d['appName'] == app_name),
                None)
            if app_config and 'tableOrder' in app_config:
                tn_in_db = []
                for tn in app_config['tableOrder']:
                    if tn in tblNms:
                        tn_in_db.append(tn)
                        tblNms.remove(tn)
                tblNms = tn_in_db + tblNms

            # Overwritten per database: the template receives the tags of
            # the last database processed.
            tblTags = ["#%s" % tblNm for tblNm in tblNms]

            # Retrieve the row data of each table for display.
            for tblNm in tblNms:
                # Identifiers cannot be bound as parameters, so quote the
                # name; this also copes with spaces and keywords.
                quoted = '"%s"' % tblNm.replace('"', '""')
                rowcount = c.execute(
                    "SELECT count(*) row_count FROM %s" % quoted
                ).fetchone()[0]
                # Tables with 500 rows or more are not loaded for display.
                if rowcount < 500:
                    rows = c.execute('select * from %s' % quoted)
                    # Force ascii conversion of the column names for display.
                    colnames[tblNm] = [asciiDammit(description[0])
                                       for description in c.description]
                    dbTables[tblNm] = [
                        [wpu.renderHtmlTableCell(cell) for cell in row]
                        for row in rows]
        finally:
            # Release the database file even if a query fails.
            conn.close()

    return render_template('report.html',
                           dbpaths=databases,
                           run_id=run_id,
                           tableNames=tblTags,
                           filenames=filenames,
                           filepaths=filepaths,
                           imagetags=imagetags,
                           dbTables=dbTables,
                           colnames=colnames,
                           app_name=app_name)
for row in contribution_reader : try: c.execute('INSERT INTO raw_table VALUES ' '(?, ?, ?, ?, ?, ?, ?, ?, ' ' ?, ?, ?, ?, ?, ?, ?, ' ' ?, ?, ?, ?, ?, ?, ?, ' ' ?, ?, ?, ?, ?, ?, ?)', row[0:29]) except sqlite3.ProgrammingError: try: c.execute('INSERT INTO raw_table VALUES ' '(?, ?, ?, ?, ?, ?, ?, ?, ' ' ?, ?, ?, ?, ?, ?, ?, ' ' ?, ?, ?, ?, ?, ?, ?, ' ' ?, ?, ?, ?, ?, ?, ?)', [asciiDammit(field) for field in row[0:29]]) except: print "failed to import row" print row raise conn.commit() print 'creating donors table...' c.execute("CREATE TABLE donors " "(donor_id INTEGER PRIMARY KEY, first_name TEXT, " " last_name TEXT, address_1 TEXT, address_2 TEXT, " " city TEXT, state TEXT, zip TEXT)") c.execute("INSERT INTO donors " "(first_name, last_name, address_1,"
def get_text(jd):
    """Return the ASCII-converted 'text' entry of jd, or "" if it is missing or empty."""
    text = jd.get('text', "")
    # Falsy values (missing key, empty string, None) skip the conversion.
    if not text:
        return ""
    return asciiDammit(text)
def preProcess(column):
    """
    Normalise a field value for comparison.

    Converts the value with asciiDammit, collapses runs of spaces into one
    space, turns newlines into spaces, trims surrounding whitespace and then
    surrounding double and single quotes, and lower-cases the result.
    """
    text = asciiDammit(column)
    # NOTE(review): spaces are collapsed before newlines become spaces, so a
    # newline next to a space still leaves several spaces - confirm intended.
    for pattern in (' +', '\n'):
        text = re.sub(pattern, ' ', text)
    text = text.strip()
    text = text.strip('"')
    text = text.strip("'")
    return text.lower()
violations text, x_coord real, y_coord real, lat real, long real, location text)''') c.execute('''CREATE TABLE inspections_clean (inspection_id int, dba text, aka text, license_no int, facility_type text, risk text, address text, zip text, inspection_date date, inspection_type text, results int, violations text, lat real, long real)''') conn.commit() # Insert data into raw table with open('../data/food_inspections.csv', 'rb') as f : reader = csv.reader(f) reader.next() for row in reader: values = [asciiDammit(field) for field in row] c.execute('''INSERT INTO inspections_raw VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', values) conn.commit() fields = ('inspection_id', 'dba', 'aka', 'license_no', 'facility_type', 'risk', 'address', 'zip', 'inspection_date', 'inspection_type', 'results', 'violations', 'lat', 'long')
from Corpus import Corpus
from Document import Document
import io
import os
from AsciiDammit import asciiDammit
import numpy as np
from scipy.sparse import csc_matrix, lil_matrix
from scipy.spatial import distance
from sklearn.preprocessing import normalize

# 1-grams through n-grams will be compiled
n_gram_length = 4

# Build the reference corpus from the combined text file.
corpus_title = "inaugural/all.txt"
with open(corpus_title) as corpus_file:
    corp_text = asciiDammit(corpus_file.read())
corp = Corpus(corp_text, corpus_title, n_gram_length)

# Just a mapping of the vocabulary to the natural numbers
vocab_map = {key: index for index, key in enumerate(corp.n_gram_count)}

# List of inaugural speech document objects
speeches = []
folder = "inaugural"
# os.listdir returns entries in arbitrary order, so hidden files such as
# .DS_Store are skipped by name instead of assuming they come first. Sorting
# keeps the order of `speeches` reproducible between runs.
# NOTE(review): all.txt is in this folder too, so it is also loaded as a
# speech - confirm that is intended.
for file_name in sorted(os.listdir(folder)):
    if file_name.startswith("."):
        continue
    doc_name = folder + "/" + file_name
    with open(doc_name, "r") as doc_file:
        doc_text = asciiDammit(doc_file.read())
    speeches.append(Document(doc_text, file_name, corp, n_gram_length))
violations text, x_coord real, y_coord real, lat real, long real, location text)''') c.execute('''CREATE TABLE inspections_clean (inspection_id int, dba text, aka text, license_no int, facility_type text, risk text, address text, zip text, inspection_date date, inspection_type text, results int, violations text, lat real, long real)''') conn.commit() # Insert data into raw table with open('../data/food_inspections.csv', 'rb') as f: reader = csv.reader(f) reader.next() for row in reader: values = [asciiDammit(field) for field in row] c.execute( '''INSERT INTO inspections_raw VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', values) conn.commit() fields = ('inspection_id', 'dba', 'aka', 'license_no', 'facility_type', 'risk', 'address', 'zip', 'inspection_date', 'inspection_type', 'results', 'violations', 'lat', 'long') # Load data to clean it