def insert(db, table, data=None, quiet=False):
    """Insert ``data`` into ``table`` and return the most recently created rows.

    Args:
        db: Connection-like object; only ``db.cursor()`` is used.
        table: Table name, interpolated verbatim into the SQL text.
        data: List of dicts mapping column name to value. Keys are lowercased
            and values are passed through ``stringify_if_unicode``. ``None``
            is treated as one empty record.
        quiet: When true, do not echo the generated SQL.

    Returns:
        ``lodify`` applied to the cursor of a
        ``SELECT * ... ORDER BY created_at DESC LIMIT len(data)`` query.
    """
    # `data` used to be iterated before it was checked for None, so the
    # default argument always raised TypeError.  Treat None as one blank row.
    if data is None:
        data = [{}]

    # Make `data` keys lowercase
    data = [
        dict((key.lower(), stringify_if_unicode(row[key])) for key in row)
        for row in data
    ]

    if not any(any(record.values()) for record in data):
        # Nothing but empty / falsy values: insert a row of column defaults.
        # NOTE(review): an empty list also lands here, which inserts one blank
        # row but then selects with LIMIT 0 -- confirm that is intended.
        query = "INSERT INTO {} VALUES ();".format(table)
    else:
        # Only columns present both in the data and in the table are written.
        data_columns = set(key for row in data for key in row)
        columns = list(data_columns & get_columns(table))

        values = []
        for row in data:
            valrow = [
                row[key] if row.get(key) is not None else 'NULL'
                for key in columns
            ]
            values.append(str(tuple(valrow)))

        if len(data) > 1:
            # One tuple per line keeps multi-row statements readable.
            template = 'INSERT INTO {} ( `{}` ) VALUES \n\t{};'
            joiner = ',\n\t'
        else:
            template = 'INSERT INTO {} ( `{}` ) VALUES {};'
            joiner = ', '

        query = template.format(table, '`, `'.join(columns), joiner.join(values))
        # NOTE(review): the SQL is assembled from Python reprs rather than
        # driver parameters, so it is not injection-safe.  The two textual
        # replacements below also rewrite a literal 'NULL' string or a ",)"
        # sequence that happens to occur inside a value.
        query = query.replace("'NULL'", "NULL")
        # A 1-tuple reprs as ('x',); drop the trailing comma for valid SQL.
        query = query.replace(",)", ")")

    MainQuery = db.cursor()
    LastIn = db.cursor()
    lastin = 'SELECT * FROM {} ORDER BY created_at DESC LIMIT {};'.format(
        table, len(data))
    if not quiet:
        # Single-argument print() behaves the same as the print statement.
        print(query)
        print(lastin)
    try:
        MainQuery.execute(query)
    finally:
        # The INSERT cursor has no result set to read; release it right away.
        MainQuery.close()
    LastIn.execute(lastin)
    # LastIn is handed to lodify still open, as callers may consume it lazily.
    return lodify(LastIn)
def __init__(self, db, name):
    """Bind the object to table ``name`` on ``db`` and load its column list.

    Args:
        db: Connection-like object; stored as ``self.db``.
        name: Table name; stored as ``self.name`` and interpolated into a
            ``DESCRIBE`` statement.

    Sets ``self.columns`` to ``lodify`` of the ``DESCRIBE`` result.
    """
    self.db = db
    self.name = name
    # Named `cursor` so the module-level get_columns helper is not shadowed.
    cursor = self.db.cursor()
    try:
        query = "DESCRIBE {table};".format(table=self.name)
        # Single-argument print() behaves the same as the print statement.
        print(query)
        cursor.execute(query)
        # Read the rows BEFORE closing: a DB-API cursor is unusable once
        # close() has been called.
        self.columns = lodify(cursor)
    finally:
        cursor.close()
def select_or_insert(db, table, **kwery):
    """Return the row of ``table`` matching ``kwery``, inserting it if absent.

    Args:
        db: Connection-like object; only ``db.cursor()`` is used here.
        table: Table name, interpolated verbatim into the SQL text.
        **kwery: ``column=value`` pairs that must all match. The special key
            ``quiet`` is removed from the pairs and, when true, suppresses
            the SQL echo.

    Returns:
        ``lodify`` of the SELECT cursor when ``cursor.rowcount`` is truthy,
        otherwise the result of ``insert(db, table, [kwery], quiet=quiet)``.
    """
    quiet = kwery.pop('quiet', False)

    # Conditions must be combined with AND; a comma-separated list is not a
    # valid WHERE clause as soon as there is more than one column.
    # NOTE(review): values are rendered by stringify(), not bound as driver
    # parameters -- only safe for trusted input.
    conditions = ' AND '.join(
        '`' + column + '`=' + stringify(value)
        for column, value in kwery.items()
    )
    query = 'SELECT * from {} WHERE {} LIMIT 1;'.format(table, conditions)
    if not quiet:
        # Single-argument print() behaves the same as the print statement.
        print(query)

    cursor = db.cursor()
    cursor.execute(query)
    if cursor.rowcount:
        print('yes')
        # Cursor is handed to lodify still open, as it may be read lazily.
        return lodify(cursor)

    print('no')
    # Nothing to read from the SELECT; release it before inserting.
    cursor.close()
    return insert(db, table, [kwery], quiet=quiet)
def insertlod(self, lod):
    """Insert a List Of Dicts into table and return a List Of Dicts.

    Delegates to ``self.InsertLOD(lod)``, converts the returned cursor with
    ``lodify`` + ``remove_access`` and closes the cursor afterwards.
    """
    cursor = self.InsertLOD(lod)
    try:
        return remove_access(lodify(cursor))
    finally:
        # Close even when the conversion raises, so the cursor never leaks.
        cursor.close()
def sim(self, field, values):
    """Run ``self.SIM(field, values)`` and return its rows.

    The cursor returned by ``SIM`` is converted with ``lodify`` +
    ``remove_access`` and closed; ``self.closeall()`` is then called before
    the converted rows are returned.
    """
    cursor = self.SIM(field, values)
    try:
        result = remove_access(lodify(cursor))
    finally:
        # Close even when the conversion raises, so the cursor never leaks.
        cursor.close()
    # Runs only after a successful read.
    self.closeall()
    return result
def select_or_insert(self, **data):
    """Return the first row produced by ``self.SelectOrInsert`` for ``data``.

    ``data`` is passed as the ``mand`` argument with an empty ``opt``. The
    returned cursor is converted with ``lodify`` + ``remove_access`` and the
    element at index 0 is returned.

    Raises:
        IndexError: if the converted result is empty.
    """
    cursor = self.SelectOrInsert(mand=data, opt={})
    try:
        return remove_access(lodify(cursor))[0]
    finally:
        # Close even when conversion or indexing raises, so nothing leaks.
        cursor.close()
def select(self, **kwery):
    """Run ``self.Select(**kwery)`` and return its rows.

    The cursor returned by ``Select`` is converted with ``lodify`` +
    ``remove_access`` and closed before the converted rows are returned.
    """
    cursor = self.Select(**kwery)
    try:
        return remove_access(lodify(cursor))
    finally:
        # Close even when the conversion raises, so the cursor never leaks.
        cursor.close()
def _run_update(query, echo):
    """Execute one statement on the module-level ``db`` and close its cursor."""
    if echo:
        # Single-argument print() behaves the same as the print statement.
        print(query)
    cursor = db.cursor()
    try:
        cursor.execute(query)
    finally:
        cursor.close()


def mine(url, cid, regex=r'^.*$', wid=None, quiet=False):
    """Fetch ``url``, record an observation for it and register its links.

    Args:
        url: Address handed to ``get_soup`` and ``get_links``.
        cid: Crawl id written to ``Webpage.newCID`` and to the observation.
        regex: Pattern a discovered link must ``re.match`` to be kept.
        wid: ``Webpage.WID`` of ``url``. When ``None`` it is obtained through
            ``Webpage.SelectOrInsert``; otherwise that row's ``newCID`` is
            updated to ``cid``.
        quiet: When true, suppress the echo of generated SQL.

    Returns:
        Whatever ``get_soup(url)`` returned. Nothing else happens when that
        value is falsy.
    """
    soup = get_soup(url)
    if not soup:
        return soup

    # NOTE(review): SQL below is assembled with str.format from cid / wid;
    # safe only while those are trusted integers.
    if wid is None:
        wid_cursor = Webpage.SelectOrInsert(
            mand={'url': url}, opt={'newCID': cid})
        try:
            # Rows are taken as a list of dicts (same pattern as the
            # select_or_insert method), so pick the first row before 'WID'.
            # NOTE(review): assumes lodify yields list-like rows -- confirm.
            wid = remove_access(lodify(wid_cursor))[0]['WID']
        finally:
            wid_cursor.close()
    else:
        query = 'UPDATE Webpage SET newCID = {cid} WHERE WID = {wid};'
        _run_update(query.format(cid=cid, wid=wid), echo=not quiet)

    # Record the mined data
    oid = Observation.insert1(WID=wid, CID=cid)['OID']
    parse_text(soup.text, oid)
    # NOTE(review): breakpoint() is presumably a project helper, not the
    # Python 3.7+ builtin -- confirm.
    breakpoint(cid)
    consolidate_all_webpages()
    breakpoint(cid)

    # Record that this link has been mined
    query = 'UPDATE Webpage SET mined=True WHERE wid IN ({});'.format(str(wid))
    _run_update(query, echo=not quiet)

    # Keep each matching link once, then escape double quotes for SQL use.
    pattern = re.compile(regex)
    links = set(link for link in get_links(soup, url) if pattern.match(link))
    links = [link.replace('"', '%22') for link in links]
    if links:
        pages_cursor = Webpage.SIM('url', links)
        try:
            pages = remove_access(lodify(pages_cursor))
        finally:
            pages_cursor.close()
        # Tag every row currently flagged access=TRUE with this crawl id,
        # then call closeall(); this statement is never echoed.
        query = 'UPDATE Webpage SET newCID={newCID} WHERE access=TRUE;'
        _run_update(query.format(newCID=cid), echo=False)
        Webpage.closeall()
        Link.insertlod(
            [{'fromWID': wid, 'toWID': row['WID']} for row in pages])
    breakpoint(cid)
    return soup