def metadataInsertion(): DBconn = DBConnection() counter = 1 print_every = 1000 for l in parse(R"F:\Dataset\Amazon\Electonics\meta_Electronics.json.gz"): if counter % print_every == 0: print('[{}] rows data has done.'.format(counter)) counter += 1 data = (json.loads(l)) # 處理名字 (空) hasObject = lambda obj: data[obj] if obj in data else None hasFloat = lambda obj: data[obj] if obj in data else 'NULL' hasStr = lambda text: ("\"{}\"").format( RegularExpression('[a-zA-Z0-9,.]+', text) ) if text is not None else 'NULL' asin: str = data['asin'] imUrl: str = hasObject('imUrl') brand: str = hasObject('brand') description: str = hasStr(hasObject('description')) title: str = hasStr(hasObject('title')) price: float = hasFloat('price') related: str = hasObject('related') categories: str = hasObject('categories') # print('title:\t{}\nbrand:\t{}\nprice:\t{}\nrelated:\t{}\ncategories:\t{}\n============================'. # format(title,brand,price,related,categories)) insertSql = ( "INSERT INTO `metadata` ( \n" + "`asin`, \n" + "`title`, \n" + "`description`, \n" + "`price`, \n" + "`imUrl`, \n" + "`related`, \n" + "`categories`) \n" + "VALUES (\"{}\",{},{},{},\"{}\",\"{}\",\"{}\");".format( asin, title, description, price, imUrl, related, categories)) print(insertSql) DBconn.Insertion(insertSql) DBconn.connection.close()
def ReviewInsertion(): DBconn = DBConnection() counter = 1 print_every = 1000 for l in parse( R"F:\Dataset\Amazon\Electonics\reviews_Electronics_5.json.gz"): if counter % print_every == 0: print('[{}] rows data has done.'.format(counter)) counter += 1 data = (json.loads(l)) helpfulRating = lambda m, n: n if n == 0 else (m / n) # 處理名字 (空) hasObject = lambda obj: data[obj] if obj in data else None hasStr = lambda text: ("\"{}\"").format( RegularExpression('[a-zA-Z0-9.]+', text) ) if text is not None else 'NULL' reviewerID: str = data['reviewerID'] asin: str = data['asin'] reviewerName: str = hasStr(hasObject('reviewerName')) helpful: float = helpfulRating(data['helpful'][0], data['helpful'][1]) reviewText: str = RegularExpression('[a-zA-Z0-9]+', data['reviewText']) overall: float = data['overall'] summary: str = RegularExpression('[a-zA-Z0-9]+', data['summary']) unixReviewTime: str = data['unixReviewTime'] reviewTime: str = data['reviewTime'] insertSql = ( "INSERT INTO `review` ( \n" + "`reviewerID`, \n" + "`asin`, \n" + "`reviewerName`, \n" + "`helpful`, \n" + "`reviewText`, \n" + "`overall`, \n" + "`summary`, \n" + "`unixReviewTime`, \n" + "`reviewTime` ) \n" + "VALUES \n" + "( \"{}\",\"{}\",{},{},\"{}\",{},\"{}\",\"{}\",\"{}\");".format( reviewerID, asin, reviewerName, helpful, reviewText, overall, summary, unixReviewTime, reviewTime)) DBconn.Insertion(insertSql) DBconn.connection.close()
reviewerid[index:RANGE], productid_list, embedding)) processes.append(p) p.start() for process in processes: process.join() print('Finish.') #%% if __name__ == "__main__": print('sql select start.') conn = DBConnection() productid = SearchProduct(conn) reviewerid = SearchReviewer(conn) reviewerdata = SearchReviewerData(conn) conn.close() print('sql select finish.') productid_list = [row['asin'] for row in productid] # # Initialize embeddings # hidden_size = 300 # embedding = nn.Embedding(len(productid_list), hidden_size) main(reviewerdata, reviewerid, productid_list)
def loadData(havingCount=15, LIMIT=5000, testing=False, table=''): print('Loading asin/reviewerID from cav file...') asin, reviewerID = Read_Asin_Reviewer(table) print('Loading asin/reviewerID complete.') # asin/reviewerID to index itemObj = item() itemObj.addItem(asin) userObj = user() userObj.addUser(reviewerID) st = time.time() print('Loading dataset from database...') if (not testing): # load training user data ## this sql select base on metadata # sql = ( # 'WITH tenReviewsUp AS ( ' + # ' SELECT reviewerID ' + # ' FROM {}review '.format(table) + # ' group by reviewerID ' + # ' HAVING COUNT(reviewerID) >= {} '.format(havingCount) + # ' limit {} '.format(LIMIT) + # ' ) ' + # 'SELECT ' + # 'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, ' + # '{}review.`ID`, {}review.reviewerID , {}review.`asin`, {}review.overall, {}review.reviewText, {}review.unixReviewTime '.format(table, table, table, table, table, table) + # 'FROM {}review , {}metadata '.format(table, table) + # 'WHERE reviewerID IN (SELECT * FROM tenReviewsUp) ' + # 'AND {}review.`asin` = {}metadata.`asin` '.format(table, table) + # 'ORDER BY reviewerID,unixReviewTime ASC ;' # ) sql = ( 'WITH tenReviewsUp AS ( ' + ' SELECT reviewerID ' + ' FROM {}review '.format(table) + ' group by reviewerID ' + ' HAVING COUNT(reviewerID) >= {} '.format(havingCount) + ' limit {} '.format(LIMIT) + ' ) ' + 'SELECT ' + 'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, ' + '{}review.`ID`, {}review.reviewerID , {}review.`asin`, {}review.overall, {}review.reviewText, {}review.unixReviewTime ' .format(table, table, table, table, table, table) + 'FROM {}review '.format(table) + 'WHERE reviewerID IN (SELECT * FROM tenReviewsUp) ' + 'ORDER BY reviewerID,unixReviewTime ASC ;') else: sql = ( 'WITH tenReviewsUp AS ( ' + ' SELECT reviewerID ' + ' FROM {}review '.format(table) + 'WHERE reviewerID NOT IN ' + '( ' + ' SELECT reviewerID ' + ' FROM {}training_2000 '.format(table) + ') ' + ' group by reviewerID ' + ' HAVING COUNT(reviewerID) >= {} '.format(havingCount) + ' limit {} '.format(LIMIT) + ' ) ' + 'SELECT ' + 'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, ' + '{}review.`ID`, {}review.reviewerID , {}review.`asin`, {}review.overall, {}review.reviewText, {}review.unixReviewTime ' .format(table, table, table, table, table, table) + 'FROM {}review , {}metadata '.format(table, table) + 'WHERE reviewerID IN (SELECT * FROM tenReviewsUp) ' + 'AND {}review.`asin` = {}metadata.`asin` '.format(table, table) + 'ORDER BY reviewerID,unixReviewTime ASC ;') print(sql) conn = DBConnection() res = conn.selection(sql) conn.close() print('Loading complete. [{}]'.format(time.time() - st)) return res, itemObj, userObj
from DBconnector import DBConnection from nltk.corpus import stopwords import nltk import tqdm if __name__ == "__main__": sql = "select `ID`, reviewText FROM toys_interaction6;" conn = DBConnection() res = conn.selection(sql) for row in tqdm.tqdm(res): uid = row['ID'] reviewText = row['reviewText'] filtered_words = [ word for word in reviewText.split(' ') if word not in stopwords.words('english') ] filtered_sentences = " ".join(filtered_words) # print(reviewText) # print(filtered_sentences) insertsql = ("INSERT INTO toys_interaction6_rm_sw " + "(`ID`, reviewText) VALUES ({},'{}');".format( uid, filtered_sentences)) conn.Insertion(insertsql) stop = 1 conn.close()
def loadData(self, havingCount = 15, LIMIT=5000, testing=False, table='', withOutTable='', through_table=False, sqlfile=''): print('Loading asin/reviewerID from cav file...') asin, reviewerID = self.Read_Asin_Reviewer(table) print('Loading asin/reviewerID complete.') # asin/reviewerID to index itemObj = item() itemObj.addItem(asin) userObj = user() userObj.addUser(reviewerID) st = time.time() print('Loading dataset from database...') if(not testing): # load training user data sql = ( 'WITH tenReviewsUp AS ( ' + ' SELECT reviewerID ' + ' FROM {}review '.format(table) + ' group by reviewerID ' + ' HAVING COUNT(reviewerID) >= {} '.format(havingCount) + ' limit {} '.format(LIMIT) + ' ) ' + 'SELECT ' + 'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, ' + '{}review.`ID`, {}review.reviewerID , {}review.`asin`, {}review.overall, {}review.reviewText, {}review.unixReviewTime '.format(table, table, table, table, table, table) + 'FROM {}review '.format(table) + 'WHERE reviewerID IN (SELECT * FROM tenReviewsUp) ' + 'ORDER BY reviewerID,unixReviewTime ASC ;' ) else: if(through_table): sql = ( 'SELECT clothing_realtime8_interaction8.*, clothing_review.`asin`, '+ 'clothing_review.overall, clothing_review.reviewText, clothing_review.unixReviewTime '+ 'FROM clothing_realtime8_interaction8, clothing_review '+ 'WHERE clothing_realtime8_interaction8.`ID`=clothing_review.`ID` '+ 'AND `No` <=22000 '+ 'AND `No` >20000 '+ 'ORDER BY `No` ;' ) else: sql = ( 'WITH tenReviewsUp AS ( ' + ' SELECT reviewerID ' + ' FROM {}review '.format(table) + 'WHERE reviewerID NOT IN ' + '( ' + ' SELECT reviewerID ' + ' FROM {} '.format(withOutTable) + ') ' + ' group by reviewerID ' + ' HAVING COUNT(reviewerID) >= {} '.format(havingCount) + ' limit {} '.format(LIMIT) + ' ) ' + 'SELECT ' + 'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, ' + '{}review.`ID`, {}review.reviewerID , {}review.`asin`, {}review.overall, {}review.reviewText, {}review.unixReviewTime '.format(table, table, table, table, table, table) + 'FROM {}review , {}metadata '.format(table, table) + 'WHERE reviewerID IN (SELECT * FROM tenReviewsUp) ' + 'AND {}review.`asin` = {}metadata.`asin` '.format(table, table) + 'ORDER BY reviewerID,unixReviewTime ASC ;' ) if(sqlfile!='' and not testing): with open(sqlfile) as file_: sql = file_.read().split(';')[0] else: with open(sqlfile) as file_: sql = file_.read().split(';')[1] print(sql) conn = DBConnection() res = conn.selection(sql) conn.close() print('Loading complete. [{}]'.format(time.time()-st)) return res, itemObj, userObj
def loadData(self, havingCount=15, LIMIT=5000, testing=False, table='', withOutTable='', through_table=False): print('Loading asin/reviewerID from cav file...') asin, reviewerID = self.Read_Asin_Reviewer(table) print('Loading asin/reviewerID complete.') # asin/reviewerID to index itemObj = item() itemObj.addItem(asin) userObj = user() userObj.addUser(reviewerID) st = time.time() print('Loading dataset from database...') if (not testing): # load training user data if (through_table): sql = ( 'SELECT clothing_realtime8_interaction8.*, clothing_review.`asin`, ' + 'clothing_review.overall, clothing_review.reviewText, clothing_review.unixReviewTime ' + 'FROM clothing_realtime8_interaction8, clothing_review ' + 'WHERE clothing_realtime8_interaction8.`ID`=clothing_review.`ID` ' + 'AND `No` <=10000 ' + 'ORDER BY `No` ;') else: sql = ( 'SELECT ' + 'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, ' + 'clothing_review.`ID`, clothing_review.reviewerID , clothing_review.`asin`, ' + 'clothing_review.overall, clothing_review.reviewText, clothing_review.unixReviewTime ' + 'FROM clothing_review, clothing_realtime_at6 ' + 'WHERE clothing_review.reviewerID = clothing_realtime_at6.reviewerID ' + 'AND clothing_review.unixReviewTime = clothing_realtime_at6.unixReviewTime ' + 'AND clothing_review.reviewerID IN (SELECT reviewerID FROM clothing_realtime_at6_training_1000) ' + 'ORDER BY reviewerID,unixReviewTime ASC ;') else: if (through_table): sql = ( 'SELECT clothing_realtime8_interaction8.*, clothing_review.`asin`, ' + 'clothing_review.overall, clothing_review.reviewText, clothing_review.unixReviewTime ' + 'FROM clothing_realtime8_interaction8, clothing_review ' + 'WHERE clothing_realtime8_interaction8.`ID`=clothing_review.`ID` ' + 'AND `No` <=22000 ' + 'AND `No` >20000 ' + 'ORDER BY `No` ;') else: sql = ( 'SELECT ' + 'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, ' + 'clothing_review.`ID`, clothing_review.reviewerID , clothing_review.`asin`, ' + 'clothing_review.overall, clothing_review.reviewText, clothing_review.unixReviewTime ' + 'FROM clothing_review, clothing_realtime_at6 ' + 'WHERE clothing_review.reviewerID = clothing_realtime_at6.reviewerID ' + 'AND clothing_review.unixReviewTime = clothing_realtime_at6.unixReviewTime ' + 'AND clothing_review.reviewerID IN (SELECT reviewerID FROM clothing_realtime_at6_testing_200) ' + 'ORDER BY reviewerID,unixReviewTime ASC ;') print(sql) conn = DBConnection() res = conn.selection(sql) conn.close() print('Loading complete. [{}]'.format(time.time() - st)) return res, itemObj, userObj