Example #1
0
def metadataInsertion():
    DBconn = DBConnection()
    counter = 1
    print_every = 1000

    for l in parse(R"F:\Dataset\Amazon\Electonics\meta_Electronics.json.gz"):
        if counter % print_every == 0:
            print('[{}] rows data has done.'.format(counter))
        counter += 1

        data = (json.loads(l))

        # 處理名字 (空)
        hasObject = lambda obj: data[obj] if obj in data else None
        hasFloat = lambda obj: data[obj] if obj in data else 'NULL'
        hasStr = lambda text: ("\"{}\"").format(
            RegularExpression('[a-zA-Z0-9,.]+', text)
        ) if text is not None else 'NULL'

        asin: str = data['asin']
        imUrl: str = hasObject('imUrl')
        brand: str = hasObject('brand')
        description: str = hasStr(hasObject('description'))
        title: str = hasStr(hasObject('title'))
        price: float = hasFloat('price')
        related: str = hasObject('related')
        categories: str = hasObject('categories')

        # print('title:\t{}\nbrand:\t{}\nprice:\t{}\nrelated:\t{}\ncategories:\t{}\n============================'.
        #     format(title,brand,price,related,categories))

        insertSql = (
            "INSERT INTO `metadata` ( \n" + "`asin`, \n" + "`title`, \n" +
            "`description`, \n" + "`price`, \n" + "`imUrl`, \n" +
            "`related`, \n" + "`categories`) \n" +
            "VALUES (\"{}\",{},{},{},\"{}\",\"{}\",\"{}\");".format(
                asin, title, description, price, imUrl, related, categories))

        print(insertSql)
        DBconn.Insertion(insertSql)
    DBconn.connection.close()
Example #2
0
def ReviewInsertion():
    DBconn = DBConnection()
    counter = 1
    print_every = 1000

    for l in parse(
            R"F:\Dataset\Amazon\Electonics\reviews_Electronics_5.json.gz"):
        if counter % print_every == 0:
            print('[{}] rows data has done.'.format(counter))
        counter += 1

        data = (json.loads(l))
        helpfulRating = lambda m, n: n if n == 0 else (m / n)

        # 處理名字 (空)
        hasObject = lambda obj: data[obj] if obj in data else None
        hasStr = lambda text: ("\"{}\"").format(
            RegularExpression('[a-zA-Z0-9.]+', text)
        ) if text is not None else 'NULL'

        reviewerID: str = data['reviewerID']
        asin: str = data['asin']
        reviewerName: str = hasStr(hasObject('reviewerName'))
        helpful: float = helpfulRating(data['helpful'][0], data['helpful'][1])
        reviewText: str = RegularExpression('[a-zA-Z0-9]+', data['reviewText'])
        overall: float = data['overall']
        summary: str = RegularExpression('[a-zA-Z0-9]+', data['summary'])
        unixReviewTime: str = data['unixReviewTime']
        reviewTime: str = data['reviewTime']

        insertSql = (
            "INSERT INTO `review` ( \n" + "`reviewerID`, \n" + "`asin`, \n" +
            "`reviewerName`, \n" + "`helpful`, \n" + "`reviewText`, \n" +
            "`overall`, \n" + "`summary`, \n" + "`unixReviewTime`, \n" +
            "`reviewTime` ) \n" + "VALUES \n" +
            "( \"{}\",\"{}\",{},{},\"{}\",{},\"{}\",\"{}\",\"{}\");".format(
                reviewerID, asin, reviewerName, helpful, reviewText, overall,
                summary, unixReviewTime, reviewTime))
        DBconn.Insertion(insertSql)
    DBconn.connection.close()
                                          reviewerid[index:RANGE],
                                          productid_list, embedding))
        processes.append(p)
        p.start()

    for process in processes:
        process.join()

    print('Finish.')


#%%
if __name__ == "__main__":

    print('sql select start.')
    conn = DBConnection()

    productid = SearchProduct(conn)
    reviewerid = SearchReviewer(conn)
    reviewerdata = SearchReviewerData(conn)

    conn.close()
    print('sql select finish.')

    productid_list = [row['asin'] for row in productid]

    # # Initialize embeddings
    # hidden_size = 300
    # embedding = nn.Embedding(len(productid_list), hidden_size)

    main(reviewerdata, reviewerid, productid_list)
def loadData(havingCount=15, LIMIT=5000, testing=False, table=''):

    print('Loading asin/reviewerID from cav file...')
    asin, reviewerID = Read_Asin_Reviewer(table)
    print('Loading asin/reviewerID complete.')

    # asin/reviewerID to index
    itemObj = item()
    itemObj.addItem(asin)
    userObj = user()
    userObj.addUser(reviewerID)

    st = time.time()
    print('Loading dataset from database...')

    if (not testing):
        # load training user data

        ## this sql select base on metadata
        # sql = (
        #     'WITH tenReviewsUp AS ( ' +
        #     '		SELECT reviewerID ' +
        #     '		FROM {}review '.format(table) +
        #     '		group by reviewerID ' +
        #     '		HAVING COUNT(reviewerID) >= {} '.format(havingCount) +
        #     '		limit {} '.format(LIMIT) +
        #     '	) ' +
        #     'SELECT ' +
        #     'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, ' +
        #     '{}review.`ID`, {}review.reviewerID , {}review.`asin`, {}review.overall, {}review.reviewText, {}review.unixReviewTime '.format(table, table, table, table, table, table) +
        #     'FROM {}review , {}metadata '.format(table, table) +
        #     'WHERE reviewerID IN (SELECT * FROM tenReviewsUp) ' +
        #     'AND {}review.`asin` = {}metadata.`asin` '.format(table, table) +
        #     'ORDER BY reviewerID,unixReviewTime ASC ;'
        # )

        sql = (
            'WITH tenReviewsUp AS ( ' + '		SELECT reviewerID ' +
            '		FROM {}review '.format(table) + '		group by reviewerID ' +
            '		HAVING COUNT(reviewerID) >= {} '.format(havingCount) +
            '		limit {} '.format(LIMIT) + '	) ' + 'SELECT ' +
            'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, '
            +
            '{}review.`ID`, {}review.reviewerID , {}review.`asin`, {}review.overall, {}review.reviewText, {}review.unixReviewTime '
            .format(table, table, table, table, table, table) +
            'FROM {}review '.format(table) +
            'WHERE reviewerID IN (SELECT * FROM tenReviewsUp) ' +
            'ORDER BY reviewerID,unixReviewTime ASC ;')

    else:
        sql = (
            'WITH tenReviewsUp AS ( ' + '		SELECT reviewerID ' +
            '		FROM {}review '.format(table) + 'WHERE reviewerID  NOT IN ' +
            '( ' + '   SELECT reviewerID ' +
            '   FROM {}training_2000 '.format(table) + ') ' +
            '		group by reviewerID ' +
            '		HAVING COUNT(reviewerID) >= {} '.format(havingCount) +
            '		limit {} '.format(LIMIT) + '	) ' + 'SELECT ' +
            'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, '
            +
            '{}review.`ID`, {}review.reviewerID , {}review.`asin`, {}review.overall, {}review.reviewText, {}review.unixReviewTime '
            .format(table, table, table, table, table, table) +
            'FROM {}review , {}metadata '.format(table, table) +
            'WHERE reviewerID IN (SELECT * FROM tenReviewsUp) ' +
            'AND {}review.`asin` = {}metadata.`asin` '.format(table, table) +
            'ORDER BY reviewerID,unixReviewTime ASC ;')

    print(sql)

    conn = DBConnection()
    res = conn.selection(sql)
    conn.close()

    print('Loading complete. [{}]'.format(time.time() - st))

    return res, itemObj, userObj
from DBconnector import DBConnection
from nltk.corpus import stopwords
import nltk
import tqdm

if __name__ == "__main__":
    sql = "select `ID`, reviewText FROM toys_interaction6;"

    conn = DBConnection()
    res = conn.selection(sql)

    for row in tqdm.tqdm(res):
        uid = row['ID']
        reviewText = row['reviewText']
        filtered_words = [
            word for word in reviewText.split(' ')
            if word not in stopwords.words('english')
        ]
        filtered_sentences = " ".join(filtered_words)

        # print(reviewText)
        # print(filtered_sentences)

        insertsql = ("INSERT INTO toys_interaction6_rm_sw " +
                     "(`ID`, reviewText) VALUES ({},'{}');".format(
                         uid, filtered_sentences))
        conn.Insertion(insertsql)

        stop = 1

    conn.close()
    def loadData(self, havingCount = 15, LIMIT=5000, testing=False, table='', withOutTable='', through_table=False, sqlfile=''):

        print('Loading asin/reviewerID from cav file...')
        asin, reviewerID = self.Read_Asin_Reviewer(table)
        print('Loading asin/reviewerID complete.')

        # asin/reviewerID to index
        itemObj = item()
        itemObj.addItem(asin)
        userObj = user()
        userObj.addUser(reviewerID)

        st = time.time()
        print('Loading dataset from database...') 

        if(not testing):
            # load training user data

            sql = (
                'WITH tenReviewsUp AS ( ' +
                '		SELECT reviewerID ' +
                '		FROM {}review '.format(table) +
                '		group by reviewerID ' +
                '		HAVING COUNT(reviewerID) >= {} '.format(havingCount) +
                '		limit {} '.format(LIMIT) +
                '	) ' +
                'SELECT ' +
                'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, ' +
                '{}review.`ID`, {}review.reviewerID , {}review.`asin`, {}review.overall, {}review.reviewText, {}review.unixReviewTime '.format(table, table, table, table, table, table) +
                'FROM {}review '.format(table) +
                'WHERE reviewerID IN (SELECT * FROM tenReviewsUp) ' +
                'ORDER BY reviewerID,unixReviewTime ASC ;'
            )

        else:
            if(through_table):
                sql = (
                    'SELECT clothing_realtime8_interaction8.*, clothing_review.`asin`, '+
                    'clothing_review.overall, clothing_review.reviewText, clothing_review.unixReviewTime '+
                    'FROM clothing_realtime8_interaction8, clothing_review '+
                    'WHERE clothing_realtime8_interaction8.`ID`=clothing_review.`ID` '+
                    'AND `No` <=22000 '+
                    'AND `No` >20000 '+
                    'ORDER BY `No` ;'                    
                )
            else:
                sql = (
                    'WITH tenReviewsUp AS ( ' +
                    '		SELECT reviewerID ' +
                    '		FROM {}review '.format(table) +
                    'WHERE reviewerID  NOT IN ' +
                    '( ' +
                    '   SELECT reviewerID ' +
                    '   FROM {} '.format(withOutTable) +
                    ') ' +
                    '		group by reviewerID ' +
                    '		HAVING COUNT(reviewerID) >= {} '.format(havingCount) +
                    '		limit {} '.format(LIMIT) +
                    '	) ' +
                    'SELECT ' +
                    'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, ' +
                    '{}review.`ID`, {}review.reviewerID , {}review.`asin`, {}review.overall, {}review.reviewText, {}review.unixReviewTime '.format(table, table, table, table, table, table) +
                    'FROM {}review , {}metadata '.format(table, table) +
                    'WHERE reviewerID IN (SELECT * FROM tenReviewsUp) ' +
                    'AND {}review.`asin` = {}metadata.`asin` '.format(table, table) +
                    'ORDER BY reviewerID,unixReviewTime ASC ;'
                )

        if(sqlfile!='' and not testing):
            with open(sqlfile) as file_:
                sql = file_.read().split(';')[0]
        else:
            with open(sqlfile) as file_:
                sql = file_.read().split(';')[1]
        print(sql)

        conn = DBConnection()
        res = conn.selection(sql)
        conn.close()

        print('Loading complete. [{}]'.format(time.time()-st))
        
        return res, itemObj, userObj
Example #7
0
    def loadData(self,
                 havingCount=15,
                 LIMIT=5000,
                 testing=False,
                 table='',
                 withOutTable='',
                 through_table=False):

        print('Loading asin/reviewerID from cav file...')
        asin, reviewerID = self.Read_Asin_Reviewer(table)
        print('Loading asin/reviewerID complete.')

        # asin/reviewerID to index
        itemObj = item()
        itemObj.addItem(asin)
        userObj = user()
        userObj.addUser(reviewerID)

        st = time.time()
        print('Loading dataset from database...')

        if (not testing):
            # load training user data
            if (through_table):
                sql = (
                    'SELECT clothing_realtime8_interaction8.*, clothing_review.`asin`, '
                    +
                    'clothing_review.overall, clothing_review.reviewText, clothing_review.unixReviewTime '
                    +
                    'FROM clothing_realtime8_interaction8, clothing_review ' +
                    'WHERE clothing_realtime8_interaction8.`ID`=clothing_review.`ID` '
                    + 'AND `No` <=10000 ' + 'ORDER BY `No` ;')
            else:
                sql = (
                    'SELECT ' +
                    'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, '
                    +
                    'clothing_review.`ID`, clothing_review.reviewerID , clothing_review.`asin`, '
                    +
                    'clothing_review.overall, clothing_review.reviewText, clothing_review.unixReviewTime '
                    + 'FROM clothing_review, clothing_realtime_at6 ' +
                    'WHERE clothing_review.reviewerID = clothing_realtime_at6.reviewerID '
                    +
                    'AND clothing_review.unixReviewTime = clothing_realtime_at6.unixReviewTime '
                    +
                    'AND clothing_review.reviewerID IN (SELECT reviewerID FROM clothing_realtime_at6_training_1000)  '
                    + 'ORDER BY reviewerID,unixReviewTime ASC ;')

        else:
            if (through_table):
                sql = (
                    'SELECT clothing_realtime8_interaction8.*, clothing_review.`asin`, '
                    +
                    'clothing_review.overall, clothing_review.reviewText, clothing_review.unixReviewTime '
                    +
                    'FROM clothing_realtime8_interaction8, clothing_review ' +
                    'WHERE clothing_realtime8_interaction8.`ID`=clothing_review.`ID` '
                    + 'AND `No` <=22000 ' + 'AND `No` >20000 ' +
                    'ORDER BY `No` ;')
            else:
                sql = (
                    'SELECT ' +
                    'RANK() OVER (PARTITION BY reviewerID ORDER BY unixReviewTime,ID ASC) AS rank, '
                    +
                    'clothing_review.`ID`, clothing_review.reviewerID , clothing_review.`asin`, '
                    +
                    'clothing_review.overall, clothing_review.reviewText, clothing_review.unixReviewTime '
                    + 'FROM clothing_review, clothing_realtime_at6 ' +
                    'WHERE clothing_review.reviewerID = clothing_realtime_at6.reviewerID '
                    +
                    'AND clothing_review.unixReviewTime = clothing_realtime_at6.unixReviewTime '
                    +
                    'AND clothing_review.reviewerID IN (SELECT reviewerID FROM clothing_realtime_at6_testing_200)  '
                    + 'ORDER BY reviewerID,unixReviewTime ASC ;')

        print(sql)

        conn = DBConnection()
        res = conn.selection(sql)
        conn.close()

        print('Loading complete. [{}]'.format(time.time() - st))

        return res, itemObj, userObj