Ejemplo n.º 1
0
 def __init__(self,
              id=None,
              updatedAt=None,
              sessionId=None,
              commandExecutor=None):
     """Initialize the session record.

     ``updatedAt`` is normalized with DateTimeUtil.forcedlyGetDateTime and
     falls back to DateTimeUtil.dateTimeNow() via getGivenOrDefault; the
     remaining arguments are stored verbatim.
     """
     normalizedUpdatedAt = DateTimeUtil.forcedlyGetDateTime(updatedAt)
     self.updatedAt = getGivenOrDefault(normalizedUpdatedAt,
                                        DateTimeUtil.dateTimeNow())
     self.id = id
     self.sessionId = sessionId
     self.commandExecutor = commandExecutor
Ejemplo n.º 2
0
 def __init__(self):
     """Open a connection to the local Auction_Spider_GPai database.

     On success ``self.db`` holds the connection and ``self.cur`` its
     cursor; on MySQLdb.Error the failure is printed and the attributes
     may be left unset.
     """
     try:
         print(DateTimeUtil.get_current_time(), "Start Connection DB...")
         # NOTE(review): credentials are hard-coded; consider moving them
         # into configuration.
         connection = MySQLdb.connect('localhost', 'root', 'chensy123',
                                      'Auction_Spider_GPai', 3306)
         self.db = connection
         self.cur = connection.cursor()
         print(DateTimeUtil.get_current_time(),
               "Connection DB Successful...")
     except MySQLdb.Error as e:
         print(DateTimeUtil.get_current_time(),
               "Connection DB Failure: %d: %s" % (e.args[0], e.args[1]))
Ejemplo n.º 3
0
 def select(self, select_sql):
     """Execute ``select_sql`` and return the fetched rows as a list.

     Returns None when the query fails; errors are printed rather than
     raised, matching the class's other methods.
     """
     try:
         self.db.set_character_set('utf8')
         self.cur.execute(select_sql)
         # NOTE(review): ``_rows`` is a private cursor attribute;
         # fetchall() is the public API, kept as-is to preserve behavior.
         return list(self.cur._rows)
     except MySQLdb.Error as e:
         print(DateTimeUtil.get_current_time(),
               "DB Select Failure: %d: %s" % (e.args[0], e.args[1]))
     except Exception as ex:
         # BUG FIX: this branch previously formatted the message with the
         # undefined name ``e`` (bound only in the MySQLdb.Error clause),
         # which raised NameError whenever a generic exception occurred.
         print(DateTimeUtil.get_current_time(),
               "DB Select Failure: %s" % (ex,))
Ejemplo n.º 4
0
 def __init__(self, database_table):
     """Connect to ``database_table`` on the local MySQL server.

     Progress and failures are printed together with the current
     thread id.
     """
     thread_id = str(threading.currentThread().ident)
     try:
         print(DateTimeUtil.get_current_time(), "Start Connection DB...")
         self.db = MySQLdb.connect('localhost', 'root', 'chensy123',
                                   database_table, 3306)
         self.cur = self.db.cursor()
         print(DateTimeUtil.get_current_time(),
               "Connection DB Successful...", thread_id)
     except MySQLdb.Error as e:
         print(DateTimeUtil.get_current_time(),
               "Connection DB Failure: %d: %s" % (e.args[0], e.args[1]),
               thread_id)
 def spider_auctions(self, court_list, is_auction_history, process_order, auction_processes):
     """Crawl auction listings for every court in ``court_list``.

     Args:
         court_list: court rows; index 1 and 2 build the court URL and
             index 4 holds the court's auction count.
         is_auction_history: forwarded to get_statuses() and to the
             per-page insert step.
         process_order: identifier used only in log messages.
         auction_processes: URLs already processed; those are skipped.
     """
     print(DateTimeUtil.get_current_time() + ThreadUtil.get_thread_id_process_order(process_order) + " Start Craw...")
     mysql_instance = MySQL.MySQL('auction_spider_ali')
     categories = mysql_instance.get_categories()
     statuses = mysql_instance.get_statuses(is_auction_history)
     count = 0
     for court_order, court in enumerate(court_list):
         if int(court[4]) == 0:
             # Court has no auctions at all -- nothing to crawl.
             continue
         url_auctions_list_raw = 'https://sf.taobao.com/' + str(court[1]) + '/' + str(court[2])
         user_id = self.get_user_id(url_auctions_list_raw)
         for category in categories:
             category_id = category[1]
             for status in statuses:
                 if status[3] == 0:
                     continue
                 url_auctions_list = 'https://sf.taobao.com/court_item.htm?user_id=' + user_id + '&category=' + category_id + '&sorder=' + str(status[1])
                 if url_auctions_list in auction_processes:
                     print(DateTimeUtil.get_current_time() + ThreadUtil.get_thread_id_process_order(process_order) + " URL has been processed: " + url_auctions_list)
                     continue
                 # First upsert records the URL as in-progress ...
                 mysql_instance.upsert_auction_process(url_auctions_list)
                 total_count = self.get_total_count(url_auctions_list)
                 if total_count > 0:
                     # BUG FIX: a manual page-count computation here was
                     # dead code -- its result was immediately overwritten
                     # by get_page_total(); only the helper call is kept.
                     page_total = self.get_page_total(total_count, 20)
                     for page_number in range(1, page_total + 1):
                         print(DateTimeUtil.get_current_time() + ThreadUtil.get_thread_id_process_order(process_order) + "Process Craw...", "courts " + str(court_order + 1) + "/" + str(len(court_list)) + " page " + str(page_number) + "/" + str(page_total))
                         url = url_auctions_list + '&page=' + str(page_number)
                         count += self.spider_auction_list_and_insert(url, user_id, category_id, status[1], mysql_instance, is_auction_history)
                 # ... second upsert marks the URL as finished.
                 mysql_instance.upsert_auction_process(url_auctions_list)
     print(DateTimeUtil.get_current_time() + "spider finish with count: " + str(count) + ThreadUtil.get_thread_id_process_order(process_order))
     print(DateTimeUtil.get_current_time() + "Finish Craw..." + ThreadUtil.get_thread_id_process_order(process_order))
Ejemplo n.º 6
0
 def __init__(self, database_table):
     """Connect to ``database_table`` on the local MySQL server via pymysql.

     Failures are printed (with the thread id) instead of raised.
     """
     try:
         self.db = pymysql.connect('localhost', 'root', 'chensy123',
                                   database_table, 3306)
         self.cur = self.db.cursor()
     except pymysql.Error as e:
         failure_msg = "Connect DB Failure e: %d %d" % (
             e.args[0], ThreadUtil.get_thread_id())
         print(DateTimeUtil.get_current_time(), failure_msg)
     except Exception as ex:
         failure_msg = "Connection DB Failure ex: %d: %s" % (ex.args[0],
                                                             ex.args[1])
         print(DateTimeUtil.get_current_time(), failure_msg,
               ThreadUtil.get_thread_id())
Ejemplo n.º 7
0
 def upsert(self, insert_sql_check, insert_sql, update_sql):
     """Insert or update a row depending on an existence check.

     Runs ``insert_sql_check`` (expected to yield a single COUNT value);
     when the count is 0 executes ``insert_sql``, otherwise ``update_sql``.
     Returns the last insert id on success, 0 when the statement affected
     nothing, and None on the duplicate-key path. Calls exit() -- killing
     the whole process -- on unexpected insert/upsert failures.
     """
     try:
         # NOTE(review): connection-wide charset switch on every call;
         # ``_rows`` below is a private pymysql cursor attribute.
         self.db.set_charset('gbk')
         self.cur.execute(insert_sql_check)
         if self.cur._rows[0][0] == 0:
             # Row does not exist yet -> insert.
             try:
                 result = self.cur.execute(insert_sql)
                 insert_id = self.db.insert_id()
                 self.db.commit()
                 if result:
                     return insert_id
                 else:
                     return 0
             except pymysql.Error as e:
                 self.db.rollback()
                 # Duplicate primary key is tolerated (e.g. a concurrent
                 # insert won the race); anything else aborts the process.
                 if "key 'PRIMARY'" in e.args[1]:
                     print(DateTimeUtil.get_current_time(),
                           "Primary Key Exists")
                 else:
                     print(DateTimeUtil.get_current_time(),
                           "Insert Failure: %s" % (insert_sql))
                     print(
                         DateTimeUtil.get_current_time(),
                         "Insert Failure: %d: %s" % (e.args[0], e.args[1]))
                     exit()
         else:
             # Row already exists -> update instead.
             result = self.cur.execute(update_sql)
             insert_id = self.db.insert_id()
             self.db.commit()
             if result:
                 return insert_id
             else:
                 return 0
     except pymysql.Error as e:
         # Failure outside the inner insert try (check/update path).
         print(insert_sql)
         print(update_sql)
         print(DateTimeUtil.get_current_time(),
               "Upsert Failure: %d: %s" % (e.args[0], e.args[1]))
         exit()
     except Exception as ex:
         # NOTE(review): %d on ex.args[0] assumes a numeric first arg --
         # may itself fail for arbitrary exceptions; confirm before relying
         # on this message.
         print(insert_sql)
         print(update_sql)
         print(DateTimeUtil.get_current_time(),
               "DB Object Error: %d: %s" % (ex.args[0], ex.args[1]))
Ejemplo n.º 8
0
 def __init__(self,
              id=None,
              key=None,
              conversationKey=None,
              ownerKey=None,
              ownerInfo=None,
              postedAt=None,
              scannedAt=None,
              createdAt=None,
              updatedAt=None,
              text=None,
              originalAsText=None,
              originalAsHtml=None,
              isPoolerMessage=None,
              poolingStatus=None,
              errorCount=None,
              errorListAsJson=None):
     """Build a message record, filling unset fields with defaults.

     All four timestamps share the same ``now`` so a freshly created
     message is internally consistent; other optional fields default via
     getGivenOrDefault to MessageConstants values.
     """
     now = DateTimeUtil.dateTimeNow()

     def timestampOrNow(value):
         # Normalize, then fall back to the shared ``now``.
         return getGivenOrDefault(DateTimeUtil.forcedlyGetDateTime(value),
                                  now)

     self.id = id
     self.key = getGivenOrDefault(key, getNewErrorId())
     self.ownerKey = getGivenOrDefault(ownerKey,
                                       MessageConstants.UNKNOWN_OWNER)
     self.ownerInfo = ownerInfo
     self.conversationKey = conversationKey
     self.postedAt = timestampOrNow(postedAt)
     self.scannedAt = timestampOrNow(scannedAt)
     self.createdAt = timestampOrNow(createdAt)
     self.updatedAt = timestampOrNow(updatedAt)
     self.text = text
     self.originalAsText = originalAsText
     self.originalAsHtml = originalAsHtml
     self.isPoolerMessage = getGivenOrDefault(
         isPoolerMessage, MessageConstants.DEFAULT_IS_POOLER_MESSAGE)
     self.poolingStatus = getGivenOrDefault(
         poolingStatus, MessageConstants.DEFAULT_POOLING_STATUS)
     self.errorCount = getGivenOrDefault(
         errorCount, MessageConstants.DEFAULT_ERROR_COUNT)
     self.errorListAsJson = errorListAsJson
                            page_total = self.get_page_total(total_count, 20)
                            # process url to get html and insert
                            # print('total count: ' + str(total_count) + ' page count: ' + str(page_total))
                            for page_number in range(1, page_total + 1):
                                print(DateTimeUtil.get_current_time() + ThreadUtil.get_thread_id_process_order(process_order) + "Process Craw...", "courts " + str(court_order + 1) + "/" + str(len(court_list)) + " page " + str(page_number) + "/" + str(page_total))
                                url = url_auctions_list + '&page=' + str(page_number)
                                count += self.spider_auction_list_and_insert(url, user_id, category_id, status[1], mysql_instance, is_auction_history)
                        mysql_instance.upsert_auction_process(url_auctions_list)
        # print(court[2] + ": spider finish with count: " + str(count))
        print(DateTimeUtil.get_current_time() + "spider finish with count: " + str(count) + ThreadUtil.get_thread_id_process_order(process_order))
        print(DateTimeUtil.get_current_time() + "Finish Craw..." + ThreadUtil.get_thread_id_process_order(process_order))


if __name__ == '__main__':
    auctionSpiderGPai = AuctionSpiderGPai()
    print(DateTimeUtil.get_current_time() + " Start Main Progress--------------------------------------------------------------")
    # Load courts and the set of already-processed URLs up front.
    print(DateTimeUtil.get_current_time(), "Get Courts Start--------------------------------------------------------------")
    mysql = MySQL.MySQL('auction_spider_ali')
    courts = mysql.get_courts()
    auction_process_all = mysql.query_auction_process_all()
    # Keep only the URL column (index 1) of each process row.
    auction_processes = [row[1] for row in auction_process_all]
    print(DateTimeUtil.get_current_time(), "Get Courts End--------------------------------------------------------------")
    process_count = 100
    each_count = len(courts) // process_count
    # Worker bookkeeping for the multi-process crawl.
    process_array = {}
    start_time = time.time()
    is_auction_history = False
Ejemplo n.º 10
0
                                page_total += 1
                            page_total = self.get_page_total(total_count, 20)
                            # process url to get html and insert
                            # print('total count: ' + str(total_count) + ' page count: ' + str(page_total))
                            for page_number in range(1, page_total + 1):
                                url = url_auctions_list + '&page=' + str(page_number)
                                count += self.spider_auction_list_and_insert(url, user_id, category_id, status[1], mysql_instance, is_auction_history)
        # print(court[2] + ": spider finish with count: " + str(count))
        print("spider finish with count: ", str(count))
        print("Finish Craw..." + str(threading.currentThread().ident))


if __name__ == '__main__':
    auctionSpiderGPai = AuctionSpiderJD()
    mysql = MySQL.MySQL('auction_spider_ali')
    print(DateTimeUtil.get_current_time() + " start main progress")
    # Load courts and split them evenly across worker threads.
    courts = mysql.get_courts()
    thread_count = 4
    each_count = len(courts) // thread_count
    thread_array = {}
    start_time = time.time()
    is_auction_history = False
    for tid in range(thread_count):
        # Each worker gets a contiguous slice of the court list.
        chunk = courts[tid * each_count:(tid + 1) * each_count]
        print('count: ' + str(len(chunk)), end='')
        print(chunk)
        t = Thread(target=auctionSpiderGPai.spider_auctions, args=(chunk, is_auction_history, ))
        t.start()
        thread_array[tid] = t
import csv
import sys
import Constants
import QuandlApiUtil
import FileIOUtil
import DateTimeUtil

baseDir = Constants.BASE_DIR
feedsDir = Constants.FEED_DIR
activeTickersFileName = Constants.ACTIVE_TICKERS_FILE_NAME

# Updates ticker history for a list of tickers from the last update date to
# the last business date.

last_business_day = str(DateTimeUtil.get_last_business_day())
# First CLI argument selects the exchange directory (e.g. 'NYSE').
exchange = sys.argv[1]
tickersFileUrl = baseDir + "/" + feedsDir + "/" + exchange + "/" + activeTickersFileName
# BUG FIX: Python 2 ``print`` statement replaced with the print() function;
# output is identical and the script now also parses under Python 3.
print("Reading " + tickersFileUrl)


def get_ticker_eod_for_missing_days(ticker, num_days_of_eod_data_missing):
    """Return EOD data for ``ticker`` covering the missing days, as CSV.

    Pulls JSON from the Quandl API and converts it with
    FileIOUtil.convertJsonToCsv.
    """
    # Local renamed from ``json`` to avoid shadowing the stdlib module name.
    eod_json = QuandlApiUtil.get_ticker_eod_data(ticker,
                                                 num_days_of_eod_data_missing)
    return FileIOUtil.convertJsonToCsv(eod_json)


# Iterate the active-tickers CSV; one ticker per row.
# NOTE(review): the loop body appears truncated in this chunk -- only the
# initialization of last_eod_update_date is visible here.
# NOTE(review): mode 'rU' (universal newlines) is deprecated in Python 3;
# kept as-is.
with open(tickersFileUrl, 'rU') as csvfile:
    tickersReader = csv.reader(csvfile, delimiter=',', quotechar='\"')
    for row in tickersReader:
        last_eod_update_date = None
Ejemplo n.º 12
0
 def upsert_auction_process(self, auction_process_url):
     """Record crawl progress for ``auction_process_url``.

     Delegates to self.upsert: inserts the URL as unfinished when it is
     new, otherwise flips its IsFinished flag to TRUE.
     """
     # NOTE(review): SQL is assembled by string interpolation; acceptable
     # for internally generated URLs but unsafe for untrusted input.
     check_sql = "SELECT COUNT(*) FROM Auction_Processes WHERE URL = '%s'" % auction_process_url
     insert_sql = ("INSERT INTO Auction_Processes(URL, IsFinished, CreatedOn) VALUES('%s', FALSE, '%s')"
                   % (auction_process_url, DateTimeUtil.get_current_datetime()))
     update_sql = "UPDATE Auction_Processes SET IsFinished = TRUE WHERE URL = '%s'" % auction_process_url
     self.upsert(check_sql, insert_sql, update_sql)