Example #1
0
 def __init__(self, stock_num):
     """Prepare the helpers and the cut-off time for one stock.

     Args:
         stock_num: Stock code this instance works on.
     """
     self.mongo = MongodbAPI()
     self.stock_num = stock_num
     self.htmlreq = HtmlRequests()
     self.req = self.htmlreq.get_session(SESSIONURL)
     # The cut-off is 13:30:10 on the day this object is created.
     today = datetime.now()
     self.stop_date = datetime(
         year=today.year, month=today.month, day=today.day,
         hour=13, minute=30, second=10)
Example #2
0
 def __init__(self, stock_num: str, start_year: int, start_month: int):
     """Remember the crawl range and build the storage and HTTP helpers.

     Args:
         stock_num: Stock code to crawl.
         start_year: Year of the first month to crawl.
         start_month: Month (1-12) of the first month to crawl.
     """
     # Plain values first; none of these touch external resources.
     self.stock_num = stock_num
     self.start_year = start_year
     self.start_month = start_month
     self.retry = 0
     # Helper objects, created in the same order as before.
     self.mongo = MongodbAPI()
     self.htmlreq = HtmlRequests()
     self.req = self.htmlreq.get_session(SESSIONURL)
     self.req.keep_alive = False
     self.now_date = datetime.now()
Example #3
0
class Money_link():
    """Scrape the tick table of a stock from the page addressed by MONEYLINKURL."""

    def __init__(self):
        self.mongo = MongodbAPI()

    def start(self, stock_num: str) -> list:
        """Build the page URL for ``stock_num`` and return its parsed rows."""
        source_url = MONEYLINKURL % (stock_num)
        return self.parser(stock_num, source_url)

    @staticmethod
    def _signed_change(cell_text):
        """Convert the change cell to a float, or None when it is unreadable.

        A cell without a space-separated marker counts as 0.0, a leading
        "▼" makes the number negative and a leading "▲" keeps it positive.
        Any other marker yields None so the caller can skip the row.
        """
        parts = cell_text.split(" ")
        if len(parts) < 2:
            return 0.0
        if parts[0] == "▼":
            return float("-" + parts[1])
        if parts[0] == "▲":
            return float(parts[1])
        return None

    def parser(self, stock_num, url) -> list:
        """Fetch ``url`` and turn each usable table row into a record dict.

        Rows are skipped when a cell is missing, when the buying or selling
        cell is '--', when the change marker is not recognised, or when
        ``CheckExists`` reports the record id as already stored.

        Args:
            stock_num: Stock code used in the record id and ``stock`` field.
            url: Page to download.

        Returns:
            A list of record dicts; empty when the page could not be fetched.
        """
        # Local import: this class did not reference logging before.
        import logging

        daily = []
        tree = HtmlRequests().get_html(url)
        if tree is None:
            return daily
        now = datetime.now()
        for row in tree.xpath('//div[@id="TickHeight"]/table/tr'):
            cells = [row.xpath('td[%d]/text()' % col) for col in range(1, 7)]
            # A row lacking any of the six cells used to raise IndexError.
            if not all(cells):
                continue
            (tick_time, buying, selling, transaction, change_text,
             stock_volume) = (cell[0] for cell in cells)
            if buying == '--' or selling == '--':
                continue
            ups_and_downs = self._signed_change(change_text)
            if ups_and_downs is None:
                # Previously float("") aborted the whole parse here.
                logging.warning("Skip row with unknown change marker: %r",
                                change_text)
                continue
            clock = tick_time.split(':')
            # The page only carries a time of day; today's date is assumed.
            date = datetime(now.year, now.month, now.day, int(clock[0]),
                            int(clock[1]), int(clock[2]))
            record_id = str(stock_num) + "@" + date.isoformat()
            if self.mongo.CheckExists("Transaction_details", record_id):
                continue
            daily.append({
                '_id': record_id,
                'ts': int(date.timestamp()),
                'stock': stock_num,
                'date': date,
                'buying': float(buying),
                'selling': float(selling),
                'transaction': float(transaction),
                'ups_and_downs': ups_and_downs,
                'stock_volume': int(stock_volume)
            })
        return daily
Example #4
0
def main():
    """Refresh the proxy list, then run the crawlers selected on the command line.

    The flags come from ``_build_parser()``:
      * ``args.d``  - one ``money_link`` thread per stock in the list below.
      * ``args.ep`` - 16 ``tse_daily_price_earning`` threads sharing one
        date list from ``create_random_date()``.
    ``close_proxy()`` is called once the selected jobs have finished.
    """
    # '2337' used to appear twice in this list, which started two threads
    # for the same stock; each code is now listed once.
    stocks = [
        '3455', '5443', '8064', '2409', '1504', '3535', '2397', '2316', '2392',
        '2888', '2385', '2337', '3406', '2492', '2478', '6182', '8163', '2481',
        '3016', '6153', '3630', '4190'
    ]
    m = MongodbAPI()

    cp = Crawl_Proxy()
    cp.start()

    def _run_threads(target, args_per_thread):
        """Start one thread per argument tuple and wait for all of them."""
        threads = []
        for thread_args in args_per_thread:
            worker = threading.Thread(target=target, args=thread_args)
            worker.start()
            threads.append(worker)
        for worker in threads:
            worker.join()
            logging.info("Thread Done")

    # The earlier getopt-based option handling (kept here as commented-out
    # code) was superseded by the argparse parser below and has been removed.
    cli_parser = _build_parser()
    args = cli_parser.parse_args()

    if args.d:
        # Daily transaction details
        logging.info("start daily detail stock")
        ml = Money_link()
        _run_threads(money_link, [(m, ml, stock) for stock in stocks])
    if args.ep:
        date_list = create_random_date()
        thread_num = 16
        _run_threads(tse_daily_price_earning, [(date_list, )] * thread_num)

    close_proxy()
Example #5
0
 def __init__(self):
     """Store the source URL and create the MongodbAPI helper."""
     # CYBERSYNDROME is a constant defined outside this snippet.
     self.source_url = CYBERSYNDROME
     self.mongo = MongodbAPI()
Example #6
0
class Crawl_Proxy(object):
    """Scrape a proxy list from ``CYBERSYNDROME`` and cache it in the "proxy" collection."""

    def __init__(self):
        self.source_url = CYBERSYNDROME
        self.mongo = MongodbAPI()

    def start(self):
        """Refresh the cached proxy list unless the stored one is still usable.

        The stored document is reused when it is less than three hours old
        and holds more than 250 entries. Otherwise the page is scraped; the
        old document is only replaced once a non-empty list was obtained, so
        a failed download no longer wipes the cache (and no longer crashes
        on ``len(None)``).
        """
        data = self.mongo.Get_Data_From("proxy", {'_id': 0})
        if data is not None and datetime.now() - timedelta(
                hours=3) < data["update_date"] and len(
                    data["iptable"] or []) > 250:
            logging.info("Use old proxies")
            return
        logging.info("start crawl proxy")
        proxy_ip = self.paresrHTML()
        if not proxy_ip:
            logging.warning("No proxy parsed from %s, keep stored proxies",
                            self.source_url)
            return
        self.mongo.DropAll("proxy")
        self.mongo.Insert_Data_To("proxy", {
            "_id": 0,
            "iptable": proxy_ip,
            "update_date": datetime.now()
        })
        logging.info("add %04d ip" % (len(proxy_ip)))

    def paresrHTML(self):
        """Download the source page and return the decoded proxy entries.

        The first script block that contains two bracketed number lists and
        a ``(...)%n`` expression is used: the first list holds the address
        octets, the second the ports, and the expression (evaluated by
        ``decode``) gives the rotation applied to the octet list.

        Returns:
            A list as produced by ``getproxy``; empty when the page could
            not be fetched or no usable script block was found.
        """
        tree = HtmlRequests().get_html_noproxy(self.source_url)
        if tree is None:
            return []
        octets = None
        ports = None
        for script in tree.xpath('//div[@id="content"]/script/text()'):
            arrays = re.findall(r'\[[0-9 ,]*\]', script)
            arithmetic = re.findall(r'\(.*?\)%\d*', script)
            if len(arrays) < 2 or not arithmetic:
                continue
            # Matches always start with '[' and end with ']'.
            octets = [x.strip() for x in arrays[0][1:-1].split(',')]
            ports = [x.strip() for x in arrays[1][1:-1].split(',')]
            shift = self.decode(ports, arithmetic[0])
            octets = octets[shift:] + octets[:shift]
            break
        if octets is None:
            logging.warning("No proxy script block found in %s",
                            self.source_url)
            return []
        headerlist = []
        for row in tree.xpath('//tr'):
            headers = {}
            for cell in row.xpath('td[6]/text()'):
                pieces = cell.split(":")
                if len(pieces) < 2:
                    # No "name:value" pair in this cell.
                    continue
                headers[pieces[0]] = pieces[1]
            headerlist.append(headers)
        return self.getproxy(octets, ports, headerlist)

    def decode(self, ps, string):
        """Evaluate an expression of the form ``(a+ps[i]*b+...)%n``.

        Args:
            ps: Sequence of numeric strings referenced as ``ps[i]``.
            string: The expression text; terms are joined by ``+`` and
                factors by ``*``.

        Returns:
            The integer value of the expression.
        """
        parts = string.split(')')
        divisor = parts[1].replace('%', '')
        dividend = parts[0].replace('(', '')
        total = 0
        for term in dividend.split('+'):
            product = 1
            for factor in term.split('*'):
                if "ps" in factor:
                    index = int(re.search(r'\d+', factor).group(0))
                    product *= int(ps[index])
                else:
                    product *= int(factor)
            total += product
        return total % int(divisor)

    def getproxy(self, _as, _ps, headerlist):
        """Join octets four at a time into addresses and pair them with ports.

        Args:
            _as: Flat list of octet strings; a trailing group of fewer than
                four is ignored.
            _ps: Port strings, one per address.
            headerlist: Header dicts, one per address; a missing entry is
                replaced by an empty dict.

        Returns:
            A list of ``{'ip': {'http': 'a.b.c.d:port'}, 'headers': {...}}``.
        """
        groups = [_as[k:k + 4] for k in range(0, len(_as) - 3, 4)]
        proxy_ip = []
        # zip stops at the shorter input instead of raising IndexError.
        for j, (group, port) in enumerate(zip(groups, _ps)):
            proxy_ip.append({
                'ip': {
                    'http': '.'.join(group) + ':' + port
                },
                'headers': headerlist[j] if j < len(headerlist) else {}
            })
        return proxy_ip
Example #7
0
 def __init__(self, date: datetime):
     """Keep the target date and create the storage and HTTP helpers.

     Args:
         date: The day this instance is bound to.
     """
     self.mongo, self.htmlreq = MongodbAPI(), HtmlRequests()
     self.__date = date
Example #8
0
class Institutional_investors():
    """Fetch one day of institutional-investor figures and store them in Mongo."""

    # Output keys per payload layout, in column order. In every layout the
    # first key maps to row index 2 and the following keys to the next
    # consecutive indices.
    _FIRST_VALUE_COLUMN = 2
    _LAYOUT_12_FIELDS = (
        'foreign_investment_dealer_buy',
        'foreign_investment_dealer_sell',
        'foreign_investment_dealer_net_buy_sell',
        'institutional_investors_net_buy_sell',
        'investment_trust_buy',
        'investment_trust_sell',
        'investment_trust_net_buy_sell',
        'dealer_buy(Self-purchase)',
        'dealer_sell(Self-purchase)',
        'dealer_net_buy_sell',
    )
    _LAYOUT_UNDER_18_FIELDS = (
        'foreign_investment_buy',
        'foreign_investment_sell',
        'foreign_investment_net_buy_sell',
        'foreign_investment_dealer_buy',
        'foreign_investment_dealer_sell',
        'foreign_investment_dealer_net_buy_sell',
        'investment_trust_buy',
        'investment_trust_sell',
        'investment_trust_net_buy_sell',
        'dealer_net_buy_sell',
        'dealer_buy',
        'dealer_sell',
        'institutional_investors_net_buy_sell',
    )
    # NOTE(review): this layout reads row index 18 while being selected for
    # payloads with 18 field names - confirm rows really carry 19 values.
    _LAYOUT_18_FIELDS = (
        'foreign_investment_buy',
        'foreign_investment_sell',
        'foreign_investment_net_buy_sell',
        'foreign_investment_dealer_buy',
        'foreign_investment_dealer_sell',
        'foreign_investment_dealer_net_buy_sell',
        'investment_trust_buy',
        'investment_trust_sell',
        'investment_trust_net_buy_sell',
        'dealer_net_buy_sell',
        'dealer_buy(Self-purchase)',
        'dealer_sell(Self-purchase)',
        'dealer_net_buy_sell(Self-purchase)',
        'dealer_buy(Hedging)',
        'dealer_sell(Hedging)',
        'dealer_net_buy_sell(Hedging)',
        'institutional_investors_net_buy_sell',
    )

    def __init__(self, date: datetime):
        self.mongo = MongodbAPI()
        self.htmlreq = HtmlRequests()
        self.__date = date

    def start(self):
        """Build the URL for the configured date and crawl it."""
        date = self.__date.strftime("%Y%m%d")
        source_url = TSELEGALPERSON.format(date=date)
        self.__crawl(source_url, self.__date.strftime("%Y/%m/%d"))

    def __crawl(self, url, date):
        """Download ``url``, parse it and insert the rows into "stock_information".

        Nothing is stored when the payload's ``stat`` is not "OK".
        """
        json_data = self.htmlreq.get_json(requests, source_url=url)
        if json_data.get('stat', None) != "OK":
            logging.debug("This day not Opening :%s" % (date))
            return
        data = self.__parser(json_data, date)
        inserted = self.mongo.Insert_Many_Data_To("stock_information", data)
        if inserted:
            logging.info("Insert Institutional investors to mongo , date: %s",
                         date)
        else:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning(
                "Fail to Insert Institutional investors to mongo , url: %s",
                url)

    def __parser(self, j, date) -> list:
        """Convert the payload rows into record dicts.

        The column layout is chosen from ``len(j['fields'])``: exactly 12,
        fewer than 18, or exactly 18. Any other count yields no records.

        Args:
            j: Payload with a ``data`` list of rows and a ``fields`` list.
            date: Day of the data as "YYYY/MM/DD".

        Returns:
            One dict per row with ``_id``, ``date``, ``stock_num`` and the
            float columns of the selected layout.
        """
        rows = j['data']
        if not rows:
            return []
        field_count = len(j['fields'])
        if field_count == 12:
            layout = self._LAYOUT_12_FIELDS
        elif field_count < 18:
            layout = self._LAYOUT_UNDER_18_FIELDS
        elif field_count == 18:
            layout = self._LAYOUT_18_FIELDS
        else:
            logging.warning(
                "Unsupported institutional investors layout: %d fields",
                field_count)
            return []
        parsed_date = datetime.strptime(date, "%Y/%m/%d")
        data = []
        for raw in rows:
            # Numbers arrive with thousands separators.
            row = [x.replace(',', '') for x in raw]
            record = {
                '_id': str(row[0]) + "@" + date,
                'date': parsed_date,
                'stock_num': str(row[0]),
            }
            for column, key in enumerate(layout,
                                         start=self._FIRST_VALUE_COLUMN):
                record[key] = float(row[column])
            data.append(record)
        return data
Example #9
0
class Daily_stock_info(object):
    """Fetch the daily quote table for one date and store it in "stock_daily_info"."""

    def __init__(self, date):
        self.__mongo = MongodbAPI()
        self.__htmlreq = HtmlRequests()
        self.__date = date

    def start(self):
        """Crawl the configured date and insert the parsed rows.

        Nothing is inserted when the crawl returns None.
        """
        date = self.__date.strftime("%Y%m%d")
        source_url = DAILYSTOCKINFO.format(date=date)
        data = self.__crawl(source_url, self.__date.strftime("%Y/%m/%d"))
        if data is None:
            return
        inserted = self.__mongo.Insert_Many_Data_To('stock_daily_info', data)
        if inserted:
            logging.info("Insert stock daily info to mongo , date: %s", date)
        else:
            logging.warning(
                "Fail to Insert stock daily info to mongo , url: %s",
                source_url)

    def __crawl(self, url, date):
        """Download ``url`` (up to 10 attempts) and return the parsed rows.

        An empty response is retried; previously the first attempt always
        returned, so the retry loop and its error log were unreachable.

        Returns:
            The list built by the row parser, or None when the payload's
            ``stat`` is not 'OK', when neither ``data5`` nor ``data4`` is
            present, or when every attempt came back empty.
        """
        for _ in range(10):
            j = self.__htmlreq.get_json(requests, url)
            if not j:
                continue
            if j.get('stat') != 'OK':
                return None
            if 'data5' in j:
                table = j['data5']
            elif 'data4' in j:
                table = j['data4']
            else:
                logging.warning(
                    "The daily info not have data5 or data4 url: %s", url)
                return None
            # Keep rows whose code is four characters long and whose last
            # column is not '0.00'.
            rows = [x for x in table if len(x[0]) == 4 and x[-1] != '0.00']
            return self.__parser(date, rows)
        logging.error("Fail to parser daily stock info , url: %s", url)
        return None

    def __parser(self, date, rows: list) -> list:
        """Convert table rows into record dicts.

        Args:
            date: Day of the data as "YYYY/MM/DD".
            rows: Rows where index 0 is the stock code, 3 the transaction
                count, 5-8 open/high/low/close, 9 the change sign, 10 the
                change amount and the last column the price/earning value.
        """
        parsed_date = datetime.strptime(date, "%Y/%m/%d")
        timestamp = int(parsed_date.timestamp())
        data = []
        for row in rows:
            data.append({
                '_id': row[0] + "@" + date,
                'stock': row[0],
                'date': parsed_date,
                'ts': timestamp,
                'transaction': float(row[3].replace(',', '')),
                'open': self.__get_float(row[5]),
                'high': self.__get_float(row[6]),
                'low': self.__get_float(row[7]),
                'close': self.__get_float(row[8]),
                'change': self.__get_sign_float(row[9], row[10]),
                'price_earning': float(row[-1].replace(',', '')),
            })
        return data

    def __get_sign_float(self, sign, num) -> float:
        """Return ``num`` signed by ``sign``; 0.0 when ``sign`` has no +/-.

        Thousands separators in ``num`` are removed, matching the other
        numeric columns.
        """
        magnitude = num.replace(',', '')
        if "-" in sign:
            return float("-" + magnitude)
        elif "+" in sign:
            return float(magnitude)
        else:
            return 0.0

    def __get_float(self, num) -> float:
        """Parse a price cell: 'X0.00' gives 0.0, '--' gives None."""
        if num.replace(',', '') == 'X0.00':
            return 0.0
        elif num == '--':
            return None
        else:
            return float(num.replace(',', ''))
Example #10
0
 def __init__(self):
     """Create the MongodbAPI helper used by this object."""
     self.mongo = MongodbAPI()
Example #11
0
class TWSE_realtime():
    """Poll the realtime quote of one stock every five seconds until 13:30:10."""

    def __init__(self, stock_num):
        self.mongo = MongodbAPI()
        self.stock_num = stock_num
        self.htmlreq = HtmlRequests()
        self.req = self.htmlreq.get_session(SESSIONURL)
        now = datetime.now()
        self.stop_date = datetime(now.year, now.month, now.day, 13, 30, 10)

    def start(self):
        """Begin polling."""
        self.crawl()

    def crawl(self):
        """Fetch one quote, store it if new, and schedule the next poll.

        The next run is scheduled first (while the stop time has not been
        reached), so a failure in this run does not end the polling.
        """
        now = datetime.now()
        if now < self.stop_date:
            threading.Timer(5.0, self.crawl).start()
        now_time = int(time.time()) * 1000
        source_url = TWSEREALTIMEURL.format(
            stock_num=self.stock_num, time=now_time)
        json_data = self.htmlreq.get_json(self.req, source_url)
        data = self.parser(json_data)
        if data is None:
            return
        exists = self.mongo.CheckExists('Realtime_data', data.get('_id', None))
        # Kept as an equality test so only a False-equal result triggers
        # the insert, exactly as before.
        if exists == False:  # noqa: E712
            for _ in range(5):
                inserted = self.mongo.Insert_Data_To("Realtime_data", data)
                if inserted:
                    logging.info("Insert realtime data to mongo, id:%s" %
                                 (data.get("_id")))
                    return
            else:
                logging.error(
                    "Fail to insert realtime data to mongo, id:%s" % (data.get("_id")))

    def parser(self, j: dict):
        """Turn the first entry of ``j['msgArray']`` into a record dict.

        Args:
            j: Decoded response. ``tlong`` of the entry is read as epoch
                milliseconds; 'b'/'g'/'a'/'f' are underscore-separated lists.

        Returns:
            The record dict, or None when the payload is empty or has no
            ``msgArray`` entries. Scalar fields missing from the entry come
            back as None and missing list fields as empty lists.
        """
        messages = j.get('msgArray') if j else None
        if not messages:
            return None
        data = messages[0]

        def _split_best(d):
            if d:
                return d.strip('_').split('_')
            return []

        def _to_float(value):
            # float(None) raised TypeError for absent keys.
            return None if value is None else float(value)

        # Named time_text so the ``time`` module stays reachable below; the
        # former local ``time`` made ``time.mktime`` fail on a str.
        time_text = datetime.fromtimestamp(
            int(data['tlong']) / 1000).strftime('%Y-%m-%d %H:%M:%S')
        date = datetime.strptime(time_text, '%Y-%m-%d %H:%M:%S')
        return {
            "_id": str(self.stock_num) + "@" + time_text,
            "code": self.stock_num,
            'ts': int(time.mktime(date.timetuple())),
            "time": date,
            "latest_trade_price": _to_float(data.get('z', None)),
            "trade_volume": _to_float(data.get('tv', None)),
            "accumulate_trade_volume": _to_float(data.get('v', None)),
            "best_bid_price": [float(x) for x in _split_best(data.get('b', None))],
            "best_bid_volume": [float(x) for x in _split_best(data.get('g', None))],
            "best_ask_price": [float(x) for x in _split_best(data.get('a', None))],
            "best_ask_volume": [float(x) for x in _split_best(data.get('f', None))],
            "open": _to_float(data.get('o', None)),
            "high": _to_float(data.get('h', None)),
            "low": _to_float(data.get('l', None))
        }
Example #12
0
class TWSE_daily():
    """Crawl month-by-month daily data of one stock into "Daily_data"."""

    def __init__(self, stock_num: str, start_year: int, start_month: int):
        self.mongo = MongodbAPI()
        self.stock_num = stock_num
        self.htmlreq = HtmlRequests()
        self.req = self.htmlreq.get_session(SESSIONURL)
        self.req.keep_alive = False
        self.start_year = start_year
        self.start_month = start_month
        self.now_date = datetime.now()
        self.retry = 0

    def start(self):
        """Crawl from the configured start month onwards."""
        self.crawl(self.start_year, self.start_month)

    def crawl(self, year, month):
        """Crawl ``year``/``month`` and every following month up to now.

        Each month is fetched (an empty response is retried up to five
        times), parsed and inserted (up to five attempts). The walk stops
        once the next month lies after the month this object was created in.

        Fixes over the previous recursive version: the "Can't get" error is
        only logged when the fetch really failed, a retry no longer spawns a
        second continuation of the crawl, and the stop test compares
        (year, month) pairs so it also terminates in December.
        """
        while True:
            logging.debug("%s/%s" % (year, month))
            # NOTE(review): the monthly request goes through TWSEREALTIMEURL
            # with a YYYYMM01 value in its ``time`` slot - confirm this is
            # the intended endpoint.
            source_url = TWSEREALTIMEURL.format(
                stock_num=self.stock_num, time="%d%02d01" % (year, month))
            json_data = self.htmlreq.get_json(self.req, source_url)
            while not json_data and self.retry < 5:
                self.retry += 1
                json_data = self.htmlreq.get_json(self.req, source_url)
            if not json_data:
                logging.error("Can't get old daily stock %s@%s-%s ,url : %s " %
                              (self.stock_num, year, month, source_url))
            self.retry = 0
            data = self.parser((json_data or {}).get('data', None))

            if data:
                for _ in range(5):
                    inserted = self.mongo.Insert_Many_Data_To(
                        "Daily_data", data)
                    if inserted:
                        logging.info("Insert Daily data %s@%s-%s" %
                                     (self.stock_num, year, month))
                        break
                else:
                    logging.error(
                        "Fail, Insert Daily data to Mongo,id: %s@%s-%s" %
                        (self.stock_num, year, month))
            date = self._get_next_date(year, month)
            if self._is_after_now(date['year'], date['month']):
                logging.info("Done crawl Daily data , %s@%s/%s" %
                             (self.stock_num, date['year'], date['month']))
                return
            # Move on to the next month.
            year, month = date["year"], date["month"]

    def _is_after_now(self, year, month) -> bool:
        """Return True when ``year``/``month`` is later than ``now_date``'s month."""
        return (year, month) > (self.now_date.year, self.now_date.month)

    def _convert_date(self, date):
        """Convert '106/05/01' to '2017/05/01'"""
        parts = date.split('/')
        return '/'.join([str(int(parts[0]) + 1911)] + parts[1:])

    def parser(self, j) -> list:
        """Convert the payload's row list into record dicts.

        Args:
            j: List of rows (or None). Index 0 is the date in the form
                handled by ``_convert_date``; 1, 2 and 8 are integers with
                thousands separators; 3-7 are price cells.

        Returns:
            Records for rows not yet present in "Daily_data". Rows whose
            fields cannot be converted are logged and skipped.
        """
        data = []
        if j is None:
            return data
        for item in j:
            date = datetime.strptime(
                self._convert_date(item[0]), '%Y/%m/%d')
            _id = self.stock_num + "@" + date.strftime("%Y/%m/%d")

            if self.mongo.CheckExists("Daily_data", _id):
                logging.debug("Insert Daily data ,id :%s exists" %
                              (_id))
                continue
            try:
                data.append({
                    '_id': _id,
                    'stock': self.stock_num,
                    'date': date,
                    'ts': int(time.mktime(date.timetuple())),
                    'capacity': int(item[1].replace(',', '')),
                    'turnover': int(item[2].replace(',', '')),
                    'open': self._get_float(item[3]),
                    'high': self._get_float(item[4]),
                    'low': self._get_float(item[5]),
                    'close': self._get_float(item[6]),
                    'change': self._get_float(item[7]),
                    'transaction': int(item[8].replace(',', ''))
                })
            except Exception as err:
                # Deliberately broad: one malformed row must not lose the
                # rest of the month.
                logging.error("daily data fail :%s %s" % (item, err))
                continue
        return data

    def _get_next_date(self, year, month) -> dict:
        """Return the month after ``year``/``month`` as ``{'year', 'month'}``."""
        if month < 12:
            month += 1
        else:
            year += 1
            month = 1
        return {
            'year': year,
            'month': month
        }

    def _get_float(self, number: str):
        """Parse a price cell: 'X0.00' gives 0.0, '--' gives None."""
        if number.replace(',', '') == 'X0.00':
            return 0.0
        elif number == '--':
            return None
        else:
            return float(number.replace(',', ''))