def crawlDivident(code):
    # fetch the full dividend history of a US ticker from Xueqiu and cache it as JSON
    link = "https://stock.xueqiu.com/v5/stock/f10/us/bonus.json?symbol=%s&size=10000&page=1&extend=true" % code
    session = HTMLSession()
    r = session.get(link, headers=HEADERS, cookies=COOKIES)

    content = json.dumps(json.loads(r.content))
    path = "C:/project/stockdata/USDivident/%s.json" % code
    write2File(path, content)
    FileLogger.info("get divident of code: %s in size: %d" %
                    (code, len(content)))

def crawlCashflow(code):
    # fetch the full cash-flow statements of a US ticker from Xueqiu and cache them as JSON
    link = "https://stock.xueqiu.com/v5/stock/finance/us/cash_flow.json?symbol=%s&type=all&is_detail=true&count=1000&timestamp=1616585707592" % code
    session = HTMLSession()
    r = session.get(link, headers=HEADERS, cookies=COOKIES)

    content = json.dumps(json.loads(r.content))
    path = "C:/project/stockdata/USCashflow/%s.json" % code
    write2File(path, content)
    FileLogger.info("get cashflow of code: %s in size: %d" %
                    (code, len(content)))
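
These snippets lean on a few shared helpers (HEADERS, COOKIES, FileLogger, write2File, readFile, getJsonFromFile) defined elsewhere in the repo. A minimal sketch of what they might look like; the header values, log file name, and UTF-8 encoding are assumptions:

import json
import logging

# assumed request headers/cookies; Xueqiu rejects requests that lack a
# browser-like User-Agent and a valid session cookie
HEADERS = {"User-Agent": "Mozilla/5.0"}
COOKIES = {}  # fill in a logged-in Xueqiu session cookie here

# a plain file-backed logger standing in for FileLogger
FileLogger = logging.getLogger("crawler")
FileLogger.setLevel(logging.INFO)
FileLogger.addHandler(logging.FileHandler("crawler.log", encoding="utf-8"))


def write2File(path, content, mode="w"):
    # write text content to path, overwriting by default
    with open(path, mode, encoding="utf-8") as f:
        f.write(content)


def readFile(path):
    # return the file content as text, or None if the file is missing
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return None


def getJsonFromFile(path):
    # parse a JSON file into a Python object
    with open(path, encoding="utf-8") as f:
        return json.load(f)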
Example #3
def crawlUSStocks():
    # crawl daily history for every US ticker exported from the DB
    # query used to build the code CSV: select ts_code from usstock.stocklist;
    stockList = pd.read_csv("C:/project/Tushare/usstock/code.csv").to_numpy()

    for code in stockList:
        FileLogger.info("running on code: " + code[0])
        try:
            crawlHistory(code[0])
            time.sleep(1)
        except Exception as ex:
            FileLogger.error(ex)
            FileLogger.error("crawl error on code: %s" % code)
            time.sleep(5)
Example #4
def parseIncomeBase(code):
    # extract the non-list base fields from a cached income JSON into the global incomeBaseDF
    FileLogger.info("running on code: %s" % code)
    path = "C:/project/stockdata/USIncome/%s.json" % code
    text = readFile(path)

    if text:
        jsonObj = json.loads(text)
        jsonData = jsonObj['data']
        del jsonData['list']
        jsonData['ts_code'] = code

        global incomeBaseDF
        # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead
        incomeBaseDF = pd.concat([incomeBaseDF, pd.DataFrame([jsonData])], ignore_index=True)
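
parseIncomeBase mutates a module-level DataFrame, so a driver has to initialize it first. A hypothetical driver, assuming pandas is imported as pd as elsewhere in these snippets; the ticker codes and the output path are made up for illustration:

incomeBaseDF = pd.DataFrame()

for code in ["AAPL", "MSFT"]:  # hypothetical codes
    parseIncomeBase(code)

# persist the collected base info in one pass
incomeBaseDF.to_csv("C:/project/stockdata/USIncome/income_base.csv", index=False)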
Example #5
def gettodayStock():
    # crawl today's US stock list, retrying every 60 seconds until it succeeds
    curDate = time.strftime("%Y%m%d", time.localtime())
    tryagain = True
    while tryagain:
        try:
            content = crawlLatestUsStocks()
            if content:
                path = "C:/project/stockdata/USDay/%s.txt" % curDate
                write2File(path, content, mode="w")
                FileLogger.info("crawl stock list successfully on date:" +
                                curDate)
                tryagain = False
            else:
                time.sleep(60)
        except Exception as ex:
            FileLogger.error(ex)
            FileLogger.error("crawl stock list error, retry in 60 seconds")
            time.sleep(60)
Example #6
def crawlHistory(code) -> bool:
    # fetch the full daily k-line history (plus valuation indicators) for one ticker from Xueqiu
    link = "https://stock.xueqiu.com/v5/stock/chart/kline.json?symbol=%s&begin=1616585707592&period=day&type=before&count=-100000&indicator=kline,pe,pb,ps,pcf,market_capital,agt,ggt,balance" % code
    session = HTMLSession()
    r = session.get(link, headers=HEADERS, cookies=COOKIES)

    jsonObj = json.loads(r.content)
    if jsonObj['error_code'] != 0 or "column" not in jsonObj["data"] or "item" not in jsonObj["data"]:
        FileLogger.error("get content error from: %s" % code)
        return False

    columns = jsonObj["data"]["column"]
    items = jsonObj["data"]["item"]

    if len(items) > 0:
        path = "C:/project/stockdata/UShistory/%s.csv" % code
        save2csv(columns, items, path)

    FileLogger.info("get %d lines from code: %s" % (len(items), code))
    return True
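
save2csv is not shown; a minimal sketch with the standard csv module, assuming columns is a list of field names and items a list of row lists:

import csv


def save2csv(columns, items, path):
    # write one header row followed by the k-line rows
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        writer.writerows(items)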
Example #7
def crawlStockNotices(code, orgId):
    # page through all cninfo announcements for one stock and cache them as one JSON file
    records = []

    link = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    session = HTMLSession()
    data = POSTDATA.copy()
    data["stock"] = "%s,%s" % (code, orgId)
    r = session.post(link, data=data, headers=HEADERS)
    if r.content:
        jsonContent = json.loads(r.content)
        totalpages = jsonContent["totalpages"]
        announcements = jsonContent["announcements"]
        records.extend(announcements)
        FileLogger.info("get records on code: %s of totalPages:%d" %
                        (code, totalpages))

        # page 1 was fetched above; go one page past totalpages, since an
        # empty trailing page is simply skipped by the check below
        for pageNum in range(2, totalpages + 2):
            time.sleep(0.1)
            data["pageNum"] = pageNum
            r = session.post(link, data=data, headers=HEADERS)
            if r.content:
                jsonContent = json.loads(r.content)
                announcements = jsonContent["announcements"]
                if announcements is not None and len(announcements) > 0:
                    records.extend(announcements)
                FileLogger.info("get records on pageNum: %d" % pageNum)

        FileLogger.info("get %d records on code: %s" % (len(records), code))

    if len(records) != 0:
        content = json.dumps(records)
        path = "C:/project/stockdata/StockNotices/%s.json" % code
        write2File(path, content)
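
POSTDATA is defined elsewhere; a sketch of the form fields the cninfo hisAnnouncement/query endpoint is commonly called with. The exact values here are assumptions:

POSTDATA = {
    "pageNum": 1,
    "pageSize": 30,
    "column": "szse",      # market column
    "tabName": "fulltext",
    "plate": "",
    "stock": "",           # filled in as "code,orgId" per request
    "searchkey": "",
    "secid": "",
    "category": "",
    "trade": "",
    "seDate": "",          # optional date range, e.g. "2000-01-01~2021-12-31"
}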
Example #8
def crawlBalance(code, companyType):
    # pull balance-sheet reports from East Money, one batch of report dates per request
    records = []
    for date in DATES:
        link = "http://f10.eastmoney.com/NewFinanceAnalysis/zcfzbAjaxNew?companyType=%d&reportDateType=0&reportType=1&dates=%s&code=%s" % (
            companyType, date, code)
        session = HTMLSession()
        r = session.get(link, headers=HEADERS)
        jsonContent = json.loads(r.content)
        if "data" not in jsonContent:
            FileLogger.info("no more data on %s at dates: %s" % (code, date))
            break
        records.extend(jsonContent["data"])

        FileLogger.info("get balance of code: %s in size: %d" %
                        (code, len(jsonContent["data"])))
        # time.sleep(0.5)

    if len(records) != 0:
        content = json.dumps(records)
        path = "C:/project/stockdata/EastMoneyBalance/%s.json" % code
        write2File(path, content)
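
DATES is assumed to hold comma-joined batches of quarter-end report dates, since the East Money endpoint accepts several dates per call. A hypothetical way to build it:

# hypothetical: quarter ends from 2020 back to 2016, five dates per request
quarterEnds = [
    "%d-%s" % (year, md)
    for year in range(2020, 2015, -1)
    for md in ("12-31", "09-30", "06-30", "03-31")
]
DATES = [",".join(quarterEnds[i:i + 5]) for i in range(0, len(quarterEnds), 5)]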
Example #9
        FileLogger.info("get %d records on code: %s" % (len(records), code))

    if len(records) != 0:
        content = json.dumps(records)
        path = "C:/project/stockdata/StockNotices/%s.json" % code
        write2File(path, content)


if __name__ == "__main__":
    stockList = getJsonFromFile("C:/project/stockdata/StockNotices/stock.json")
    stockList = stockList["stockList"]

    # stockList = [{"orgId":"9900002701","category":"A股","code":"002127","pinyin":"njds","zwjc":"南极电商"}]

    for stock in stockList:
        FileLogger.info("running on stock: %s(%s)" %
                        (stock["zwjc"], stock["code"]))
        filePath = "C:/project/stockdata/StockNotices/%s.json" % stock['code']
        if os.path.exists(filePath):
            continue

        try:
            crawlStockNotices(stock["code"], stock["orgId"])
            time.sleep(1)

        except Exception as ex:
            FileLogger.error(ex)
            FileLogger.error("crawl balance error on code: %s" % stock["code"])
            time.sleep(3)
Example #10
    stockdf = pd.read_csv(
        "C:/project/Tushare/eastmoney/codewithcompanytype.csv")
    stockList = stockdf[['ts_code', 'companytype']].to_numpy()

    # debug override: run on a single code while testing
    stockList = [['SZ000002', 4]]
    # stockList = [['SZ300144', 4]]

    # add the base info into DB
    for item in stockList:
        code = item[0]
        companyType = item[1]
        # need to process companyType 1-3
        if companyType != 4:
            continue

        FileLogger.info("running on code: %s" % code)
        # try:
        incomedf = dataGetter.getDataFromIncome(code)
        incomedf = incomedf.set_index("REPORT_DATE")
        incomedf = processor.keepOnlyYearData(incomedf).fillna(0)
        balancedf = dataGetter.getDataFromBalance(code)
        balancedf = balancedf.set_index("REPORT_DATE")
        balancedf = processor.keepOnlyYearData(balancedf).fillna(0)

        # rate = getIncomeYoY(code, incomedf)
        # rate = getGrossProfitRate(code, incomedf)
        # rate = getNetProfitRate(code, incomedf)
        # rate = getOperateProfitRate(code, incomedf)
        # rate = getProfitRate(code, incomedf)
        # rate = getOperateTaxRate(code, incomedf)
        # rate = getSalesRate(code, incomedf)
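
None of the rate helpers above are shown. A hypothetical sketch of getGrossProfitRate; the East Money column names TOTAL_OPERATE_INCOME and OPERATE_COST are assumptions:

def getGrossProfitRate(code, incomedf):
    # gross margin = (revenue - cost of revenue) / revenue, per report date
    revenue = incomedf["TOTAL_OPERATE_INCOME"]
    cost = incomedf["OPERATE_COST"]
    rate = (revenue - cost) / revenue
    FileLogger.info("gross profit rate of %s:\n%s" % (code, rate))
    return rate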
Example #11
def retrieveAnualQuarterlyReport():
    # scan the cached cninfo notices, keep quarterly and annual reports, and index them in MySQL
    stockList = getJsonFromFile("C:/project/stockdata/StockNotices/stock.json")
    stockList = stockList["stockList"]

    stockList = [{"orgId":"9900002701","category":"A股","code":"002127","pinyin":"njds","zwjc":"南极电商"}]
    # stockList = [{"orgId":"gssz0000002","category":"A股","code":"000002","pinyin":"njds","zwjc":"万科A"}]

    for stock in stockList:
        FileLogger.info("running on stock: %s(%s)" % (stock["zwjc"], stock["code"]))
    
        try:
            filePath = "C:/project/stockdata/StockNotices/%s.json" % stock['code']
            jsonList = getJsonFromFile(filePath)

            annualDf = None
            for jsonObj in jsonList:
                announcementType = jsonObj['announcementType']
                fileType = jsonObj['adjunctType']

                # determine the report type: Q1, half-year, Q3 or annual report
                # announcementType codes: {'01030501': Q1 report full text, '01030701': Q3 report, '01030301': half-year report, '01030101': annual report full text}
                noticeType = None
                if announcementType.find("01030101") != -1: 
                    noticeType = "年报"
                elif announcementType.find("01030701") != -1:
                    noticeType = "三季度报"
                elif announcementType.find("01030301") != -1:
                    noticeType = "半年报"
                elif announcementType.find("01030501") != -1:
                    noticeType = "一季度报"

                if noticeType is not None and fileType in ('PDF', 'PDF ', 'pdf'):
                    FileLogger.info("downloading file: %s" % jsonObj["announcementTitle"])
                    noticeDay = jsonObj['adjunctUrl'][10:20]
                    url = "http://www.cninfo.com.cn/new/announcement/download?bulletinId=%s&announceTime=%s" % (jsonObj['announcementId'], noticeDay)
                    
                    annualData = {
                        'code': jsonObj['secCode'],
                        'name': jsonObj['secName'],
                        'announcementId': jsonObj['announcementId'],
                        'title': jsonObj['announcementTitle'], 
                        'noticeDay': noticeDay,
                        'fileType': jsonObj['adjunctType'],
                        'url': url, 
                        'Type': noticeType, 
                        # an annual report is published the year after the period it covers
                        'year': int(noticeDay[0:4])-1 if noticeType == "年报" else int(noticeDay[0:4])
                    }
                    if annualDf is None:
                        annualDf = pd.DataFrame(columns=annualData.keys())
                    # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead
                    annualDf = pd.concat([annualDf, pd.DataFrame([annualData])], ignore_index=True)
        
            # save to DB
            from sqlalchemy import create_engine
            ENGINE = create_engine("mysql+pymysql://root:4401821211@localhost:3306/eastmoney?charset=utf8")
            annualDf.to_sql(name="reportbasic", con=ENGINE, if_exists="append")

        except Exception as ex:
            FileLogger.error(ex)
            FileLogger.error("retrieve error on code: %s" % stock["code"])
            time.sleep(3)
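
Building a new SQLAlchemy engine inside the per-stock loop is wasteful; a sketch of hoisting it to module level and reusing it. Same connection string as above; saveReportBasic is a hypothetical wrapper:

from sqlalchemy import create_engine

# build the engine once and reuse it for every to_sql call
ENGINE = create_engine(
    "mysql+pymysql://root:4401821211@localhost:3306/eastmoney?charset=utf8")


def saveReportBasic(annualDf):
    # append the parsed report rows to the reportbasic table
    annualDf.to_sql(name="reportbasic", con=ENGINE, if_exists="append", index=False)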
Example #12
if __name__ == "__main__":
    # http://www.cninfo.com.cn/new/announcement/bulletin_detail?announceId=13519195&flag=true&announceTime=2004-01-17

    # retrieveAnualQuarterlyReport()

    # np.str was removed in NumPy 1.20+; plain str keeps codes and years as text
    stockdf = pd.read_csv("C:/project/stockdata/StockNoticesFile/annualreportlist.csv", dtype={'code': str, 'year': str})
    # stockdf = stockdf[stockdf['code'] == '000002']
    stockList = stockdf[['code', 'name', 'year', 'announcementId', 'url']].to_numpy()

    # stockList = stockList[1:3]
    
    try:
        for stock in stockList: 
            fileName = "[%s]%s年报-%s" % (stock[1], stock[2], stock[3])
            savePath = "C:/project/stockdata/StockNoticesFile/pdf/%s.pdf" % fileName
            # make sure it's a valid path, no \/:?*"<>|
            savePath = savePath.replace("*", "")
            unresolvedPath = "C:/project/stockdata/StockNoticesFile/unresolved/%s.pdf" % fileName

            url = stock[4]
            if os.path.exists(savePath) or os.path.exists(unresolvedPath):
                FileLogger.info("file %s exists, skip!" % fileName)
            else:
                FileLogger.info("downloading file: %s" % fileName)
                downloadFile(url, savePath) 

    except Exception as ex:
        FileLogger.error(ex)
        FileLogger.error("download error on file: %s" % fileName)
        time.sleep(3)
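
downloadFile is not shown; a minimal sketch with requests, streaming the response so large PDFs never sit in memory. The timeout and chunk size are assumptions:

import requests


def downloadFile(url, savePath, chunkSize=8192):
    # stream the PDF body straight to disk
    with requests.get(url, headers=HEADERS, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(savePath, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunkSize):
                f.write(chunk)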