Example #1
    # Requires: threading, urllib.request, urllib.parse, http.cookiejar,
    # plus the project's MySQLAlchemy helper and `report` model.
    def __init__(self, pages, Base, lastdate):
        threading.Thread.__init__(self)
        self.lastdate = lastdate
        self.thread = threading.Thread(target=self.run, name="Engine")

        self.pages = pages
        self.con = MySQLAlchemy(Base, report, "stock")
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        }
        # Log in to hibor.com.cn once; the CookieJar keeps the session cookie
        # for every later request made through self.opener.
        dlurl = 'http://www.hibor.com.cn/toplogin.asp?action=login'
        datapost = {"name": "xuzhipeng8", "pwd": 'xuzhipeng8261426', 'tijiao.x': '12', 'tijiao.y': '2',
                    'checkbox': 'on'}
        postdata = urllib.parse.urlencode(datapost).encode("utf-8")
        req = urllib.request.Request(dlurl, postdata, headers=self.headers)
        cjar = http.cookiejar.CookieJar()
        self.opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
        urllib.request.install_opener(self.opener)
        response = self.opener.open(req)
        data = response.read()
        # Save the post-login page so the login result can be inspected
        with open("pages.html", "wb") as f:
            f.write(data)
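The constructor above folds the whole login round trip into __init__. The cookie-based login pattern it uses can be pulled out into a small standalone helper; a minimal sketch using only the standard library (the URL, form fields, and headers would come from the snippet, and the function name is illustrative):

import http.cookiejar
import urllib.parse
import urllib.request

def make_logged_in_opener(login_url, form_fields, headers):
    # POST the login form once; the CookieJar attached to the opener
    # captures the session cookie from the response.
    jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
    body = urllib.parse.urlencode(form_fields).encode("utf-8")
    opener.open(urllib.request.Request(login_url, body, headers=headers)).read()
    return opener  # every later opener.open() call sends the cookie back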
Example #2
    def __init__(self, pages, Base, lastdate):
        threading.Thread.__init__(self)
        self.lastdate = lastdate
        self.thread = threading.Thread(target=self.run, name="Engine")

        self.pages = pages
        self.con = MySQLAlchemy(Base, report, "stock")
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        }
Example #3
    # Requires: pandas as pd, datetime.datetime, the WindPy client `w`,
    # plus the project's MySQLAlchemy helper and `factor` model.
    def __init__(self, tradate, indexCode, benchmark):
        self.indexCode = indexCode
        self.tradate = tradate
        self.benchmark = benchmark
        self.factor = None
        self.data = None
        self.indextimeseries = None
        # Pull the index constituents from Wind; build the DataFrame once and
        # derive the plain code list from it instead of constructing it twice.
        codelist = w.wset(
            'indexconstituent',
            "date=" + self.tradate + ";windcode=" + self.indexCode)
        self.codelist = pd.DataFrame(codelist.Data,
                                     columns=codelist.Codes,
                                     index=codelist.Fields,
                                     dtype=float).T
        self.list = self.codelist["wind_code"].tolist()

        self.tradedate2 = datetime.strptime(self.tradate,
                                            '%Y-%m-%d').strftime('%Y%m%d')
        # Previous and next monthly trading days around the trade date
        self.startdate = w.tdaysoffset(
            -1, self.tradate, "Period=M").Data[0][0].strftime('%Y%m%d')
        self.enddate = w.tdaysoffset(1, self.tradate,
                                     "Period=M").Data[0][0].strftime('%Y%m%d')
        self.con = MySQLAlchemy(Base, factor, "stock")
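A possible way to drive the constructor above, assuming the enclosing class is called Factor (the snippet does not show its name) and that a WindPy session is opened with w.start() first; the date and index codes are illustrative:

from WindPy import w

w.start()  # must succeed before any w.wset / w.tdaysoffset call
f = Factor("2018-01-31", "000300.SH", "000300.SH")  # hypothetical class name
print(f.list[:5])              # first few constituent wind codes
print(f.startdate, f.enddate)  # one-month trading-day window around the date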
Example #4
# Requires: threading, time, re, urllib.request/parse/error, http.cookiejar,
# pandas as pd, arrow, bs4.BeautifulSoup, plus the project's MySQLAlchemy
# helper and the `report` / `industrial` ORM models.
class ThreadUrl2(threading.Thread):
    def __init__(self, pages, Base, lastdate):
        threading.Thread.__init__(self)
        self.lastdate = lastdate
        self.thread = threading.Thread(target=self.run, name="Engine")

        self.pages = pages
        self.con = MySQLAlchemy(Base, report, "stock")
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        }

    def conn(self):
        # Log in (optionally through a local proxy) and build the
        # cookie-carrying opener that run() reuses for every page.
        dlurl = 'http://www.hibor.com.cn/toplogin.asp?action=login'
        datapost = {"name": "xuzhipeng8", "pwd": 'xuzhipeng8261426', 'tijiao.x': '12', 'tijiao.y': '2',
                    'checkbox': 'on'}
        postdata = urllib.parse.urlencode(datapost).encode("utf-8")
        req = urllib.request.Request(dlurl, postdata, headers=self.headers)
        cjar1 = http.cookiejar.CookieJar()
        try:

            # Route HTTPS traffic through a local proxy; adjust the address
            # or drop the handler entirely if no proxy is running.
            proxy = urllib.request.ProxyHandler({'https': '127.0.0.1:4973'})

            self.opener = urllib.request.build_opener(
                proxy, urllib.request.HTTPHandler,
                urllib.request.HTTPCookieProcessor(cjar1))

            urllib.request.install_opener(self.opener)
            response = self.opener.open(req)
            data = response.read()
            # Save the post-login page so the login result can be inspected
            with open("pages.html", "wb") as f:
                f.write(data)

        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            time.sleep(5)

        except Exception as e:
            print("exception: " + str(e))
            time.sleep(1)


    def run(self):
        errorlog = open('errorlog.txt', "a+")
        while True:
            url = self.pages.get()
            print("crawling", url)
            get_request = urllib.request.Request(url, headers=self.headers)
            try:
                data2 = self.opener.open(get_request).read()
            except Exception:
                print("waiting 5 seconds")
                time.sleep(5)
                try:
                    data2 = self.opener.open(get_request).read()
                except Exception:
                    # Retry failed twice: log the URL and move on
                    errorlog.write(url)
                    errorlog.write("\n")
                    continue

            soup_all = BeautifulSoup(data2, "html5lib")
            soup = soup_all.findAll('td', {"class": "td_spantxt"})
            soup_title = soup_all.findAll('span', {"class": "tab_lta"})

            # Columns: broker, industry name, title, date, category, author,
            # rating, page count (kept in Chinese to match the database schema)
            pddata = pd.DataFrame([], columns=["券商", "行业名称", "标题", "日期", "类别", "作者", "评级", "页数"])
            for i in range(len(soup)):
                # Keep only industry reports (titles containing "行业")
                if re.search(r"\w+行业", soup_title[i].text) is None:
                    continue

                pddata.loc[i] = [
                    re.search(r"\w+-", soup_title[i].text).group()[0:4],
                    re.search(r"\w+行业", soup_title[i].text).group(),
                    soup_title[i].text,
                    soup[i].find_all("span")[0].text,
                    "行业分析",
                    soup[i].find_all("span")[2].text[3:],
                    soup[i].find_all("span")[3].text[3:],
                    soup[i].find_all("span")[4].text[3:][:-1]
                ]
            deltatime = arrow.get(pddata["日期"].max(), "YYYY-MM-DD") - arrow.get(self.lastdate, "YYYY-MM-DD")
            print("crawled up to", pddata["日期"].max(), "- target date", self.lastdate)

            # Map each scraped row onto the `industrial` ORM model and insert
            reports = [industrial(name=pddata["券商"][j], industrial=pddata["行业名称"][j],
                                  title=pddata["标题"][j], date=pddata["日期"][j],
                                  classes=pddata["类别"][j], author=pddata["作者"][j],
                                  score=pddata["评级"][j], pages=int(pddata["页数"][j]))
                       for j in pddata.index.tolist()]
            self.con.insert(reports, 2)

            # Stop once we have crawled past the target date or drained the queue
            if deltatime.days < 0:
                errorlog.close()
                break
            if self.pages.empty():
                errorlog.close()
                break
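A sketch of how ThreadUrl2 might be wired up, assuming `pages` is a standard queue.Queue of listing URLs and that Base comes from the surrounding project; the URL pattern and cut-off date are illustrative:

import queue

pages = queue.Queue()
for n in range(1, 11):
    # hypothetical listing URLs; the real pattern depends on the site
    pages.put("http://www.hibor.com.cn/page_%d.html" % n)

t = ThreadUrl2(pages, Base, "2018-01-01")
t.conn()    # log in first so run() has a cookie-carrying opener
t.start()   # Thread.start() invokes run(), which drains the queue
t.join()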