Beispiel #1
0
def _resolve_station_alias(station_name):
    """Return the alias to store for *station_name*.

    Runs the GET_ALIA_STATION query for the station; when it yields no rows,
    falls back to py_util.hanzi2pinyin_split applied to the station name.
    """
    rows = sqlSessionFactory.select(GET_ALIA_STATION % station_name)
    if rows:
        # The alias is the first column of the first matching row.
        return rows[0][0]
    return py_util.hanzi2pinyin_split(string=station_name, split="", firstcode=False)


def main():
    """Insert an aliased station pair for every row returned by GET_STATION.

    Repeatedly selects rows with GET_STATION until the query returns nothing.
    For each row (id, start station, end station) it resolves an alias for
    both stations, executes INSERT_S2S with the names and aliases, and then
    executes UPDATE_S2S for the row id.

    A failed insert is reported and skipped; UPDATE_S2S is still executed for
    that row, exactly as before.
    NOTE(review): the loop only terminates if UPDATE_S2S removes the row from
    the GET_STATION result set -- confirm against the SQL definitions.
    """
    sqlSessionFactory('172.16.19.203','data','opensesame','img_upload',20)
    while True:
        pending = sqlSessionFactory.select(GET_STATION)
        if not pending:
            break
        for row in pending:
            row_id = row[0]
            start_station = row[1]
            end_station = row[2]
            s_alia = _resolve_station_alias(start_station)
            e_alia = _resolve_station_alias(end_station)
            try:
                # NOTE(review): the statement is built with % formatting, so
                # the values are not escaped by this code.
                insert_sql = INSERT_S2S % (start_station, s_alia, end_station, e_alia)
                print(insert_sql)
                sqlSessionFactory.execute(insert_sql)
            except Exception as exc:
                # Best effort: report the failure instead of hiding it, and
                # let KeyboardInterrupt/SystemExit propagate.
                print('insert failed for row %r: %r' % (row_id, exc))
            sqlSessionFactory.execute(UPDATE_S2S % row_id)
Beispiel #2
0
def main():
    """Set up the shared DB factory and HTTP helper, then create the schedule center.

    Assigns the module-level globals ``factory`` and ``http_util``.
    """
    global factory, http_util

    # 1. Detect a usable proxy.
    factory = sqlSessionFactory('192.100.2.31', 'data', 'opensesame',
                                'traincrawler', 10)
    url = 'http://api.sports.sina.com.cn/?p=nba&s=match&a=dateMatches&format=json&callback=NBA_JSONP_CALLBACK&date=2017-03-30&dpc=1'
    proxy_util = ProxyUtil(url)
    proxy_util.jobStart(factory)
    http_util = httpUtil(proxy_util.getProxy())

    # 2. The schedule center starts crawling.
    # NOTE(review): `center` is not used again in the visible part of this function.
    center = ScheduleCenter(20)
Beispiel #3
0
        content = html_old.read()
        html_old.close()
        parse_ss_json(content, train_no)
    print total


def part_main():
    """Re-parse the saved pages of a fixed list of trains.

    For every train number in the hard-coded list, reads
    ``<base_train_list_path><train>.html`` as UTF-8, prints the train number,
    and passes the page content and train number to parse_ss_json.
    """
    # Alternative train list, currently disabled:
    # temp_trains = ['6267','6285','C6501','C6505','C6509','D5113','D5517','D6410','D6412','D6418','D6422','D8324','D8386','D932','D936','D942','G1839','G1840','G1855','G1857','K1169','K1321','K1322','K1349','K1351','K1353','K1531','K1533','K1617','K1807','K1808','K1888','K2368','K2615','K2617','K2618','K4332','K4333','K4346','K4347','K5156','K5158','K59','K61','K7222','K7307','K7308','K749','K789','K791','K792','K803','K804','K805','K8483','K861','K863','K9069','K9070','K9505','K9507','K9508','K9510','T153','T283','T8317','T8327','T8711','T9591','T9594']
    temp_trains = [
        'C6903', 'C6905', 'C6907', 'C6909', 'C6911', 'C6913', 'C6915', 'C6917',
        'C6919', 'C6921', 'C6923', 'C6925', 'C6953', 'C6955', 'C6957', 'C6959',
        'C6961', 'C6963', 'C6965', 'C6967', 'C6969', 'C6971', 'C6973', 'C6975',
        'C6902', 'C6904', 'C6906', 'C6908', 'C6910', 'C6912', 'C6918', 'C6920',
        'C6922', 'C6924', 'C6926', 'C6952', 'C6954', 'C6956', 'C6958', 'C6960',
        'C6962', 'C6966', 'C6968', 'C6970', 'C6972', 'C6974', 'C6976', 'C6929',
        'C6914', 'C6928', 'C6930', 'C6978', 'C6964', 'C6980', 'C6977', 'C6979',
        'C6981'
    ]
    for train_no in temp_trains:
        file_name = '%s%s.html' % (base_train_list_path, train_no)
        # The context manager closes the file even if read() raises.
        with codecs.open(file_name, 'r', 'utf-8') as html_old:
            content = html_old.read()
        print(train_no)
        parse_ss_json(content, train_no)


# Script entry point: call sqlSessionFactory with the connection settings
# first, then run main().
# NOTE(review): the arguments are presumably host, user, password, database
# and a pool/connection count -- confirm against the sqlSessionFactory
# definition. The credentials are hard-coded here.
if __name__ == '__main__':
    sqlSessionFactory('172.16.19.203', 'data', 'opensesame', 'img_upload', 20)
    main()
Beispiel #4
0
from MysqlUtil import sqlSessionFactory
import Queue
import time
import threading
import codecs
from lxml import etree
import sys
reload(sys)
import os
# Force the interpreter-wide default string encoding to UTF-8 (Python 2 only).
# reload(sys) is called before sys.setdefaultencoding because, in Python 2,
# that function is removed from the sys module at interpreter start-up.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
# Path template with three %s placeholders for a saved HTML page.
# NOTE(review): what each placeholder holds is not visible here -- confirm
# against the code that formats this template.
base_path = 'E:/XC/xc-price-html/%s-%s-%s.html'
# Query returning a single row from the task table.
# NOTE(review): the table name contains a double underscore -- confirm it
# matches the real schema.
getPageNameSql = "select * from train_stop__task_xc limit 1"
# Module-level sqlSessionFactory instance, created at import time.
# NOTE(review): arguments are presumably host, user, password, database and a
# pool/connection count -- confirm against MysqlUtil.
myDao = sqlSessionFactory('192.100.2.31', 'data', 'opensesame', 'traincrawler',
                          10)


def praseHtml(filePath):
    try:
        html_io = codecs.open(filePath, 'r', 'gb2312')
        content = html_io.read()
        html_io.flush()
        html_io.close()
        tree = etree.HTML(content)
        tbodys = tree.xpath(
            "descendant::table[@class='tb_railway_list']/tbody")
        trs = tbodys[1].xpath("tr")
        for tr in trs:
            tds = tr.xpath("td")
            tds[1].xpath("")