Example #1
0
def urlCrawler(urlQueue, logQueue):
    rooturl =  {
        'lawfirm':  'http://www.legalminer.com/search/lawfirm?t=',
        'lawyer':   'http://www.legalminer.com/search/lawyer?t=',
        'court':    'http://www.legalminer.com/search/court?t=',
        'judge':    'http://www.legalminer.com/search/judge?t=',
        'corporate':'http://www.legalminer.com/search/corporate?t='
    }
    rootNum = {}
    for key, url in rooturl.items():
        content = downLoad(url)
        tree = etree.HTML(content)
        resultNumNode = tree.xpath(urlPattern.resultNumXpath)
        rootNum[key] = urlPattern.extractNum(resultNumNode[0].text)
    #每个链接创建一个生产线程提取url
    threadQueue = TQueue()
    ThreadLst = []
    for key, value in rootNum.items():
        extractor = Extractor()
        extractor.setResultNum(key, int(value), threadQueue)
        ThreadLst.append(extractor)
    transmit = Transmit()
    transmit.setEndNum(len(rootNum), threadQueue, urlQueue, CONTENT_CRAWLER_NUM)
    ThreadLst.append(transmit)
    for e in ThreadLst:
        e.start()
    for e in ThreadLst:
        e.join()
def processCourt(urlC, queue, conn):
    #print 'processCourt'
    content = downLoad(urlC)

    xm_path = "/html/body[@class='lvzhi court']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw heading']/div[@class='title']"
    ssaj_path = "/html/body[@class='lvzhi court']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='info-cell pull-right']/div[@class='cell']/a[@class='rw val textLink']"

    tree=etree.HTML(content)

    node_xm = tree.xpath(xm_path)
    node_ssaj = tree.xpath(ssaj_path)

    reLst = []

    reLst.append(urlC)
    court_name = u'未知'
    case_number = 0

    if node_xm[0].text != None:
        court_name = node_xm[0].text.strip()
    if node_ssaj[0].text != None:
        case_number = extractNum(node_ssaj[0].text)

    reLst.append(court_name)
    reLst.append(case_number)

    try:
        cur=conn.cursor()
        cur.execute('insert into court values(%s, %s, %s)', reLst)
        conn.commit()
        cur.close()
    except MySQLdb.Error,e:
        queue.put(urlC)
 def run(self):
     idPattern = re.compile(r'<a class="title s" href="(.*?)">', re.M)
     urlPrefix = 'http://www.legalminer.com'
     startPage = 1
     endPage = self.resultNum/10 + 2 #开区间
     for pNum in range(startPage, endPage):
         urlPattern = 'http://www.legalminer.com/ajax_search/get_html?t=&page=%d&searchType=%s' % (pNum, self.lastType)
         content = downLoad(urlPattern)
         contentDict = json.loads(content)
         for html in contentDict['result']['html']:
             sr = idPattern.search(html)
             lid = sr.group(1)
             urlC = urlPrefix + lid
             #传入队列中
             self.queue.put(urlC)
     #结束
     self.queue.put('OVER')
def processLawFirm(urlC, queue, conn):
    #print 'processLawFirm'
    content = downLoad(urlC)

    mc_path="/html/body[@class='lvzhi lawfirm']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw heading']/div[@class='title']"
    zh_path="/html/body[@class='lvzhi lawfirm']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='rw val'][1]"
    dz_path="/html/body[@class='lvzhi lawfirm']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='rw val'][2]"
    dh_path="/html/body[@class='lvzhi lawfirm']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='rw val'][3]"
    cz_path="/html/body[@class='lvzhi lawfirm']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='rw val'][4]"
    gw_path="/html/body[@class='lvzhi lawfirm']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='rw val'][5]/a"
    ssaj_path="/html/body[@class='lvzhi lawfirm']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='info-cell pull-right']/div[@class='cell'][1]/a[@class='rw val textLink']"
    slsj_path="/html/body[@class='lvzhi lawfirm']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='info-cell pull-right']/div[@class='cell'][2]/div[@class='rw val']"
    zyzt_path="/html/body[@class='lvzhi lawfirm']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='info-cell pull-right']/div[@class='cell'][3]/div[@class='rw val']"
    fzr_path="/html/body[@class='lvzhi lawfirm']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='info-cell pull-right']/div[@class='cell'][4]/div[@class='rw val']"

    tree=etree.HTML(content)

    pattern_zh = re.compile(r'<div class="rw val"><i class="fa fa-info-circle icon highlight"></i>(.*?)</div>',re.M)
    pattern_dz = re.compile(r'<div class="rw val"><i class="fa fa-location-arrow icon highlight"></i>(.*?)</div>',re.M)
    pattern_dh = re.compile(r'<div class="rw val"><i class="fa fa-phone icon highlight"></i>(.*?)</div>',re.M)
    pattern_cz = re.compile(r'<div class="rw val"><i class="fa fa-print icon highlight"></i>(.*?)</div>',re.M)
    pattern_gw = re.compile(r'<div class="rw val"><a href="(.*?)" target="_blank"><i class="fa fa-external-link icon highlight"></i>官网链接</a></div>',re.M)
    pattern_lssl = re.compile(r'>([0-9]*?)位律师</a>',re.M)

    node_mc = tree.xpath(mc_path)
    node_ssaj = tree.xpath(ssaj_path)
    node_slsj = tree.xpath(slsj_path)
    node_zyzt = tree.xpath(zyzt_path)
    node_fzr = tree.xpath(fzr_path)

    reLst = []

    match1 = pattern_zh.search(content)
    match2 = pattern_dz.search(content)
    match3 = pattern_dh.search(content)
    match4 = pattern_cz.search(content)
    match5 = pattern_gw.search(content)
    match6 = pattern_lssl.search(content)
    
    reLst.append(urlC)
    law_firm_name = u'未知';
    law_firm_lawyer_number = 0;
    occupation_number = u'未知';
    address = u'未知';
    phone_number = u'未知';
    fax_number = u'未知';
    link = u'未知'
    case_number = 0
    history = 0
    state = u'未知'
    principle = u'未知'
    if node_mc[0].text != None:
        law_firm_name = node_mc[0].text.strip().replace('\n','').replace('\t','').replace('\r','').split('  ')[0].strip()
    if match6 != None:
        law_firm_lawyer_number = int(match6.group(1))
    if match1 != None:
        occupation_number = match1.group(1)
        occupation_number = occupation_number if occupation_number.find(' ') == -1 else occupation_number.split(' ')[1]
    if match2 != None:
        address = match2.group(1)
    if match3 != None:
        phone_number = match3.group(1)
    if match4 != None:
        fax_number = match4.group(1)
    if match5 != None:
        link = match5.group(1)
    if node_ssaj[0].text!=None:
        case_number = extractNum(node_ssaj[0].text.strip())
    if node_slsj[0].text!=None:
        history = extractNum(node_slsj[0].text.strip())
    if node_zyzt[0].text!=None:
        state = node_slsj[0].text.strip()
    if node_fzr[0].text!=None:
        principle = node_fzr[0].text.strip()

    reLst.append(law_firm_name)
    reLst.append(law_firm_lawyer_number)
    reLst.append(occupation_number)
    reLst.append(address)
    reLst.append(phone_number)
    reLst.append(fax_number)
    reLst.append(link)
    reLst.append(case_number)
    reLst.append(history)
    reLst.append(state)
    reLst.append(principle)

    try:
        cur=conn.cursor()
        cur.execute('insert into lawfirm values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', reLst)
        conn.commit()
        cur.close()
    except MySQLdb.Error,e:
        queue.put(urlC)
def processCorporate(urlC, queue, conn):
    #print 'processCorporate'
    content = downLoad(urlC)

    gspath="/html/body[@class='lvzhi corporate']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw heading']/div[@class='title']"
    gppath="/html/body[@class='lvzhi corporate']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='col']/div[@class='rw val'][1]"
    dzpath="/html/body[@class='lvzhi corporate']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='col']/div[@class='rw val'][2]"
    dhpath="/html/body[@class='lvzhi corporate']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='col']/div[@class='rw val'][3]"
    gwpath="/html/body[@class='lvzhi corporate']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='col']/div[@class='rw val'][4]/a"
    sjpath="/html/body[@class='lvzhi corporate']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list pull-right']/div[@class='col']/div[@class='rw val'][1]"
    dbpath="/html/body[@class='lvzhi corporate']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list pull-right']/div[@class='col']/div[@class='rw val'][2]"
    hypath="/html/body[@class='lvzhi corporate']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list pull-right']/div[@class='col']/div[@class='rw val'][3]"

    tree=etree.HTML(content)

    node_gs = tree.xpath(gspath)
    node_gp = tree.xpath(gppath)
    node_dz = tree.xpath(dzpath)
    node_dh = tree.xpath(dhpath)
    node_gw = tree.xpath(gwpath)
    node_sj = tree.xpath(sjpath)
    node_db = tree.xpath(dbpath)
    node_hy = tree.xpath(hypath)

    reLst = []

    reLst.append(urlC)
    corporate_name = u'未知'
    stock_id = u'未知'
    address = u'未知'
    phone = u'未知'
    link = u'未知'
    ipo_time = u'未知'
    principle = u'未知'
    industry = u'未知'

    if node_gs[0].text != None:
        corporate_name = node_gs[0].text
    if node_gp[0].text != None:
        stock_id = node_gp[0].text
    if node_dz[0].text != None:
        address = node_dz[0].text
    if len(node_dh) != 0:
        phone = "".join(node_dh[0].itertext()).strip().replace('\n','').replace('\t','').replace('\r','')
    if len(node_gw) != 0:
        link = node_gw[0].attrib.get('href','None')
    if len(node_sj) != 0:
        ipo_time = "".join(node_sj[0].itertext()).strip().replace('\n','').replace('\t','').replace('\r','')
    if len(node_db) != 0:
        principle = "".join(node_db[0].itertext()).strip().replace('\n','').replace('\t','').replace('\r','')
    if len(node_hy) != 0:
        industry = "".join(node_hy[0].itertext()).strip().replace('\n','').replace('\t','').replace('\r','')

    reLst.append(corporate_name)
    reLst.append(stock_id)
    reLst.append(address)
    reLst.append(phone)
    reLst.append(link)
    reLst.append(ipo_time)
    reLst.append(principle)
    reLst.append(industry)

    try:
        cur=conn.cursor()
        cur.execute('insert into corporate values(%s, %s, %s, %s, %s, %s, %s, %s, %s)', reLst)
        conn.commit()
        cur.close()
    except MySQLdb.Error,e:
        queue.put(urlC)
def processLawyer(urlC, queue, conn):
    #print 'processLawyer'
    content = downLoad(urlC)

    xm_path="/html/body[@class='lvzhi lawyer']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw heading']/div[@class='title']"
    zyzh_path="/html/body[@class='lvzhi lawyer']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='rw val highlight']"
    lsmc_path="/html/body[@class='lvzhi lawyer']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='list']/div[@class='rw val'][1]"
    ssaj_path="/html/body[@class='lvzhi lawyer']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='info-cell pull-right']/div[@class='cell'][1]/a[@class='rw val textLink']"
    jy_path="/html/body[@class='lvzhi lawyer']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='info-cell pull-right']/div[@class='cell'][2]/div[@class='rw val']"
    xl_path="/html/body[@class='lvzhi lawyer']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='info-cell pull-right']/div[@class='cell'][3]/div[@class='rw val']"
    xb_path="/html/body[@class='lvzhi lawyer']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='info-cell pull-right']/div[@class='cell'][4]/div[@class='rw val']"
    zyzt_path="/html/body[@class='lvzhi lawyer']/div[@id='bodyWrapper']/div[@id='content']/div[@id='basicInfo']/div[@class='content row']/div[@class='info col-sm-10']/div[@class='rw']/div[@class='info-cell pull-right']/div[@class='cell'][5]/div[@class='rw val']"
    
    tree=etree.HTML(content)

    node_xm = tree.xpath(xm_path)
    node_zyzh = tree.xpath(zyzh_path)
    node_lsmc = tree.xpath(lsmc_path)
    node_ssaj = tree.xpath(ssaj_path)
    node_jy = tree.xpath(jy_path)
    node_xl = tree.xpath(xl_path)
    node_xb = tree.xpath(xb_path)
    node_zyzt = tree.xpath(zyzt_path)

    reLst = []

    reLst.append(urlC)
    lawyer_name = u'未知'
    occupation_number = u'未知'
    law_firm_name = u'未知'
    case_number = 0
    history = 0
    education = u'未知'
    gender = u'未知'
    state = u'未知'

    if node_xm[0].text != None:
        lawyer_name = node_xm[0].text.split(' ')[0]
    if node_zyzh[0].text != None:
        occupation_number = node_zyzh[0].text.split(' ')[1]
    if node_lsmc[0].text != None:
        law_firm_name = node_lsmc[0].text
    if node_ssaj[0].text != None:
        case_number = extractNum(node_ssaj[0].text)
    if node_jy[0].text != None:
        history = extractNum(node_jy[0].text)
    if node_xl[0].text != None:
        education = node_xl[0].text
    if node_xb[0].text != None:
        gender = node_xb[0].text
    if node_zyzt[0].text != None:
        state = node_zyzt[0].text

    reLst.append(lawyer_name)
    reLst.append(occupation_number)
    reLst.append(law_firm_name)
    reLst.append(case_number)
    reLst.append(history)
    reLst.append(education)
    reLst.append(gender)
    reLst.append(state)

    try:
        cur=conn.cursor()
        cur.execute('insert into lawyer values(%s, %s, %s, %s, %s, %s, %s, %s, %s)', reLst)
        conn.commit()
        cur.close()
    except MySQLdb.Error,e:
        queue.put(urlC)