Beispiel #1
0
def _contact_qq(src):  
    if src is None or src==''or src.find("1234")==0:
        return ''
    src=common.strQ2B(src)
    if src.find("58.com")>0:
        return src
    
    if (not src.startswith("http")) and src.find("/plugs/pic/")>=0:
        src="http://www.sc2car.com%s"%src
        
    if (not src.startswith("http")) and src.find("/tel/")>=0:
        src="http://www.ganji.com%s"%src
        
    if (not src.startswith("http")) and src.find("upload/htmlImage")>=0:
        src="http://www.ln2car.com%s"%src
        
    reobj = re.compile(u'[^0-9]',re.IGNORECASE)
    result = reobj.sub(r'', src)
    qq_v = u'([1-9]{1}\d{4,11})$' #0757-83966080
    matchObj = re.compile(qq_v)
    m = matchObj.match(result)
    if m is not None:
        result = m.group(1)
        return result
    else:
        return ""
Beispiel #2
0
def _contact_qq(src):
    if src is None or src == '' or src.find("1234") == 0:
        return ''
    src = common.strQ2B(src)
    if src.find("58.com") > 0:
        return src

    if (not src.startswith("http")) and src.find("/plugs/pic/") >= 0:
        src = "http://www.sc2car.com%s" % src

    if (not src.startswith("http")) and src.find("/tel/") >= 0:
        src = "http://www.ganji.com%s" % src

    if (not src.startswith("http")) and src.find("upload/htmlImage") >= 0:
        src = "http://www.ln2car.com%s" % src

    reobj = re.compile(u'[^0-9]', re.IGNORECASE)
    result = reobj.sub(r'', src)
    qq_v = u'([1-9]{1}\d{4,11})$'  #0757-83966080
    matchObj = re.compile(qq_v)
    m = matchObj.match(result)
    if m is not None:
        result = m.group(1)
        return result
    else:
        return ""
Beispiel #3
0
def _contact_mail(src):  
    if src is None or src=='':
        return ''
    src=common.strQ2B(src)
    mail_v=u"(.*)\u79FB\u52A8"
    matchObj = re.compile(mail_v)
    m = matchObj.match(src)
    if m is not None:
        src = m.group(1)
#        print src   
    reobj = re.compile(u'[\u3002]',re.IGNORECASE)
    
    result = reobj.sub(r'.', src)#替换。为.
    reobj = re.compile(u'[^\_\-\u002E\u0040a-zA-Z\d]',re.IGNORECASE)
    result = reobj.sub(r'', result)#替换非.整形@字符类数据
    result=result.replace("www.","")
#    print result
    mail_v=u"(.*)\u0040(\w+)\u002E(.*)"
    matchObj = re.compile(mail_v)
    m = matchObj.match(result)
    if m is not None:
        name = m.group(1)
        com = m.group(3)
        yu = m.group(2)
        reobj = re.compile(u'[^\w\d\_\-]',re.IGNORECASE)
        name = reobj.sub(r'', name)#替换非.整形@字符类数据
        reobj = re.compile(u'[^\w\d\u002E]',re.IGNORECASE)
        com = reobj.sub(r'', com)#替换非.整形.字符类数据
        result=name+"@"+yu+"."+com
        return result
    else:
        return ""
Beispiel #4
0
def _contact_mail(src):
    if src is None or src == '':
        return ''
    src = common.strQ2B(src)
    mail_v = u"(.*)\u79FB\u52A8"
    matchObj = re.compile(mail_v)
    m = matchObj.match(src)
    if m is not None:
        src = m.group(1)


#        print src
    reobj = re.compile(u'[\u3002]', re.IGNORECASE)

    result = reobj.sub(r'.', src)  #替换。为.
    reobj = re.compile(u'[^\_\-\u002E\u0040a-zA-Z\d]', re.IGNORECASE)
    result = reobj.sub(r'', result)  #替换非.整形@字符类数据
    result = result.replace("www.", "")
    #    print result
    mail_v = u"(.*)\u0040(\w+)\u002E(.*)"
    matchObj = re.compile(mail_v)
    m = matchObj.match(result)
    if m is not None:
        name = m.group(1)
        com = m.group(3)
        yu = m.group(2)
        reobj = re.compile(u'[^\w\d\_\-]', re.IGNORECASE)
        name = reobj.sub(r'', name)  #替换非.整形@字符类数据
        reobj = re.compile(u'[^\w\d\u002E]', re.IGNORECASE)
        com = reobj.sub(r'', com)  #替换非.整形.字符类数据
        result = name + "@" + yu + "." + com
        return result
    else:
        return ""
Beispiel #5
0
def _contact(src,spider):  
    """
            联系人处理,除去非中文字符
    """
    if src is None or src=='':
        return ''
    src=common.strQ2B(src)
    contact_v = re.compile(u'([\u4e00-\u9fa5a]+)',re.IGNORECASE)
    matchObj = re.compile(contact_v)
    m = matchObj.match(src)
    result=""
    if m is not None:
        result = m.group(1)
    if spider=="iautos":
       old_result=result[:2]
#       print old_result
       if result.count(old_result)>1:
           result=result[:len(result)/2]
        
    return result
Beispiel #6
0
def _contact(src, spider):
    """
            联系人处理,除去非中文字符
    """
    if src is None or src == '':
        return ''
    src = common.strQ2B(src)
    contact_v = re.compile(u'([\u4e00-\u9fa5a]+)', re.IGNORECASE)
    matchObj = re.compile(contact_v)
    m = matchObj.match(src)
    result = ""
    if m is not None:
        result = m.group(1)
    if spider == "iautos":
        old_result = result[:2]
        #       print old_result
        if result.count(old_result) > 1:
            result = result[:len(result) / 2]

    return result
Beispiel #7
0
def _contact_mobile_phone(src,type,spider): 
    phone=""
    mobile=""
#    print src
#    src=u"024-88886588"
#    print common.unicoding("-") 
    if src is None or src=='':
        return "",""
    if spider=="iautos":
        old_src=src[:8]
#        print old_src
#        print src
        if src.count(old_src)>1:
            src=src[:len(src)/2]
#        print src
    if (not src.startswith("http")) and (src.find("/plugs/pic/")>=0 or src.find("/plugs/phone/")>=0):
        src="http://www.sc2car.com%s"%src
        
    if (not src.startswith("http")) and src.find("/tel")>=0:
        src="http://www.ganji.com%s"%src
        
    if src.count("/tel/")>1:
        number= src.rfind("/tel/") 
        src=src[:number]
        
    if (not src.startswith("http")) and src.find("systems/codeimage")>=0:
        src="http://www.2duche.com%s"%src
        
    if (not src.startswith("http")) and src.find("upload/htmlImage")>=0:
        src="http://www.ln2car.com%s"%src
    
    if src.count("/")>=2:
        if type==1:
            return src,""
        else:
            return "",src

    src=common.strQ2B(src)
    reobj = re.compile(u'[^0-9\u002D]',re.IGNORECASE)
    result = reobj.sub(r' ', src)
#    print result
    phone=""
    mobile=""
    if result.count(" ")>0:
        results= result.split(" ")
        for result1 in results:
            if len(result1)==11:
                mobile=result1
            elif len(result1)>8:
                phone=result1
        if phone!="" or mobile!="":
            return phone,mobile          
                
    
    if result.startswith(u"400"):
        phone=result[:12]
    else:
        result=re.sub(u"\u0020", "",result)
        phone_v = u'(\d{8}\u002D\d+)$' #0757-83966080
        matchObj = re.compile(phone_v)
        m = matchObj.match(result)
        if m is not None:
            phone = m.group(1)
        else:
            phone_v = u'.*(\d{4}\u002D\d+)$' #0757-83966080
            matchObj = re.compile(phone_v)
            m = matchObj.match(result)
            if m is not None:
                phone = m.group(1)
                
            else:
                phone_v = u'.*(\d{3}\u002D\d+)$' #0757-83966080
                matchObj = re.compile(phone_v)
                m = matchObj.match(result)
                if m is not None:
                    phone = m.group(1)
                    
                else:
                    phone_v = u'(0\d+)' #0757-83966080
                    matchObj = re.compile(phone_v)
                    m = matchObj.match(result)
                    if m is not None:
                        phone = m.group(1)
                    else:
                        if len(result)>=6 and len(result)<=10:
                            phone=result
                
    mobile_v=u"([1][0-9]{10})"
    matchObj = re.compile(mobile_v)
    m = matchObj.match(result)
    if m is not None:
        mobile=m.group(1)
        
    if mobile!="":
        phone=mobile
    return phone,mobile
Beispiel #8
0
def _segment_paragraph(text, wordDict, method):
    """Segment one paragraph and return its encoded output text."""
    line_out = ''
    while len(text) > 0:
        # Clause-ending punctuation: emit a line break.  There is on purpose
        # no ``continue`` here; the character that follows is examined in
        # the same pass.  This check runs before the word match so that an
        # ASCII word directly after the punctuation is kept instead of
        # losing its first character.
        if text[0:1].encode(CODEC) in [',','。','!','?',':']:
            line_out += '\n'
            text = text[1:]
        # Latin letters / digits: copied through unchanged.
        m = re.match(r'\w+', text)
        if m is not None:
            subStr = m.group()
            line_out += subStr.encode(CODEC) + '/'
            text = text[len(subStr):]
            continue
        # Chinese characters: segment into words, output as word/word/.
        # (u'' instead of ur'': same pattern, and valid on Python 3.)
        m = re.match(u'[\u4e00-\u9fa5]+', text)
        if m is not None:
            subStr = m.group()
            if method == 0:
                # forward maximum matching
                wordList = fwd_mm_seg(wordDict, 8, subStr)
            else:
                # backward maximum matching
                wordList = bwd_mm_seg(wordDict, 8, subStr)
            for eachWord in wordList:
                line_out += eachWord.encode(CODEC) + '/'
            text = text[len(subStr):]
            continue
        # Any other character: skipped.
        text = text[1:]
    return line_out


def _write_paragraph(fp_output, text, wordDict, method):
    """Segment ``text`` and write it, unless the result is blank."""
    line_out = _segment_paragraph(text, wordDict, method)
    if len(line_out.strip()) != 0:
        fp_output.write(line_out + '\n')


def file_seg_process(filename, method):
    '''
    Segment the file corpus/<filename> and write the result to
    corpus_seg/<filename>.

    The word list is read from dict.txt (first whitespace-separated field
    of every non-blank line).  Input lines are joined into paragraphs: a
    line that, after conversion, starts with two spaces begins a new
    paragraph and every other line is appended to the current one.  Each
    paragraph is segmented and written once it is complete, including the
    last one in the file.

    All three files are closed even when an error is raised.

    @param filename: file name, relative to corpus/ and corpus_seg/
    @param method:   segmentation algorithm { 0: forward, otherwise backward }
    '''
    # Load the dictionary into memory.
    wordDict = {}
    with open('dict.txt') as fp_dict:
        for eachWord in fp_dict:
            fields = eachWord.split()
            if not fields:
                # blank line in the dictionary file
                continue
            wordDict[u(fields[0], CODEC)] = 1

    with open('corpus/' + filename) as fp_input:
        with open('corpus_seg/' + filename, 'w') as fp_output:
            paragraph = ''
            for eachLine in fp_input:
                sub = strQ2B(u(eachLine.strip(), CODEC))
                if not sub.startswith('  '):
                    # continuation of the current paragraph
                    paragraph += sub
                    continue
                # A new paragraph starts: flush the finished one.  The new
                # paragraph always begins with this line, also when the
                # finished one produced no output.
                _write_paragraph(fp_output, paragraph, wordDict, method)
                paragraph = sub
            # Flush the paragraph still pending at end of file.
            _write_paragraph(fp_output, paragraph, wordDict, method)
Beispiel #9
0
def _contact_mobile_phone(src, type, spider):
    phone = ""
    mobile = ""
    #    print src
    #    src=u"024-88886588"
    #    print common.unicoding("-")
    if src is None or src == '':
        return "", ""
    if spider == "iautos":
        old_src = src[:8]
        #        print old_src
        #        print src
        if src.count(old_src) > 1:
            src = src[:len(src) / 2]
#        print src
    if (not src.startswith("http")) and (src.find("/plugs/pic/") >= 0
                                         or src.find("/plugs/phone/") >= 0):
        src = "http://www.sc2car.com%s" % src

    if (not src.startswith("http")) and src.find("/tel") >= 0:
        src = "http://www.ganji.com%s" % src

    if src.count("/tel/") > 1:
        number = src.rfind("/tel/")
        src = src[:number]

    if (not src.startswith("http")) and src.find("systems/codeimage") >= 0:
        src = "http://www.2duche.com%s" % src

    if (not src.startswith("http")) and src.find("upload/htmlImage") >= 0:
        src = "http://www.ln2car.com%s" % src

    if src.count("/") >= 2:
        if type == 1:
            return src, ""
        else:
            return "", src

    src = common.strQ2B(src)
    reobj = re.compile(u'[^0-9\u002D]', re.IGNORECASE)
    result = reobj.sub(r' ', src)
    #    print result
    phone = ""
    mobile = ""
    if result.count(" ") > 0:
        results = result.split(" ")
        for result1 in results:
            if len(result1) == 11:
                mobile = result1
            elif len(result1) > 8:
                phone = result1
        if phone != "" or mobile != "":
            return phone, mobile

    if result.startswith(u"400"):
        phone = result[:12]
    else:
        result = re.sub(u"\u0020", "", result)
        phone_v = u'(\d{8}\u002D\d+)$'  #0757-83966080
        matchObj = re.compile(phone_v)
        m = matchObj.match(result)
        if m is not None:
            phone = m.group(1)
        else:
            phone_v = u'.*(\d{4}\u002D\d+)$'  #0757-83966080
            matchObj = re.compile(phone_v)
            m = matchObj.match(result)
            if m is not None:
                phone = m.group(1)

            else:
                phone_v = u'.*(\d{3}\u002D\d+)$'  #0757-83966080
                matchObj = re.compile(phone_v)
                m = matchObj.match(result)
                if m is not None:
                    phone = m.group(1)

                else:
                    phone_v = u'(0\d+)'  #0757-83966080
                    matchObj = re.compile(phone_v)
                    m = matchObj.match(result)
                    if m is not None:
                        phone = m.group(1)
                    else:
                        if len(result) >= 6 and len(result) <= 10:
                            phone = result

    mobile_v = u"([1][0-9]{10})"
    matchObj = re.compile(mobile_v)
    m = matchObj.match(result)
    if m is not None:
        mobile = m.group(1)

    if mobile != "":
        phone = mobile
    return phone, mobile