def _contact_qq(src): if src is None or src==''or src.find("1234")==0: return '' src=common.strQ2B(src) if src.find("58.com")>0: return src if (not src.startswith("http")) and src.find("/plugs/pic/")>=0: src="http://www.sc2car.com%s"%src if (not src.startswith("http")) and src.find("/tel/")>=0: src="http://www.ganji.com%s"%src if (not src.startswith("http")) and src.find("upload/htmlImage")>=0: src="http://www.ln2car.com%s"%src reobj = re.compile(u'[^0-9]',re.IGNORECASE) result = reobj.sub(r'', src) qq_v = u'([1-9]{1}\d{4,11})$' #0757-83966080 matchObj = re.compile(qq_v) m = matchObj.match(result) if m is not None: result = m.group(1) return result else: return ""
def _contact_qq(src): if src is None or src == '' or src.find("1234") == 0: return '' src = common.strQ2B(src) if src.find("58.com") > 0: return src if (not src.startswith("http")) and src.find("/plugs/pic/") >= 0: src = "http://www.sc2car.com%s" % src if (not src.startswith("http")) and src.find("/tel/") >= 0: src = "http://www.ganji.com%s" % src if (not src.startswith("http")) and src.find("upload/htmlImage") >= 0: src = "http://www.ln2car.com%s" % src reobj = re.compile(u'[^0-9]', re.IGNORECASE) result = reobj.sub(r'', src) qq_v = u'([1-9]{1}\d{4,11})$' #0757-83966080 matchObj = re.compile(qq_v) m = matchObj.match(result) if m is not None: result = m.group(1) return result else: return ""
def _contact_mail(src): if src is None or src=='': return '' src=common.strQ2B(src) mail_v=u"(.*)\u79FB\u52A8" matchObj = re.compile(mail_v) m = matchObj.match(src) if m is not None: src = m.group(1) # print src reobj = re.compile(u'[\u3002]',re.IGNORECASE) result = reobj.sub(r'.', src)#替换。为. reobj = re.compile(u'[^\_\-\u002E\u0040a-zA-Z\d]',re.IGNORECASE) result = reobj.sub(r'', result)#替换非.整形@字符类数据 result=result.replace("www.","") # print result mail_v=u"(.*)\u0040(\w+)\u002E(.*)" matchObj = re.compile(mail_v) m = matchObj.match(result) if m is not None: name = m.group(1) com = m.group(3) yu = m.group(2) reobj = re.compile(u'[^\w\d\_\-]',re.IGNORECASE) name = reobj.sub(r'', name)#替换非.整形@字符类数据 reobj = re.compile(u'[^\w\d\u002E]',re.IGNORECASE) com = reobj.sub(r'', com)#替换非.整形.字符类数据 result=name+"@"+yu+"."+com return result else: return ""
def _contact_mail(src): if src is None or src == '': return '' src = common.strQ2B(src) mail_v = u"(.*)\u79FB\u52A8" matchObj = re.compile(mail_v) m = matchObj.match(src) if m is not None: src = m.group(1) # print src reobj = re.compile(u'[\u3002]', re.IGNORECASE) result = reobj.sub(r'.', src) #替换。为. reobj = re.compile(u'[^\_\-\u002E\u0040a-zA-Z\d]', re.IGNORECASE) result = reobj.sub(r'', result) #替换非.整形@字符类数据 result = result.replace("www.", "") # print result mail_v = u"(.*)\u0040(\w+)\u002E(.*)" matchObj = re.compile(mail_v) m = matchObj.match(result) if m is not None: name = m.group(1) com = m.group(3) yu = m.group(2) reobj = re.compile(u'[^\w\d\_\-]', re.IGNORECASE) name = reobj.sub(r'', name) #替换非.整形@字符类数据 reobj = re.compile(u'[^\w\d\u002E]', re.IGNORECASE) com = reobj.sub(r'', com) #替换非.整形.字符类数据 result = name + "@" + yu + "." + com return result else: return ""
def _contact(src,spider): """ 联系人处理,除去非中文字符 """ if src is None or src=='': return '' src=common.strQ2B(src) contact_v = re.compile(u'([\u4e00-\u9fa5a]+)',re.IGNORECASE) matchObj = re.compile(contact_v) m = matchObj.match(src) result="" if m is not None: result = m.group(1) if spider=="iautos": old_result=result[:2] # print old_result if result.count(old_result)>1: result=result[:len(result)/2] return result
def _contact(src, spider): """ 联系人处理,除去非中文字符 """ if src is None or src == '': return '' src = common.strQ2B(src) contact_v = re.compile(u'([\u4e00-\u9fa5a]+)', re.IGNORECASE) matchObj = re.compile(contact_v) m = matchObj.match(src) result = "" if m is not None: result = m.group(1) if spider == "iautos": old_result = result[:2] # print old_result if result.count(old_result) > 1: result = result[:len(result) / 2] return result
def _contact_mobile_phone(src,type,spider): phone="" mobile="" # print src # src=u"024-88886588" # print common.unicoding("-") if src is None or src=='': return "","" if spider=="iautos": old_src=src[:8] # print old_src # print src if src.count(old_src)>1: src=src[:len(src)/2] # print src if (not src.startswith("http")) and (src.find("/plugs/pic/")>=0 or src.find("/plugs/phone/")>=0): src="http://www.sc2car.com%s"%src if (not src.startswith("http")) and src.find("/tel")>=0: src="http://www.ganji.com%s"%src if src.count("/tel/")>1: number= src.rfind("/tel/") src=src[:number] if (not src.startswith("http")) and src.find("systems/codeimage")>=0: src="http://www.2duche.com%s"%src if (not src.startswith("http")) and src.find("upload/htmlImage")>=0: src="http://www.ln2car.com%s"%src if src.count("/")>=2: if type==1: return src,"" else: return "",src src=common.strQ2B(src) reobj = re.compile(u'[^0-9\u002D]',re.IGNORECASE) result = reobj.sub(r' ', src) # print result phone="" mobile="" if result.count(" ")>0: results= result.split(" ") for result1 in results: if len(result1)==11: mobile=result1 elif len(result1)>8: phone=result1 if phone!="" or mobile!="": return phone,mobile if result.startswith(u"400"): phone=result[:12] else: result=re.sub(u"\u0020", "",result) phone_v = u'(\d{8}\u002D\d+)$' #0757-83966080 matchObj = re.compile(phone_v) m = matchObj.match(result) if m is not None: phone = m.group(1) else: phone_v = u'.*(\d{4}\u002D\d+)$' #0757-83966080 matchObj = re.compile(phone_v) m = matchObj.match(result) if m is not None: phone = m.group(1) else: phone_v = u'.*(\d{3}\u002D\d+)$' #0757-83966080 matchObj = re.compile(phone_v) m = matchObj.match(result) if m is not None: phone = m.group(1) else: phone_v = u'(0\d+)' #0757-83966080 matchObj = re.compile(phone_v) m = matchObj.match(result) if m is not None: phone = m.group(1) else: if len(result)>=6 and len(result)<=10: phone=result mobile_v=u"([1][0-9]{10})" matchObj = re.compile(mobile_v) m = matchObj.match(result) if 
m is not None: mobile=m.group(1) if mobile!="": phone=mobile return phone,mobile
def _segment_paragraph(text, wordDict, method):
    '''
    Segment one paragraph and return the encoded output text.

    @param text: paragraph text (already passed through u() and strQ2B())
    @param wordDict: dictionary whose keys are the known words
    @param method: 0 selects fwd_mm_seg, anything else bwd_mm_seg
    '''
    # NOTE(review): concatenating the result of .encode(CODEC) with '/'
    # relies on Python 2 string semantics.
    line_out = ''
    while text:
        # Run of word characters (letters, digits, underscore): copy as-is.
        m = re.match(r'\w+', text)
        if m is not None:
            subStr = m.group()
            line_out += subStr.encode(CODEC) + '/'
            text = text[len(subStr):]
            continue
        # Clause-ending punctuation: emit a line break.
        # NOTE(review): there is no "continue" here, as in the original, so
        # when the next character is not Chinese it is consumed by the
        # "skip" step below without being examined - confirm the intent.
        if text[0:1].encode(CODEC) in [',', '。', '!', '?', ':']:
            line_out += '\n'
            text = text[1:]
        # Run of Chinese characters: segment it and emit word/word/...
        m = re.match(u'[\u4e00-\u9fa5]+', text)
        if m is not None:
            subStr = m.group()
            if method == 0:
                # forward maximum matching
                wordList = fwd_mm_seg(wordDict, 8, subStr)
            else:
                # backward maximum matching
                wordList = bwd_mm_seg(wordDict, 8, subStr)
            line_out += wordList[0].encode(CODEC) + '/'
            for eachWord in wordList[1:]:
                line_out += eachWord.encode(CODEC) + '/'
            text = text[len(subStr):]
            continue
        # Any other character: skip it.
        text = text[1:]
    return line_out


def file_seg_process(filename, method):
    '''
    Segment corpus/<filename> and write the result to corpus_seg/<filename>.

    The dictionary is read from dict.txt (first whitespace-separated field
    of every line). Input lines are accumulated into a paragraph until a
    line that starts with a space (after strip, u() and strQ2B()) begins
    the next paragraph; each finished paragraph is segmented and written,
    followed by a newline. Paragraphs whose segmented form is blank are not
    written.

    @param filename: file name, used under both corpus/ and corpus_seg/
    @param method: segmentation algorithm { 0: forward, otherwise: backward }
    '''
    # "with" closes all three files even when reading or segmenting raises.
    with open('dict.txt') as fp_dict, \
            open('corpus/' + filename) as fp_input, \
            open('corpus_seg/' + filename, 'w') as fp_output:
        # Load the dictionary into memory.
        wordDict = {}
        for eachWord in fp_dict:
            wordDict[u(eachWord.split()[0].strip(), CODEC)] = 1

        def flush(paragraph):
            # Segment one paragraph; skip it when nothing printable remains.
            line_out = _segment_paragraph(paragraph, wordDict, method)
            if line_out.strip():
                fp_output.write(line_out + '\n')

        pending = ''
        for eachLine in fp_input:
            sub = strQ2B(u(eachLine.strip(), CODEC))
            if not sub.startswith(' '):
                # Continuation line: append to the current paragraph.
                pending += sub
                continue
            # A new paragraph starts: emit the finished one, then always
            # start the new buffer with this line. (Previously the line was
            # dropped whenever the finished paragraph produced no output,
            # which always happened for the first paragraph of a file.)
            flush(pending)
            pending = sub

        # Emit the last paragraph, which no following line triggers.
        flush(pending)
def _contact_mobile_phone(src, type, spider): phone = "" mobile = "" # print src # src=u"024-88886588" # print common.unicoding("-") if src is None or src == '': return "", "" if spider == "iautos": old_src = src[:8] # print old_src # print src if src.count(old_src) > 1: src = src[:len(src) / 2] # print src if (not src.startswith("http")) and (src.find("/plugs/pic/") >= 0 or src.find("/plugs/phone/") >= 0): src = "http://www.sc2car.com%s" % src if (not src.startswith("http")) and src.find("/tel") >= 0: src = "http://www.ganji.com%s" % src if src.count("/tel/") > 1: number = src.rfind("/tel/") src = src[:number] if (not src.startswith("http")) and src.find("systems/codeimage") >= 0: src = "http://www.2duche.com%s" % src if (not src.startswith("http")) and src.find("upload/htmlImage") >= 0: src = "http://www.ln2car.com%s" % src if src.count("/") >= 2: if type == 1: return src, "" else: return "", src src = common.strQ2B(src) reobj = re.compile(u'[^0-9\u002D]', re.IGNORECASE) result = reobj.sub(r' ', src) # print result phone = "" mobile = "" if result.count(" ") > 0: results = result.split(" ") for result1 in results: if len(result1) == 11: mobile = result1 elif len(result1) > 8: phone = result1 if phone != "" or mobile != "": return phone, mobile if result.startswith(u"400"): phone = result[:12] else: result = re.sub(u"\u0020", "", result) phone_v = u'(\d{8}\u002D\d+)$' #0757-83966080 matchObj = re.compile(phone_v) m = matchObj.match(result) if m is not None: phone = m.group(1) else: phone_v = u'.*(\d{4}\u002D\d+)$' #0757-83966080 matchObj = re.compile(phone_v) m = matchObj.match(result) if m is not None: phone = m.group(1) else: phone_v = u'.*(\d{3}\u002D\d+)$' #0757-83966080 matchObj = re.compile(phone_v) m = matchObj.match(result) if m is not None: phone = m.group(1) else: phone_v = u'(0\d+)' #0757-83966080 matchObj = re.compile(phone_v) m = matchObj.match(result) if m is not None: phone = m.group(1) else: if len(result) >= 6 and len(result) <= 10: phone = result 
mobile_v = u"([1][0-9]{10})" matchObj = re.compile(mobile_v) m = matchObj.match(result) if m is not None: mobile = m.group(1) if mobile != "": phone = mobile return phone, mobile