def seach(urllist):
    """Crawl the root URLs in *urllist* and store the key/URL pairs found.

    Each root URL is passed to ``gethtml.process_url``.  A non-``None``
    result is unpacked as ``(links, keyurl)``: ``links`` is an iterable of
    URLs to follow and ``keyurl`` maps a key to an iterable of URLs.  The
    links and keyed URLs of all roots are collected first and then crawled
    in one pass.  Ordinary errors raised during that pass are printed as a
    traceback instead of being propagated.

    Args:
        urllist: iterable of root URLs.
    """

    def process_keyurl(keyurl):
        """Insert a record for each keyed URL whose page yields links, recursively."""
        if keyurl is None:
            return
        for key, urls in keyurl.items():
            for url in urls:
                urlinfo = gethtml.process_url(url)
                if urlinfo is None:
                    continue
                links, nested_keyurl = urlinfo
                if links is not None:
                    gethtml.collection.insert(
                        {'key': key, 'url': url, 'timetmp': time.time()})
                if nested_keyurl is not None:
                    process_keyurl(nested_keyurl)

    def process_urllist(url_list):
        """Follow every URL in *url_list*, then its links and keyed URLs."""
        for url in url_list:
            urlinfo = gethtml.process_url(url)
            if urlinfo is None:
                continue
            links, keyurl = urlinfo
            if links is not None:
                process_urllist(links)
            if keyurl is not None:
                process_keyurl(keyurl)
            # Short pause after each handled URL (presumably rate limiting).
            time.sleep(0.1)

    suburl = []
    subkeyurl = {}
    for url in urllist:
        # Single-argument form prints the same text on Python 2 and 3.
        print("%s root url" % url)
        urlinfo = gethtml.process_url(url)
        if urlinfo is None:
            continue
        links, keyurl = urlinfo
        # Either half may be None (the helpers above check for it too).
        if links is not None:
            suburl.extend(links)
        if keyurl is not None:
            # Merge per key: dict.update() would discard the URLs an earlier
            # root contributed under the same key.
            for key, urls in keyurl.items():
                subkeyurl.setdefault(key, []).extend(urls)

    try:
        process_urllist(suburl)
        process_keyurl(subkeyurl)
    except Exception:
        # Best effort: report the failure but let Ctrl-C / SystemExit through.
        import traceback
        traceback.print_exc()
def process_keyurl(keyurl):
    """Upsert a ``{key, url, timetmp}`` record for every keyed URL, recursively.

    For each ``key -> urls`` entry, every URL has one trailing ``/`` removed
    and is passed to ``gethtml.process_url``.  URLs for which that returns
    ``None`` are skipped.  Otherwise a record is written with the key
    lower-cased (ASCII letters only), and when the result is a
    ``(links, keyurl)`` pair its nested ``keyurl`` is processed the same way.

    Args:
        keyurl: mapping of key to an iterable of URL strings, or ``None``
            (in which case nothing happens).
    """
    if keyurl is None:
        return
    for raw_key, urls in keyurl.items():
        # Only A-Z are folded; any other character is kept unchanged.
        key = "".join(c.lower() if 'A' <= c <= 'Z' else c for c in raw_key)
        for url in urls:
            # endswith() is safe on an empty string, unlike url[len(url) - 1].
            if url.endswith('/'):
                url = url[:-1]
            urlinfo = gethtml.process_url(url)
            if urlinfo is None:
                continue
            nested_keyurl = None
            # The sentinel string carries no (links, keyurl) pair to unpack;
            # presumably it marks a URL gethtml has already visited.
            if not (isinstance(urlinfo, str) and urlinfo == "url is be processed"):
                _links, nested_keyurl = urlinfo
            # Third positional argument is presumably pymongo's legacy
            # ``upsert`` flag -- confirm against the driver in use.
            gethtml.collection.update(
                {'key': key, 'url': url},
                {'$set': {'key': key, 'url': url, 'timetmp': time.time()}},
                True)
            if nested_keyurl is not None:
                process_keyurl(nested_keyurl)
def process_urllist(url_list):
    """Fetch each URL, fetch the links it yields, and process its keyed URLs.

    ``gethtml.process_url`` is called on every entry of *url_list*.  A
    non-``None`` result is unpacked as ``(links, keyed)``; each link is passed
    to ``gethtml.process_url`` once (its result is ignored) and ``keyed`` is
    handed to ``process_keyurl``.

    Args:
        url_list: iterable of URLs.
    """
    for page_url in url_list:
        result = gethtml.process_url(page_url)
        if result is not None:
            links, keyed = result
            for link in (links if links is not None else ()):
                gethtml.process_url(link)
            process_keyurl(keyed)
            # Short pause after each handled URL (presumably rate limiting).
            time.sleep(0.1)
def process_keyurl(keyurl):
    """Upsert a ``{key, url, timetmp}`` record for every keyed URL, recursively.

    For each ``key -> urls`` entry, every URL has one trailing ``/`` removed
    and is passed to ``gethtml.process_url``.  URLs for which that returns
    ``None`` are skipped.  Otherwise a record is written with the key
    lower-cased (ASCII letters only), and when the result is a
    ``(links, keyurl)`` pair its nested ``keyurl`` is processed the same way.

    Args:
        keyurl: mapping of key to an iterable of URL strings, or ``None``
            (in which case nothing happens).
    """
    if keyurl is None:
        return
    for raw_key, urls in keyurl.items():
        # Only A-Z are folded; any other character is kept unchanged.
        key = "".join(c.lower() if 'A' <= c <= 'Z' else c for c in raw_key)
        for url in urls:
            # endswith() is safe on an empty string, unlike url[len(url) - 1].
            if url.endswith('/'):
                url = url[:-1]
            urlinfo = gethtml.process_url(url)
            if urlinfo is None:
                continue
            nested_keyurl = None
            # The sentinel string carries no (links, keyurl) pair to unpack;
            # presumably it marks a URL gethtml has already visited.
            if not (isinstance(urlinfo, str) and urlinfo == "url is be processed"):
                _links, nested_keyurl = urlinfo
            # Third positional argument is presumably pymongo's legacy
            # ``upsert`` flag -- confirm against the driver in use.
            gethtml.collection.update({
                'key': key,
                'url': url
            }, {
                '$set': {
                    'key': key,
                    'url': url,
                    'timetmp': time.time()
                }
            }, True)
            if nested_keyurl is not None:
                process_keyurl(nested_keyurl)
def process_urllist(url_list):
    """Fetch each URL in *url_list* and process the keyed URLs it yields.

    ``gethtml.process_url`` is called on every entry.  A non-``None`` result
    is unpacked as ``(links, keyurl)``; only ``keyurl`` is used and is handed
    to ``process_keyurl``.

    Args:
        url_list: iterable of URLs.
    """
    for url in url_list:
        urlinfo = gethtml.process_url(url)
        if urlinfo is None:
            # Skip just this URL; returning here would silently drop every
            # remaining entry of the list.
            continue
        _links, keyurl = urlinfo
        process_keyurl(keyurl)
        # Short pause after each handled URL (presumably rate limiting).
        time.sleep(0.1)
def process_keyurl(keyurl):
    """Insert a ``{key, url, timetmp}`` record for each keyed URL that yields links.

    Every URL under every key is passed to ``gethtml.process_url``.  A
    non-``None`` result is unpacked as a pair; a record is inserted into
    ``gethtml.collection`` only when the first item of that pair is not
    ``None``.  The second item is not used here.

    Args:
        keyurl: mapping of key to an iterable of URLs, or ``None`` (no-op).
    """
    if keyurl is None:
        return
    for keyword, candidates in keyurl.items():
        for candidate in candidates:
            result = gethtml.process_url(candidate)
            if result is None:
                continue
            links, _nested = result
            if links is None:
                continue
            record = {'key': keyword, 'url': candidate, 'timetmp': time.time()}
            gethtml.collection.insert(record)
def process_keyurl(keyurl):
    """Insert a record for each keyed URL that yields links, then recurse.

    Every URL under every key is passed to ``gethtml.process_url``.  A
    non-``None`` result is unpacked as ``(links, nested)``.  When ``links`` is
    not ``None`` a ``{key, url, timetmp}`` record is inserted into
    ``gethtml.collection``; when ``nested`` is not ``None`` it is processed by
    a recursive call.

    Args:
        keyurl: mapping of key to an iterable of URLs, or ``None`` (no-op).
    """
    if keyurl is None:
        return
    for keyword, candidates in keyurl.items():
        for candidate in candidates:
            result = gethtml.process_url(candidate)
            if result is None:
                continue
            links, nested = result
            if links is not None:
                record = {'key': keyword, 'url': candidate, 'timetmp': time.time()}
                gethtml.collection.insert(record)
            if nested is not None:
                process_keyurl(nested)
def process_urllist(url_list):
    """Recursively follow every URL in *url_list*.

    ``gethtml.process_url`` is called on each entry.  A non-``None`` result is
    unpacked as ``(links, keyed)``: ``links`` (when not ``None``) is crawled by
    a recursive call and ``keyed`` (when not ``None``) is handed to
    ``process_keyurl``.

    Args:
        url_list: iterable of URLs.
    """
    for page_url in url_list:
        result = gethtml.process_url(page_url)
        if result is not None:
            links, keyed = result
            if links is not None:
                process_urllist(links)
            if keyed is not None:
                process_keyurl(keyed)
            # Short pause after each handled URL (presumably rate limiting).
            time.sleep(0.1)
def process_urllist(url_list):
    """Recursively follow every URL in *url_list*.

    Each URL has one trailing ``/`` removed and is passed to
    ``gethtml.process_url``.  Results that are ``None`` or a string are
    skipped.  Any other result is unpacked as ``(links, keyurl)``: ``links``
    (when not ``None``) is crawled by a recursive call and ``keyurl`` (when
    not ``None``) is handed to ``process_keyurl``.

    Args:
        url_list: iterable of URL strings.
    """
    for url in url_list:
        # endswith() is safe on an empty string, unlike url[len(url) - 1].
        if url.endswith('/'):
            url = url[:-1]
        urlinfo = gethtml.process_url(url)
        # A string result carries no (links, keyurl) pair; presumably it
        # marks a URL gethtml has already visited.
        if urlinfo is None or isinstance(urlinfo, str):
            continue
        links, keyurl = urlinfo
        if links is not None:
            process_urllist(links)
        if keyurl is not None:
            process_keyurl(keyurl)
        # Short pause after each handled URL (presumably rate limiting).
        time.sleep(0.1)
def seach(urllist):
    """Crawl each root URL in *urllist* and upsert the key/URL pairs found.

    ``gethtml.process_url_list`` is reset to an empty list first.  Every root
    URL is then passed to ``gethtml.process_url``; a result that is ``None``
    or a string is reported as an error and skipped.  Any other result is
    unpacked as ``(links, keyurl)`` and crawled immediately.  Ordinary errors
    raised while crawling one root are printed as a traceback and the next
    root is tried.

    Args:
        urllist: iterable of root URLs.
    """
    gethtml.process_url_list = []

    def process_keyurl(keyurl):
        """Upsert a record per keyed URL (key ASCII-lower-cased), recursively."""
        if keyurl is None:
            return
        for raw_key, urls in keyurl.items():
            # Only A-Z are folded; any other character is kept unchanged.
            key = "".join(c.lower() if 'A' <= c <= 'Z' else c for c in raw_key)
            for url in urls:
                # endswith() is safe on an empty string.
                if url.endswith('/'):
                    url = url[:-1]
                urlinfo = gethtml.process_url(url)
                if urlinfo is None:
                    continue
                nested_keyurl = None
                # The sentinel string carries no pair to unpack; presumably
                # it marks a URL gethtml has already visited.
                if not (isinstance(urlinfo, str)
                        and urlinfo == "url is be processed"):
                    _links, nested_keyurl = urlinfo
                # Third positional argument is presumably pymongo's legacy
                # ``upsert`` flag -- confirm against the driver in use.
                gethtml.collection.update({
                    'key': key,
                    'url': url
                }, {
                    '$set': {
                        'key': key,
                        'url': url,
                        'timetmp': time.time()
                    }
                }, True)
                if nested_keyurl is not None:
                    process_keyurl(nested_keyurl)

    def process_urllist(url_list):
        """Follow every URL in *url_list*, then its links and keyed URLs."""
        for url in url_list:
            if url.endswith('/'):
                url = url[:-1]
            urlinfo = gethtml.process_url(url)
            if urlinfo is None or isinstance(urlinfo, str):
                continue
            links, keyurl = urlinfo
            if links is not None:
                process_urllist(links)
            if keyurl is not None:
                process_keyurl(keyurl)
            # Short pause after each handled URL (presumably rate limiting).
            time.sleep(0.1)

    for url in urllist:
        # Single-argument form prints the same text on Python 2 and 3.
        print("%s root url" % url)
        urlinfo = gethtml.process_url(url)
        if urlinfo is None or isinstance(urlinfo, str):
            print("error root url %s" % url)
            continue
        links, keyurl = urlinfo
        try:
            # A root without links must not prevent its keyed URLs from
            # being processed, so guard instead of iterating over None.
            if links is not None:
                process_urllist(links)
            process_keyurl(keyurl)
        except Exception:
            # Best effort per root; Ctrl-C / SystemExit still propagate.
            import traceback
            traceback.print_exc()
def seach(urllist):
    """Crawl each root URL in *urllist* and upsert the key/URL pairs found.

    ``gethtml.process_url_list`` is reset to an empty list first.  Every root
    URL is then passed to ``gethtml.process_url``; a result that is ``None``
    or a string is reported as an error and skipped.  Any other result is
    unpacked as ``(links, keyurl)`` and crawled immediately.  Ordinary errors
    raised while crawling one root are printed as a traceback and the next
    root is tried.

    Args:
        urllist: iterable of root URLs.
    """
    gethtml.process_url_list = []

    def process_keyurl(keyurl):
        """Upsert a record per keyed URL (key ASCII-lower-cased), recursively."""
        if keyurl is None:
            return
        for raw_key, urls in keyurl.items():
            # Only A-Z are folded; any other character is kept unchanged.
            key = "".join(c.lower() if 'A' <= c <= 'Z' else c for c in raw_key)
            for url in urls:
                # endswith() is safe on an empty string.
                if url.endswith('/'):
                    url = url[:-1]
                urlinfo = gethtml.process_url(url)
                if urlinfo is None:
                    continue
                nested_keyurl = None
                # The sentinel string carries no pair to unpack; presumably
                # it marks a URL gethtml has already visited.
                if not (isinstance(urlinfo, str)
                        and urlinfo == "url is be processed"):
                    _links, nested_keyurl = urlinfo
                # Third positional argument is presumably pymongo's legacy
                # ``upsert`` flag -- confirm against the driver in use.
                gethtml.collection.update(
                    {'key': key, 'url': url},
                    {'$set': {'key': key, 'url': url, 'timetmp': time.time()}},
                    True)
                if nested_keyurl is not None:
                    process_keyurl(nested_keyurl)

    def process_urllist(url_list):
        """Follow every URL in *url_list*, then its links and keyed URLs."""
        for url in url_list:
            if url.endswith('/'):
                url = url[:-1]
            urlinfo = gethtml.process_url(url)
            if urlinfo is None or isinstance(urlinfo, str):
                continue
            links, keyurl = urlinfo
            if links is not None:
                process_urllist(links)
            if keyurl is not None:
                process_keyurl(keyurl)
            # Short pause after each handled URL (presumably rate limiting).
            time.sleep(0.1)

    for url in urllist:
        # Single-argument form prints the same text on Python 2 and 3.
        print("%s root url" % url)
        urlinfo = gethtml.process_url(url)
        if urlinfo is None or isinstance(urlinfo, str):
            print("error root url %s" % url)
            continue
        links, keyurl = urlinfo
        try:
            # A root without links must not prevent its keyed URLs from
            # being processed, so guard instead of iterating over None.
            if links is not None:
                process_urllist(links)
            process_keyurl(keyurl)
        except Exception:
            # Best effort per root; Ctrl-C / SystemExit still propagate.
            import traceback
            traceback.print_exc()