# NOTE(review): this fragment is cut off at both ends in the visible chunk.
# The file object `f` closed here is opened before the visible code, and the
# loop body below stops before any visible update of `start`.
f.close()
# Page through the search results built from `url_base` for `author`; the loop
# stops at the first page on which `tl.get_items` finds nothing.
while True :
    # `author` is encoded to UTF-8 bytes before being placed into the URL.
    url = url_base%(start, author.encode('utf-8'))
    print url
    # NOTE(review): `source` is not closed anywhere in the visible portion.
    source = urllib.urlopen(url, proxies=p)
    s = source.read()
    items = tl.get_items(s)
    if len(items) == 0 :
        break
    # Counts the items seen on this page; not read in the visible portion.
    cpt_i = 0
    for i in items :
        cpt_i += 1
        authors = tl.get_author(i)
        # Keep only items with exactly one author ...
        if len(authors) != 1 :
            continue
        # ... and whose single author, decoded from UTF-8, equals `author`.
        if unicode(authors[0], 'utf-8') != author :
            continue
        url_article = tl.get_url(i)
        # Skip articles whose URL is already a key of `json_author`
        # (`in` is used here instead of the older `has_key` spelling).
        if url_article in json_author :
            continue
        print url_article
        # NOTE(review): the use of `published` lies beyond the visible chunk.
        published = tl.get_time(i)
# Page through the search results built from `url_base` for `author_unicode`;
# the loop stops at the first page on which `tl.get_items` finds nothing.
# NOTE(review): this fragment is cut off in the visible chunk - the loop body
# ends before any visible update of `start` or handling of the new article.
while True:
    # The author name is encoded to UTF-8 bytes before going into the URL.
    url = url_base % (start, author_unicode.encode('utf-8'))
    print author_unicode, '::', url
    # NOTE(review): `source` is not closed anywhere in the visible portion.
    source = urllib.urlopen(url, proxies=p)
    s = source.read()
    items = tl.get_items(s)
    if len(items) == 0:
        break
    # Counts the items seen on this page; not read in the visible portion.
    cpt_i = 0
    for i in items:
        cpt_i += 1
        authors = tl.get_author(i)
        # Keep only items with exactly one author ...
        if len(authors) != 1:
            continue
        # ... and whose single author, decoded from UTF-8, equals
        # `author_unicode`.
        if unicode(authors[0], 'utf-8') != author_unicode:
            continue
        url_article = tl.get_url(i)
        # Skip articles already present in either `json_author` or
        # `json_author_new` (`in` replaces the older `has_key` spelling).
        if (url_article in json_author) or (url_article in json_author_new):
            print ' [already storaged]'
            continue
        print author_unicode, '::', url_article
# Walk the liberation.fr search result pages for query `q`, starting at page
# `start`, and record every author returned by `tl.get_author` as a key of the
# JSON object stored at `json_author_path`.
#
# Reads from the enclosing scope: `json_author_path`, `start`, `q`, `p`
# (passed to urllib as `proxies`) and the helper module `tl`.
# Leaves behind: `json_loaded` (the updated dict) and `start` (advanced by one
# for every page that yielded at least one entry).

# Load what is already stored; `with` closes the file even if parsing fails.
with open(json_author_path) as json_file:
    json_loaded = json.load(json_file)

# One search-result entry: "<li>", optional whitespace, a <time ...> tag, then
# everything up to the first "</li>". Compiled once because it never changes;
# the raw string keeps "\s" from being read as a string escape.
re_item = r"<li>\s*?<time[^>]*?>.*?</li>"
re_item_compile = re.compile(re_item, re.DOTALL | re.U)

# Upper bound on the number of pages fetched in one run.
cpt = 10000
while cpt > 0:
    url = "http://www.liberation.fr/recherche/?page=%s&q=%s" % (start, q)
    source = urllib.urlopen(url, proxies=p)
    try:
        s = source.read()
    finally:
        # Release the connection even when the read raises.
        source.close()

    items = re_item_compile.findall(s)
    if not items:
        # No entry matched on this page: stop (presumably past the last
        # page of results).
        break

    for i in items:
        for a in tl.get_author(i):
            # The dict is used like a set: only the keys carry information.
            json_loaded[a] = ""
    # Single parenthesised argument: prints the same under the Python 2
    # print statement as the original did.
    print(len(json_loaded))
    start += 1

    # Rewrite the whole file after every page, so the data on disk is never
    # more than one page behind.
    with open(json_author_path, "w") as json_file:
        json.dump(json_loaded, json_file)

    # Sleep a random 1-7 seconds before the next request.
    time.sleep(random.uniform(1, 7))
    cpt -= 1
# Walk the liberation.fr search result pages for query `q`, starting at page
# `start`, and record every author returned by `tl.get_author` as a key of the
# JSON object stored at `json_author_path`.
#
# Reads from the enclosing scope: `json_author_path`, `start`, `q`, `p`
# (passed to urllib as `proxies`) and the helper module `tl`.
# Leaves behind: `json_loaded` (the updated dict) and `start` (advanced by one
# for every page that yielded at least one entry).

# Load what is already stored; `with` closes the file even if parsing fails.
with open(json_author_path) as json_file:
    json_loaded = json.load(json_file)

# One search-result entry: '<li>', optional whitespace, a <time ...> tag, then
# everything up to the first '</li>'. Compiled once because it never changes;
# the raw string keeps '\s' from being read as a string escape.
re_item = r'<li>\s*?<time[^>]*?>.*?</li>'
re_item_compile = re.compile(re_item, re.DOTALL | re.U)

# Upper bound on the number of pages fetched in one run.
cpt = 10000
while cpt > 0:
    url = 'http://www.liberation.fr/recherche/?page=%s&q=%s' % (start, q)
    source = urllib.urlopen(url, proxies=p)
    try:
        s = source.read()
    finally:
        # Release the connection even when the read raises.
        source.close()

    items = re_item_compile.findall(s)
    if not items:
        # No entry matched on this page: stop (presumably past the last
        # page of results).
        break

    for i in items:
        for a in tl.get_author(i):
            # The dict is used like a set: only the keys carry information.
            json_loaded[a] = ''
    # Single parenthesised argument: prints the same under the Python 2
    # print statement as the original did.
    print(len(json_loaded))
    start += 1

    # Rewrite the whole file after every page, so the data on disk is never
    # more than one page behind.
    with open(json_author_path, 'w') as json_file:
        json.dump(json_loaded, json_file)

    # Sleep a random 1-7 seconds before the next request.
    time.sleep(random.uniform(1, 7))
    cpt -= 1