""" import sys import httplib import re import fetcher #url -- proceeding page url url = "http://dblp.uni-trier.de/db/conf/acl/acl2011.html" if (len(sys.argv) > 1): url = sys.argv[1] index = 0 contents = fetcher.fetch_webpage(url) paper_links = re.findall('<br><a href="(.*?)"><img alt="Electronic Edition"', contents) paper_names = re.findall('<b>(.*?)[\.|\?]', contents) if len(paper_names) != len(paper_links): print "fetch paper names & links error!" else: for paper_link in paper_links : if paper_link.endswith(".pdf") : d_link = paper_link else : real_path = re.findall('<a href="(.*?)"', fetcher.fetch_webpage(paper_link)) if len(real_path) == 0 : index = index + 1 continue d_link = real_path[0]
# Optional command-line overrides: argv[1] replaces pNo (substituted into the
# PROC_ID query below) and argv[2] replaces size (how many document indices
# are visited).
# NOTE(review): pNo and size are not assigned unconditionally in this part of
# the file; presumably defaults are set earlier -- confirm.
if len(sys.argv) > 1:
    pNo = sys.argv[1]
if len(sys.argv) > 2:
    size = int(sys.argv[2])

# URL templates. The tokens "pNo", "iNo" and "path" are placeholders that are
# filled in with str.replace(..., 1) below.
detail_list_url = "http://acm.lib.tsinghua.edu.cn/acm/Detail-List.nsp?&view=ACM&cid_PCODE=&cid_DOCTYPE=&cid_HASABSTRACT=&cid_HASFULLTEXT=&lastquery=(pNo):PROC_ID&sortfield=SECTION_SEQ_NO,SEQ_NO,PUBDATE&sortorder=ASCENDING,ASCENDING,ASCENDING&var_AUTHCODE=&var_PUBCODE=&var_BROWSECODE=&var_SOURCECODE=&recid=&reccode=&mailto=&docindex=iNo&var_SECTION=&numresults=25&fromrecord=&usertag="
content_url = "http://166.111.120.94/acm/ContentLoader.nsp?view=path"
file_url = "http://166.111.120.94/acm/path"

detail_list_url = detail_list_url.replace("pNo", pNo, 1)

for i in range(size):
    # Fetch the detail page for document index i.
    contents = fetcher.fetch_webpage(detail_list_url.replace("iNo", str(i), 1))
    paper_name = re.findall('<b><img src="img/spacer.gif"><br>(.*?)</b>', contents)

    # Collect every fl = "..." value on the page and keep only those ending
    # in ".pdf".
    paper_path = [
        candidate
        for candidate in re.findall('fl = "(.*?)"', contents)
        if candidate.endswith(".pdf")
    ]
    if not paper_path:
        # Nothing downloadable for this index.
        continue

    # The content-loader page redirects via top.location.replace("...");
    # extract the redirect target(s). Dots are escaped so they match only a
    # literal '.'.
    real_path = re.findall(r'top\.location\.replace\("(.*?)"', fetcher.fetch_webpage(content_url.replace("path", paper_path[0], 1)))