def _tester():
    courses = WebCrawler()
    # Iterate over a copy so removing TBA courses does not skip entries.
    for course in list(courses):
        if course.TBA():
            courses.remove(course)
    for i in range(40):
        l = random.sample(courses, 10)
        tables = _allPossibleTimeTables(l)
def get_image(self, keyword):
    soup = WebCrawler().get_soup(
        "https://www.google.co.kr/search?hl=en&tbm=isch&q=%s" % keyword)
    try:
        info = soup.find_all("img")
        # Start from index 1 because index 0 is Google's own logo image.
        index = random.randint(1, len(info) - 1)
        return info[index]["src"]  # return the image link
    except Exception:
        print("[오류] GSM Bot이 이미지를 가져올 수 없습니다.")
        return None
def main():
    wc = WebCrawler()
    wc.setDriverPath('chromedriver')
    wc.createDriver()
    searchTerm = 'irritable+bowel+disease'
    urlSequence = ('https://clinicaltrials.gov/ct2/results?cond=', searchTerm)
    url = ''.join(map(str, urlSequence))
    if wc.connectToURL(url, 'tab-body'):
        sleep(2)
        html = wc.getDriver().page_source
    else:
        print('Error connecting to URL')
    wc.killDriver()
def diff_show_post_lists(self):
    with open('showlist.csv') as show_file:
        show_list = [l.rstrip() for l in show_file.readlines()]
    with open('postlist.csv') as post_file:
        post_list = [l.rstrip() for l in post_file.readlines()]
    for show in show_list:
        if show not in post_list:
            curr_show = show.split(', ')
            crawler = WebCrawler()
            bot = RedditBot('dcibottest')
            recaps = crawler.get_show_recap_url(curr_show[2])
            shows = [crawler.parse_recap_table_2016(r) for r in recaps]
            bodies = [bot.parse_show_to_post_2016(s) for s in shows]
            single_body = bot.get_header(shows[0])
            single_body += '\n\n'.join(bodies)
            single_body += bot.get_legend()
            single_body += bot.get_footer()
            with open('redditoutput.text', 'w') as f:
                f.write(single_body)
            time_obj = strptime(curr_show[1][:10], '%Y-%m-%d')
            time_str = strftime('%m/%d/%y', time_obj)
            post_title = time_str + ' Scores: ' + curr_show[0]
            # bot.post_thread(post_title, single_body)
            with open('logging.txt', 'a') as log:
                print('new show ' + curr_show[0])
                log.write('added new show on {0}\n'.format(datetime.now()))
                log.write(show + '\n')
            # sleep(60)  # reddit post timeout?
    with open('postlist.csv', 'w') as p:
        p.write('\n'.join(show_list))
def get_calendar(self, dump):
    today = datetime.today()
    soup = WebCrawler().get_soup(
        "http://www.gsm.hs.kr/xboard/board.php?tbnum=4")
    try:
        info = soup.select("#xb_fm_list > div.calendar > ul > li > dl")
        result = "```"
        for i in info:
            if i.find("dd") is not None:
                text = i.text.replace("\n", "")
                result += "%6s -%s\n" % (text.split("-")[0], text.split("-")[1])
                for extra in text.split("-")[2:]:
                    result += "%7s -%s\n" % ("", extra)
        result += "```"
        return result
    except AttributeError:
        print("[오류] GSM Bot이 학사일정을 불러올 수 없습니다.")
        return "%s년 %s월 학사일정을 불러올 수 없습니다." % (today.year, today.month)
def get_hungry(self, dump):
    today = self.get_nextDay()
    nextMeal = self.get_nextMeal(today)
    item = ["아침", "점심", "저녁"]
    soup = WebCrawler().get_soup(
        "http://www.gsm.hs.kr/xboard/board.php?tbnum=8&sYear=%s&sMonth=%s"
        % (today.year, today.month))
    try:
        info = soup.select(
            "#xb_fm_list > div.calendar > ul > li > div > div.slider_food_list.slider_food%s.cycle-slideshow"
            % today.day)
        menuList = (info[0].find(
            "div",
            {
                "data-cycle-pager-template":
                    "<a href=#none; class=today_food_on%s title=%s></a>"
                    % (nextMeal % 3 + 1, item[nextMeal % 3])
            }).find("span", "content").text).split("\n")
        # Regular expression that skips the nutrition-information line.
        p = re.compile("(?!에너지)[가-힣]+")
        result = ""
        for i in menuList:
            if i.split() and p.match(i.split()[0]):
                result += "- " + i.split()[0] + "\n"
        # If result is empty, treat it as a failure.
        if not len(result):
            raise Exception
        return result
    except Exception:
        print("[오류] GSM Bot이 식단표를 받아올 수 없습니다.")
        return "%s 급식을 불러올 수 없습니다." % item[nextMeal % 3]
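# The three GSM-bot methods above (get_image, get_calendar, get_hungry) all go
# through a WebCrawler.get_soup helper that is not shown in these snippets.
# The sketch below is only an assumption about what such a helper might look
# like (requests + BeautifulSoup); the class name and method signature come
# from the calls above, everything else is hypothetical.
import requests
from bs4 import BeautifulSoup


class WebCrawler:
    def get_soup(self, url):
        # Fetch the page and parse it; return None on a network error so the
        # callers' except blocks handle the failure.
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            return None
        return BeautifulSoup(response.text, "html.parser")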
def executeCommand(webCrawler, command):
    commandParts = command.split(' ')
    if commandParts[0] == 'Start':
        if webCrawler is not None:
            webCrawler.processesShutDown()
        webCrawler = WebCrawler(commandParts[1], commandParts[2].split(','))
        webCrawler.processesEngage()
    if commandParts[0] == 'Stop':
        if webCrawler is not None:
            webCrawler.processesShutDown()
    return webCrawler
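# A usage sketch for executeCommand, assuming commands are typed interactively
# as "Start <seed-url> <domain1,domain2,...>" or "Stop". The loop itself and
# the "Quit" sentinel are hypothetical; only executeCommand comes from the
# snippet above.
def commandLoop():
    webCrawler = None
    while True:
        command = input('> ')
        if command == 'Quit':
            if webCrawler is not None:
                webCrawler.processesShutDown()
            break
        webCrawler = executeCommand(webCrawler, command)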
class Test(unittest.TestCase):

    def setUp(self):
        self.crawler = WebCrawler()

    def testWebCrawler(self):
        page = (
            '<div id="top_bin"><div id="top_content" class="width960">'
            '<div class="udacity float-left"><a href="http://udacity.com">')
        self.crawler.findUrlsInPage(page)
        self.crawler.findUrlsInPage(
            '<a href="http://udacity.com">Hello world</a>')

    def testWebCrawlerFile(self):
        # filename = "/Users/faraz/eclipse/python_workspace/hellopythonworld/udacity-source.htm"
        filename = "../udacity-source.htm"
        self.crawler.findUrlsInFile(filename)
class GameBase:
    HOST = "http://www.juxiangyou.com/"
    LOGIN_INDEX_URL = HOST + "fun/play/crazy28/index"
    VERIFY_URL = HOST + "verify"
    LOGIN_POST_URL = HOST + "login/auth"
    VERIFY_CODE_FILE_PATH = "/Img/verifyCode.png"
    LOGIN_CODE_SUCCEED = 10000
    LOAD_PAGES = 50
    CHECK_INTERVAL = 20

    webCrawler = WebCrawler()

    def __init__(self, is_auto_fire=False):
        self.dbHelper = DBHelper()
        # The most recently drawn round.
        self.latestRound = None
        # The round currently in progress.
        self.runningRound = None
        self.count_zhong = 0
        self.count_bian = 0
        self.count_xiao_bian = 0
        self.count_da_bian = 0
        self.is_internal_logged = False
        self.rules = []
        if is_auto_fire:
            self.rules.append(XiaoBianRule(self))
            self.rules.append(ZhongRule(self))
            self.rules.append(DaBianRule(self))
            self.rules.append(BianRule(self))
            self.rules.append(DanRule(self))
            self.rules.append(ShuangRule(self))
            self.rules.append(XiaoRule(self))
            self.rules.append(DaRule(self))
        # if GameBase.webCrawler is None:
        #     GameBase.webCrawler = WebCrawler()

    def get_http(self):
        return GameBase.webCrawler

    def get_header(self):
        return GameBase.get_static_header()

    @staticmethod
    def get_static_header():
        headers = {
            "Host": "www.juxiangyou.com",
            "Referer": "http://www.juxiangyou.com/",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/56.0.2914.3 Safari/537.36",
            "Content-Type": "application/x-www-form-urlencoded",
            "Upgrade-Insecure-Requests": "1",
            "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
            "Accept-Encoding": "gzip, deflate, sdch"}
        return headers

    @staticmethod
    def get_verify_code():
        r = GameBase.webCrawler.get(GameBase.LOGIN_INDEX_URL, GameBase.get_static_header())
        if "游戏期号" in r.text:
            GameBase.close_request(r)
            return True
        r.close()
        r = GameBase.webCrawler.get(GameBase.VERIFY_URL, GameBase.get_static_header())
        verify_img = os.path.curdir + GameBase.VERIFY_CODE_FILE_PATH
        if r.status_code == 200:
            # if os.path.exists(verify_img):
            #     os.remove(verify_img)
            with open(verify_img, 'wb+') as f:
                for block in r.iter_content(1024):
                    f.write(block)
            Logger.info("当前验证码路径:{}".format(os.path.abspath(verify_img)))
        GameBase.close_request(r)
        return False

    @staticmethod
    def close_request(r):
        if r is not None:
            r.close()

    @staticmethod
    def login(user, pwd, verify_code):
        data = (
            "jxy_parameter=%7B%22c%22%3A%22index%22%2C%22fun%22%3A%22login%22%2C%22account%22%3A%22{}%22%2C%22password" +
            "%22%3A%22{}%22%2C%22verificat_code%22%3A%22{}%22%2C%22is_auto%22%3Atrue%7D").format(
            user, pwd, verify_code)
        header = GameBase.get_static_header()
        header["Referer"] = "http://www.juxiangyou.com/login/index?redirectUrl=/fun/play/crazy28/index"
        r = GameBase.webCrawler.post(GameBase.LOGIN_POST_URL, data, header)
        Logger.info(r.text)
        code = r.json()["code"]
        GameBase.close_request(r)
        return GameBase.LOGIN_CODE_SUCCEED == code

    @staticmethod
    def login_action():
        is_login = GameBase.get_verify_code()
        if not is_login:
            code = input("请录入登录信息,格式 用户名 密码 验证码: ")
            user, pwd, code = code.split()
            is_login = GameBase.login(user, pwd, code)
            if not is_login:
                Logger.info("用户登录失败,请检查录入是否出错")
        return is_login

    @staticmethod
    def get_color_red(text):
        if text is not None:
            return colored(text, "red")
        return text

    @staticmethod
    def get_color_green(text):
        if text is not None:
            return colored(text, "green")
        return text

    def get_game_url(self):
        pass

    def get_table_name(self):
        pass

    def get_game_name(self):
        pass

    def get_rounds(self):
        # if not GameBase.login_action():
        #     return
        return self.get_pages(GameBase.LOAD_PAGES)

    def get_pages(self, page_num):
        result = False
        table_name = self.get_table_name()
        game_name = self.get_game_name()
        max_round = self.dbHelper.select_max_id(table_name)
        if self.runningRound is not None:
            if (datetime.datetime.now() - self.runningRound.date).seconds < GameBase.CHECK_INTERVAL:
                # Avoid flooding the log with duplicate messages.
                if not self.is_internal_logged:
                    Logger.info("游戏:{0},当前期:{1} 还没有开奖,直接返回 {2}".format(
                        GameBase.get_color_red(game_name),
                        GameBase.get_color_red(self.runningRound.id),
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                    self.is_internal_logged = True
                return result
        self.is_internal_logged = False
        is_end = False
        latest_round = None
        running_round = None
        for page in range(page_num):
            url = ("http://www.juxiangyou.com/fun/play/interaction/?jxy_parameter=%7B%22c%22%3A%22quiz%22%2C%22" +
                   "fun%22%3A%22getEachList%22%2C%22items%22%3A%22{}%22%2C%22pageSize%22%3A20%2C%22" +
                   "pageIndex%22%3A{}%7D&xtpl=fun%2Fprivate%2Fjc-index-tbl&params%5Bitems%5D={}").format(
                game_name, page + 1, game_name)
            r = GameBase.webCrawler.get(url, self.get_header())
            try:
                json = r.json()
            except Exception as e:
                if r is None:
                    Logger.error("JSON 解析出错:{0},JSON为空没法获取".format(e))
                else:
                    Logger.error("JSON 解析出错:{0},{1}".format(e, r.text))
                continue
            finally:
                GameBase.close_request(r)
            rounds = []
            if json is None or "itemList" not in json:
                continue
            # Records are ordered newest first.
            for item in json["itemList"]:
                num = int(item["num"])
                temp_round = RoundModel(int(item["num"]),
                                        "{0}-{1}".format(datetime.datetime.now().year, item["date"]),
                                        item["jcjg2"], int(item["jing"]), int(item["shou"]))
                # With more than one history record this becomes the earliest
                # record, not the most recently finished round.
                if page == 0:
                    if item["iskj"]:
                        if latest_round is None:
                            latest_round = temp_round
                    else:
                        running_round = temp_round
                if item["iskj"]:
                    if num <= max_round:
                        is_end = True
                        # Logger.info("开奖期号遍历结束,当前最新期号:{0}-{1}".format(max_round, game_name))
                        break
                    else:
                        rounds.append(
                            [num, "{0}-{1}".format(datetime.datetime.now().year, item["date"]), item["jcjg2"]])
            if len(rounds) > 0:
                self.dbHelper.insert(table_name, rounds)
                if len(rounds) == 1 and latest_round is not None:
                    win_result = latest_round.shou - latest_round.jing
                    item_str = "[{0},{1},{2}]".format(GameBase.get_color_red(latest_round.id),
                                                      latest_round.date,
                                                      GameBase.get_color_red(latest_round.value))
                    if win_result > 0:
                        Logger.info(
                            "历史数据 {0} 值:{1}, U豆 **** 投:{2}, 赚:{3} ****".format(
                                GameBase.get_color_red(game_name), item_str,
                                GameBase.get_color_red(latest_round.jing),
                                GameBase.get_color_green(win_result)))
                    else:
                        Logger.info(
                            "历史数据 {0} 值:{1}, U豆 **** 投:{2}, 赚:{3} ****".format(
                                GameBase.get_color_red(game_name), item_str,
                                GameBase.get_color_red(latest_round.jing),
                                GameBase.get_color_red(win_result)))
                else:
                    Logger.info("{0} - 历史数据 {1}:{2}条".format(datetime.datetime.now(), game_name, len(rounds)))
            if is_end:
                break
        self.latestRound = latest_round
        if self.runningRound is not None and running_round is not None and \
                self.runningRound.id == running_round.id:
            result = False
        else:
            result = True
        self.runningRound = running_round
        return result

    def get_rows(self, html):
        """Parse rows such as:

        <tr>
            <td>589161</td>
            <td>01-16 09:41</td>
            <td class="num">0+1+5=<span class="ball-num">6</span></td>
            <td>8,617,616,861<span class="udou"></span></td>
            <td style="color:#ff4c4c">
                <a style="color:#ff4c4c" class="win-list" href="/fun/play/crazy28/zjrs?id=589161">359</a>
            </td>
            <td class="st-td">
                <span class="udou"></span>
                <span class="shou kui">收:0</span><br />
                <span class="jing">竞:0</span>
            </td>
            <td><span class="yikai">已开奖</span></td>
        </tr>

        :param html: HTML fetched from the crazy28 index page
        :return: one entry per record: round id, time, value
        """
        bs = bs4.BeautifulSoup(html, "lxml")
        result = []
        for row in bs.table.children:
            if row == "\n":
                continue
"\n": continue item = [] i = 0 for td in row.children: if td == "\n": continue if i == 0: if td.text.isdigit(): id = int(td.text) if id <= self.max_round: return result item.append(id) else: break elif i == 1: item.append("{}-{}".format(datetime.datetime.now().year, td.text)) pass elif i == 2 and "class" in td.attrs and td.attrs["class"][0] == 'num' \ and len(td.contents) == 3: span = td.contents[1] if span != "\n" and "class" in span.attrs and span.attrs["class"][0] == "ball-num" and \ span.text.isdigit(): item.append(int(span.text)) break else: item = [] i += 1 if len(item) > 0: result.append(item) return result def post_next_round(self): for item in self.rules: if item.start(): time.sleep(3) pass
def main():
    crawler = WebCrawler()
    test = False
    done = False
    if test:
        crawler.test()
    else:
        while not done:
            print("Webcrawler Options")
            print("1. Crawl Web URL")
            print("2. Search Crawled Sites")
            print("3. View names of URL's Crawled")
            print("4. View words crawled")
            print("5. Delete databases")
            print("6. View Errors")
            print("7. View Stats")
            print("8. Exit")
            option = input("Select number: ")
            if option == "1":
                url = input("URL: ")
                depth = int(input("Depth: "))
                crawler.crawlURL(url, depth)
            elif option == "2":
                word = input("Search for links that contain the word: ")
                word = word.split()
                word = word[0]
                word = word.lower()
                crawler.searchWords(word)
            elif option == "3":
                crawler.printURLS()
            elif option == "4":
                crawler.printWords()
            elif option == '5':
                crawler.delete()
            elif option == '6':
                crawler.printErrors()
            elif option == '7':
                crawler.printStats()
            elif option == '8':
                print("Exiting WebCrawler, Goodbye...")
                exit()
            elif option == "9":
                crawler.printDoubles()
            else:
                print("Invalid Input, try again with a number.")
for o, a in OPTS:
    if o in ("-d", "--depth"):
        DEPTH = int(a)
    elif o in ("-o", "--outside"):
        GO_OUTSIDE = True
    elif o in ("-u", "--output"):
        OUTPUT = a
        print("aaaaa : ", a)
    else:
        print("Error : -d for depth, -o for go_outside, -u for output")

if GO_OUTSIDE == '':
    GO_OUTSIDE = False
if DEPTH == '':
    DEPTH = 2
if OUTPUT == '':
    OUTPUT = "results"

CRAWLER = WebCrawler(URL, DEPTH, GO_OUTSIDE, OUTPUT)
CRAWLER.crawl()
# print("Dictionary : ", CRAWLER.dictionary)

print("Save the crawling ? (y/[n]) : ")
if input() == 'y':
    CRAWLER.save()

print("Load the crawling ? (y/[n]) : ")
if input() == 'y':
    print("Dictionary: ", CRAWLER.load())
import logging

from WebCrawler import WebCrawler
import controller

sgxCrawler = WebCrawler()
config = controller.configurations()
logging.basicConfig(filename='logOutput.log', level=config.logging_level)


def menuOptions(menu):
    if menu == 'main':
        print('{} Download SGX Data'.format(1))
        print('{} Get Failed Downloads'.format(2))
        print('{} Initiate Automation Script'.format(3))
        print('{} Show Configurations '.format(4))
        print('{} Help'.format(5))
        print('{} Exit'.format(0))


ASK_OPTION = 'Please Select an option: '

if __name__ == '__main__':
    options = 1
    while options != 0:
        print('Welcome to SGX Derivatives Downloader \n')
        menuOptions('main')
        try:
def setUp(self):
    self.crawler = WebCrawler()
from WebCrawler import WebCrawler

startLink = input("Please enter a starting web address: ")
keyword = input("Please enter a keyword to search for: ")

crawler = WebCrawler(keyword, startLink)

while True:
    print("Getting a web page, please wait:", crawler.currentWebAddress)
    crawler.getCurrentPage()
    if crawler.findKeyword():
        break
    crawler.nextPage()

print("Keyword found on the following page:", crawler.currentWebAddress)
        self.automated_scheduled_time = configParser.get(
            'Head', 'AUTOMATED_SCHEDULED_TIME')

    def printConfigurations(self):
        print('________Configurations________')
        print('LOGGING LEVEL: {}'.format(self.logging_level))
        print('AUTO SCHEDULE TIME: {}'.format(self.automated_scheduled_time))
        print('LOWER LIMIT FOR AUTOMATED DAYS: {}'.format(
            self.automated_days_download))
        print('RETRY_AUTOMATED_DOWNLOADS: {}'.format(
            self.retry_Automated_downloads))
        print('NUMBER_RETRY: {}'.format(self.number_retry))


config = configurations()
sgxCrawler = WebCrawler()


def AutoMode():
    print('Fetching Data Automatically')
    schedule.every().day.at(config.automated_scheduled_time).do(AutoUpdate)
    while True:
        try:
            schedule.run_pending()
            sys.stdout.write('\r')
            sys.stdout.write('automation standby .')
            time.sleep(1)
            sys.stdout.write('\r')
            sys.stdout.write('automation standby ..')
            time.sleep(1)
            sys.stdout.write('\r')
from WebCrawler import WebCrawler
from IndexEngine import IndexEngine

if __name__ == '__main__':
    webCrawler = WebCrawler(10, engine=IndexEngine())
    webCrawler.addSeeds('http://www.gnulinuxmag.com',
                        'http://www.linux-pratique.com')
    webCrawler.start()
if __name__ == "__main__":
    try:
        OPTS, ARGS = getopt.getopt(sys.argv[1:], "i:k:", ["input=", "keyword="])
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)

    for o, a in OPTS:
        if o in ("-i", "--input"):
            INPUT = a
        elif o in ("-k", "--keyword"):
            KEYWORD = a
        else:
            print("Error : -i for input, -k for keyword")

    CRAWLER = WebCrawler(URL, None, None, None, INPUT, KEYWORD)
    CRAWLER.load()
    print("Dictionary : ", CRAWLER.dictionary)

    print("Save the crawling ? (o/[n]) : ")
    if input() == 'o':
        print("Folder : ")
        CRAWLER.save()

    print("Load the crawling ? (o/[n]) : ")
    if input() == 'o':
        print("Folder : ")
        CRAWLER.load()
def main():
    web_crawler = WebCrawler('Google', 'http://www.google.com')
    web_crawler.run()
from WebCrawler import WebCrawler

# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    landmark = '白馬塔'
    searchEngine = 'Bing'

    if searchEngine == '百度':  # Baidu
        url = 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=' \
              + landmark
        xpath = '//div[@id="imgid"]/div/ul/li/div/a/img'
    elif searchEngine == '搜狐':  # Sogou image search
        url = 'https://pic.sogou.com/pics?query=' + landmark + '&di=2&_asf=pic.sogou.com&w=05009900'
        xpath = '//div[@class="figure-result"]/ul/li/div/a/img'
    elif searchEngine == 'Google':  # Google
        url = 'https://www.google.com.tw/search?q=' + landmark \
              + '&tbm=isch&hl=zh-TW&tbs&sa=X&ved=0CAEQpwVqFwoTCKj3nbCZm-0CFQAAAAAdAAAAABAC&biw=1279&bih=977'
        xpath = '//img[@class="rg_i Q4LuWd"]'
    else:  # Bing
        url = 'https://www.bing.com/images/search?q=' + landmark \
              + '&form=HDRSC2&first=1&tsc=ImageBasicHover&scenario=ImageBasicHover'
        xpath = '//*[@id="mmComponent_images_2"]/ul/li/div/div/a/div/img'

    var = WebCrawler(url, xpath, landmark)
def setUp(self):
    self.spider = WebCrawler("robot",
                             "http://en.wikipedia.org/wiki/Robots_exclusion_standard")
def testParstInternetArchive(self):
    self.spider = WebCrawler("robot",
                             "http://en.wikipedia.org/wiki/internet_archive")
    self.spider.getCurrentPage()
    self.spider.findKeyword()
class TestWebCrawler(unittest.TestCase):

    def setUp(self):
        self.spider = WebCrawler("robot",
                                 "http://en.wikipedia.org/wiki/Robots_exclusion_standard")

    def tearDown(self):
        del self.spider

    def testInstantiateKeyWord(self):
        self.assertEqual(self.spider.keyWord, "robot")

    def testInstantiateWebAddress(self):
        self.assertEqual(self.spider.currentWebAddress,
                         "http://en.wikipedia.org/wiki/Robots_exclusion_standard")

    def testGetCurrentPage(self):
        self.spider.getCurrentPage()
        self.assertEqual(self.spider.currentPage.getcode(), 200)

    def testFindKeyWord(self):
        self.spider.getCurrentPage()
        self.assertEqual(self.spider.findKeyword(), True)

    def testParseLinks(self):
        self.spider.getCurrentPage()
        self.spider.findKeyword()
        self.assertEqual(len(self.spider.links), 98)

    def testGetNextWebpage(self):
        self.spider.getCurrentPage()
        self.spider.findKeyword()
        self.spider.nextPage()
        self.assertEqual(self.spider.currentWebAddress,
                         "http://en.wikipedia.org/wiki/MediaWiki:Robots.txt")
        self.assertEqual(len(self.spider.links), 97)
        self.spider.getCurrentPage()
        self.assertEqual(self.spider.currentPage.getcode(), 200)

    def testParstInternetArchive(self):
        self.spider = WebCrawler("robot",
                                 "http://en.wikipedia.org/wiki/internet_archive")
        self.spider.getCurrentPage()
        self.spider.findKeyword()

    def testFollowRobotDotTxt(self):
        testLinks = list()
        testLinks.append("/wiki/Special:Search")
        testLinks.append("/wiki/computers")
        self.spider.links.extend(testLinks)
        self.spider.nextPage()
        self.assertEqual(self.spider.currentWebAddress,
                         "http://en.wikipedia.org/wiki/computers")

    def testDontParseDuplicatPage(self):
        testLinks = list()
        testLinks.append("/wiki/computers")
        testLinks.append("/wiki/computers")
        testLinks.append("/wiki/computers_hard_drives")
        self.spider.links.extend(testLinks)
        self.spider.nextPage()
        self.spider.nextPage()
        self.assertEqual(self.spider.currentWebAddress,
                         "http://en.wikipedia.org/wiki/computers_hard_drives")

    def testParseTheUnready(self):
        testLinks = list()
        testLinks.append("/wiki/%C3%86thelred_the_Unready")
        self.spider.links.extend(testLinks)
        self.spider.nextPage()
        self.assertEqual(self.spider.currentWebAddress,
                         "http://en.wikipedia.org/wiki/%C3%86thelred_the_Unready")
def getStockData(self, baseURL, endpoint, ticker, credentials, item):
    sequence = (baseURL, ticker, '&page=', endpoint, '&range=24')
    url = ''.join(map(str, sequence))
    wc = WebCrawler()
    wc.setDriverPath('chromedriver')
    wc.createDriver()
    wc.briefingLogin([credentials[0], credentials[1]])
    if wc.connectToURL(url):
        sleep(2)
        html = wc.getDriver().page_source
    else:
        raise Exception('Unable to connect to Briefing.com')
    wc.briefingLogout()
    wc.killDriver()
    return html
from lxml import html
from WebCrawler import WebCrawler
import os

# This URL should point to the root directory page of the desired data dump.
DUMP_URL = 'https://wikileaks.org/ciav7p1/cms/index.html'
URL_PREFIX = '/'.join(DUMP_URL.split('/')[0:len(DUMP_URL.split('/')) - 1])

header = {'User-Agent': 'web-crawler'}
wc = WebCrawler(header, 10)

tree = wc.requestHTML(DUMP_URL)
fileCategories = tree.xpath('//div[@id="uniquer"]//h3/text()')
fileURLs = tree.xpath('//div[@id="uniquer"]/ul//table//td//div//a/@href')
fileTitles = tree.xpath('//div[@id="uniquer"]/ul//table//td//div//a/text()')

# Format file titles so they can be file names.
for i in range(0, len(fileTitles)):
    fileTitles[i] = fileTitles[i].replace('/', '-')
    fileTitles[i] = fileTitles[i].replace('"', '')
    fileTitles[i] = fileTitles[i].replace('\'', '')

skipExistingData = input(
    'Would you like to skip scraping files that already exist in output?(y/n)')
if skipExistingData == 'y':
    # Check to see if any of the files found on dump page have already been
    # scraped.
    pEF = os.listdir('output/')  # pEF = preExistingFiles
    for i in range(0, len(fileTitles)):