Example #1
def _tester():
    courses = WebCrawler()
    # Filter out TBA courses; removing items from a list while iterating over it skips elements
    courses = [course for course in courses if not course.TBA()]
    for _ in range(40):
        sample = random.sample(courses, 10)
        tables = _allPossibleTimeTables(sample)
Example #2
    def get_image(self, keyword):
        soup = WebCrawler().get_soup(
            "https://www.google.co.kr/search?hl=en&tbm=isch&q=%s" % keyword)

        try:
            info = soup.find_all("img")
            # start at index 1 because index 0 is Google's own logo image
            index = random.randint(1, len(info) - 1)
            return info[index]["src"]  # return the chosen image's link

        except Exception:
            print("[오류] GSM Bot이 이미지를 가져올 수 없습니다.")
            return None
Example #3
def main():

    wc = WebCrawler()
    wc.setDriverPath('chromedriver')
    wc.createDriver()

    searchTerm = 'irritable+bowel+disease'

    urlSequence = ('https://clinicaltrials.gov/ct2/results?cond=', searchTerm)
    url = ''.join(map(str, urlSequence))

    if wc.connectToURL(url, 'tab-body'):
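        # brief pause so dynamically rendered content can finish loading before reading the page source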
        sleep(2)
        html = wc.getDriver().page_source
    else:
        print('Error connecting to URL')

    wc.killDriver()
Example #4
    def diff_show_post_lists(self):
        with open('showlist.csv', 'rb') as show_file:
            show_list = [l.rstrip() for l in show_file]

        with open('postlist.csv', 'rb') as post_file:
            post_list = [l.rstrip() for l in post_file]

        for show in show_list:
            if show not in post_list:
                curr_show = show.split(', ')
                crawler = WebCrawler()
                bot = RedditBot('dcibottest')

                recaps = crawler.get_show_recap_url(curr_show[2])
                shows = [crawler.parse_recap_table_2016(r) for r in recaps]
                bodies = [bot.parse_show_to_post_2016(s) for s in shows]

                single_body = bot.get_header(shows[0])
                single_body += '\n\n'.join(bodies)
                single_body += bot.get_legend()
                single_body += bot.get_footer()

                with open('redditoutput.text', 'wb') as f:
                    f.write(single_body)

                time_obj = strptime(curr_show[1][:10], '%Y-%m-%d')
                time_str = strftime('%m/%d/%y', time_obj)
                post_title = time_str + ' Scores: ' + curr_show[0]

#                bot.post_thread(post_title, single_body)

                with open('logging.txt', 'ab') as log:
                    print 'new show ' + curr_show[0]
                    log.write('added new show on {0}\n'.format(datetime.now()))
                    log.write(show + '\n')

                # sleep(60) # reddit post timeout?

        with open('postlist.csv', 'wb') as p:
            p.write('\n'.join(show_list))
Example #5
    def get_calendar(self, dump):
        today = datetime.today()

        soup = WebCrawler().get_soup(
            "http://www.gsm.hs.kr/xboard/board.php?tbnum=4")

        try:
            info = soup.select("#xb_fm_list > div.calendar > ul > li > dl")

            result = "```"
            for entry in info:
                if entry.find("dd") is not None:
                    text = entry.text.replace("\n", "")
                    parts = text.split("-")
                    result += "%6s -%s\n" % (parts[0], parts[1])
                    for extra in parts[2:]:
                        result += "%7s -%s\n" % ("", extra)
            result += "```"
            return result

        except AttributeError:
            print("[오류] GSM Bot이 학사일정을 불러올 수 없습니다.")
            return "%s년 %s월 학사일정을 불러올 수 없습니다." % (today.year, today.month)
Example #6
    def get_hungry(self, dump):
        today = self.get_nextDay()
        nextMeal = self.get_nextMeal(today)
        item = ["아침", "점심", "저녁"]

        soup = WebCrawler().get_soup(
            "http://www.gsm.hs.kr/xboard/board.php?tbnum=8&sYear=%s&sMonth=%s"
            % (today.year, today.month))

        try:
            info = soup.select(
                "#xb_fm_list > div.calendar > ul > li > div > div.slider_food_list.slider_food%s.cycle-slideshow"
                % today.day)
            menuList = (info[0].find(
                "div", {
                    "data-cycle-pager-template":
                    "<a href=#none; class=today_food_on%s title=%s></a>" %
                    (nextMeal % 3 + 1, item[nextMeal % 3])
                }).find("span", "content").text).split("\n")

            p = re.compile("(?!에너지)[가-힣]+")  # regular expression that filters out the nutrition-information line

            result = ""
            for i in menuList:
                if p.match(i.split()[0]):
                    result += ("- " + i.split()[0] + "\n")

            # if no menu items were extracted, fall through to the error handler
            if not result:
                raise Exception

            return result

        except Exception:
            print("[오류] GSM Bot이 식단표를 받아올 수 없습니다.")
            return "%s 급식을 불러올 수 없습니다." % item[nextMeal % 3]
Example #7
def executeCommand(webCrawler, command):
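    # Expected commands: "Start <arg1> <comma,separated,list>" to (re)start a crawler, or "Stop" to shut it down.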
    commandParts = command.split(' ')
    if commandParts[0] == 'Start':
        if webCrawler is not None:
            webCrawler.processesShutDown()
        webCrawler = WebCrawler(commandParts[1], commandParts[2].split(','))
        webCrawler.processesEngage()
    elif commandParts[0] == 'Stop':
        if webCrawler is not None:
            webCrawler.processesShutDown()
    return webCrawler
Example #8
class Test(unittest.TestCase):
    def setUp(self):
        self.crawler = WebCrawler()

    def testWebCrawler(self):
        page = (
            '<div id="top_bin"><div id="top_content" class="width960">'
            '<div class="udacity float-left"><a href="http://udacity.com">')
        self.crawler.findUrlsInPage(page)

        self.crawler.findUrlsInPage(
            '<a href="http://udacity.com">Hello world</a>')
        pass

    def testWebCrawlerFile(self):
        #         filename = "/Users/faraz/eclipse/python_workspace/hellopythonworld/udacity-source.htm"
        filename = "../udacity-source.htm"
        self.crawler.findUrlsInFile(filename)
Example #9
class GameBase:
    HOST = "http://www.juxiangyou.com/"
    LOGIN_INDEX_URL = HOST + "fun/play/crazy28/index"
    VERIFY_URL = HOST + "verify"
    LOGIN_POST_URL = HOST + "login/auth"
    VERIFY_CODE_FILE_PATH = "/Img/verifyCode.png"
    LOGIN_CODE_SUCCEED = 10000
    LOAD_PAGES = 50
    CHECK_INTERVAL = 20
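    # class-level WebCrawler shared by all GameBase instances and by the static helpers below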
    webCrawler = WebCrawler()

    def __init__(self, is_auto_fire=False):
        self.dbHelper = DBHelper()
        # the most recently finished (drawn) round
        self.latestRound = None
        # the round currently in progress
        self.runningRound = None
        self.count_zhong = 0
        self.count_bian = 0
        self.count_xiao_bian = 0
        self.count_da_bian = 0
        self.is_internal_logged = False

        self.rules = []
        if is_auto_fire:
            self.rules.append(XiaoBianRule(self))
            self.rules.append(ZhongRule(self))
            self.rules.append(DaBianRule(self))
            self.rules.append(BianRule(self))
            self.rules.append(DanRule(self))
            self.rules.append(ShuangRule(self))
            self.rules.append(XiaoRule(self))
            self.rules.append(DaRule(self))
            # if GameBase.webCrawler is None:
            #     GameBase.webCrawler = WebCrawler()

    def get_http(self):
        return GameBase.webCrawler

    def get_header(self):
        return GameBase.get_static_header()

    @staticmethod
    def get_static_header():
        headers = {"Host": "www.juxiangyou.com",
                   "Referer": "http://www.juxiangyou.com/",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                   "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2914.3 Safari/537.36",
                   "Content-Type": "application/x-www-form-urlencoded",
                   "Upgrade-Insecure-Requests": "1",
                   "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
                   "Accept-Encoding": "gzip, deflate, sdch"}
        return headers

    @staticmethod
    def get_verify_code():
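        # Returns True if already logged in (the index page contains 游戏期号); otherwise saves the captcha image locally and returns False.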
        r = GameBase.webCrawler.get(GameBase.LOGIN_INDEX_URL, GameBase.get_static_header())
        if "游戏期号" in r.text:
            GameBase.close_request(r)
            return True
        r.close()
        r = GameBase.webCrawler.get(GameBase.VERIFY_URL, GameBase.get_static_header())
        verify_img = os.path.curdir + GameBase.VERIFY_CODE_FILE_PATH
        if r.status_code == 200:
            # if os.path.exists(verify_img):
            #     os.remove(verify_img)
            with open(verify_img, 'wb+') as f:
                for block in r.iter_content(1024):
                    f.write(block)
                Logger.info("当前验证码路径:{}".format(os.path.abspath(verify_img)))

        GameBase.close_request(r)
        return False

    @staticmethod
    def close_request(r):
        if r is not None:
            r.close()

    @staticmethod
    def login(user, pwd, verify_code):
        data = (
            "jxy_parameter=%7B%22c%22%3A%22index%22%2C%22fun%22%3A%22login%22%2C%22account%22%3A%22{}%22%2C%22password" + \
            "%22%3A%22{}%22%2C%22verificat_code%22%3A%22{}%22%2C%22is_auto%22%3Atrue%7D").format(
            user, pwd, verify_code)
        header = GameBase.get_static_header()
        header["Referer"] = "http://www.juxiangyou.com/login/index?redirectUrl=/fun/play/crazy28/index"
        r = GameBase.webCrawler.post(GameBase.LOGIN_POST_URL, data, header)
        Logger.info(r.text)
        a = r.json()["code"]
        GameBase.close_request(r)
        return GameBase.LOGIN_CODE_SUCCEED == a

    @staticmethod
    def login_action():
        is_login = GameBase.get_verify_code()
        if not is_login:
            code = input("请录入登录信息,格式 用户名  密码   验证码:  ")
            user, pwd, code = code.split()
            is_login = GameBase.login(user, pwd, code)
        if not is_login:
            Logger.info("用户登录失败,请检查录入是否出错")
        return is_login

    @staticmethod
    def get_color_red(text):
        if text is not None:
            return colored(text, "red")
        return text

    @staticmethod
    def get_color_green(text):
        if text is not None:
            return colored(text, "green")
        return text

    def get_game_url(self):
        pass

    def get_table_name(self):
        pass

    def get_game_name(self):
        pass

    def get_rounds(self):
        # if not GameBase.login_action():
        #     return
        return self.get_pages(GameBase.LOAD_PAGES)

    def get_pages(self, page_num):
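        # Walk up to page_num pages of results, insert newly finished rounds into the DB,
        # and return True when the running round has changed.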
        result = False
        table_name = self.get_table_name()
        game_name = self.get_game_name()
        max_round = self.dbHelper.select_max_id(table_name)
        if self.runningRound is not None:
            if (datetime.datetime.now() - self.runningRound.date).seconds < GameBase.CHECK_INTERVAL:
                # avoid flooding the log with duplicate messages
                if not self.is_internal_logged:
                    Logger.info("游戏:{0},当前期:{1} 还没有开奖,直接返回 {2}".format(
                        GameBase.get_color_red(game_name), GameBase.get_color_red(self.runningRound.id),
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                    self.is_internal_logged = True
                return result

        self.is_internal_logged = False
        is_end = False

        latest_round = None
        running_round = None

        for page in range(page_num):
            url = ("http://www.juxiangyou.com/fun/play/interaction/?jxy_parameter=%7B%22c%22%3A%22quiz%22%2C%22" + \
                   "fun%22%3A%22getEachList%22%2C%22items%22%3A%22{}%22%2C%22pageSize%22%3A20%2C%22" + \
                   "pageIndex%22%3A{}%7D&xtpl=fun%2Fprivate%2Fjc-index-tbl&params%5Bitems%5D={}"). \
                format(game_name, page + 1, game_name)
            r = GameBase.webCrawler.get(url, self.get_header())
            try:
                json = r.json()
            except Exception as e:
                if r is None:
                    Logger.error("JSON 解析出错:{0},JSON为空没法获取".format(e))
                else:
                    Logger.error("JSON 解析出错:{0},{1}".format(e, r.text))
                continue
            finally:
                GameBase.close_request(r)

            rounds = []
            if json is None or "itemList" not in json:
                continue

            # the data is ordered newest-first
            for item in json["itemList"]:
                num = int(item["num"])
                temp_round = RoundModel(int(item["num"]), "{0}-{1}".format(datetime.datetime.now().year, item["date"]),
                                        item["jcjg2"], int(item["jing"]), int(item["shou"]))

                # with more than one new history record this captures the earliest one, not the most recently finished round
                if page == 0:
                    if item["iskj"]:
                        if latest_round is None:
                            latest_round = temp_round
                    else:
                        running_round = temp_round

                if item["iskj"]:
                    if num <= max_round:
                        is_end = True
                        # Logger.info("开奖期号遍历结束,当前最新期号:{0}-{1}".format(max_round, game_name))
                        break
                    else:
                        rounds.append(
                            [num, "{0}-{1}".format(datetime.datetime.now().year, item["date"]), item["jcjg2"]])

            if len(rounds) > 0:
                self.dbHelper.insert(table_name, rounds)
                if len(rounds) == 1 and latest_round is not None:
                    win_result = latest_round.shou - latest_round.jing
                    item_str = "[{0},{1},{2}]".format(
                        GameBase.get_color_red(latest_round.id), latest_round.date,
                        GameBase.get_color_red(latest_round.value))
                    if win_result > 0:
                        Logger.info(
                            "历史数据 {0} 值:{1}, U豆   **** 投:{2}, 赚:{3} ****".format(GameBase.get_color_red(game_name), item_str,
                                                                                 GameBase.get_color_red(latest_round.jing),
                                                                                 GameBase.get_color_green(win_result)))
                    else:
                        Logger.info(
                            "历史数据 {0} 值:{1}, U豆   **** 投:{2}, 赚:{3} ****".format(GameBase.get_color_red(game_name), item_str,
                                                                                 GameBase.get_color_red(latest_round.jing),
                                                                                 GameBase.get_color_red(win_result)))
                else:
                    Logger.info("{0} - 历史数据 {1}:{2}条".format(datetime.datetime.now(), game_name, len(rounds)))
            if is_end:
                break

        self.latestRound = latest_round
        if self.runningRound is not None and running_round is not None and \
                        self.runningRound.id == running_round.id:
            result = False
        else:
            result = True
        self.runningRound = running_round
        return result

    def get_rows(self, html):
        """
    <tr>
        <td>589161</td>
        <td>01-16 09:41</td>
        <td class="num">

            0+1+5=<span class="ball-num">6</span>

        </td>
        <td>
            8,617,616,861<span class="udou"></span>
        </td>
        <td style="color:#ff4c4c">

            <a style="color:#ff4c4c" class="win-list" href="/fun/play/crazy28/zjrs?id=589161">359</a>

        </td>
        <td class="st-td">
            <span class="udou"></span>
            <span class="shou kui">收:0</span><br />
            <span class="jing">竞:0</span>
        </td>
        <td>

            <span class="yikai">已开奖</span>

        </td>
    </tr>
        :param html:  html get from crazy28 index page
        :return: one entry per row: round number, time, value
        """
        bs = bs4.BeautifulSoup(html, "lxml")
        result = []
        for row in bs.table.children:
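            # .children also yields the "\n" text nodes between tags; skip them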
            if row == "\n":
                continue
            item = []
            i = 0
            for td in row.children:
                if td == "\n":
                    continue
                if i == 0:
                    if td.text.isdigit():
                        id = int(td.text)
                        if id <= self.max_round:
                            return result
                        item.append(id)
                    else:
                        break
                elif i == 1:
                    item.append("{}-{}".format(datetime.datetime.now().year, td.text))
                    pass
                elif i == 2 and "class" in td.attrs and td.attrs["class"][0] == 'num' \
                        and len(td.contents) == 3:
                    span = td.contents[1]
                    if span != "\n" and "class" in span.attrs and span.attrs["class"][0] == "ball-num" and \
                            span.text.isdigit():
                        item.append(int(span.text))
                        break
                    else:
                        item = []
                i += 1

            if len(item) > 0:
                result.append(item)
        return result

    def post_next_round(self):
        for item in self.rules:
            if item.start():
                time.sleep(3)
        pass
Example #10
def main():

    crawler = WebCrawler()

    test = False
    done = False

    if test:
        crawler.test()
    else:
        while (not done):
            print("Webcrawler Options")
            print("1. Crawl Web URL")
            print("2. Search Crawled Sites")
            print("3. View names of URLs Crawled")
            print("4. View words crawled")
            print("5. Delete databases")
            print("6. View Errors")
            print("7. View Stats")
            print("8. Exit")
            option = input("Select number: ")
            if (option == "1"):
                url = input("URL: ")
                depth = int(input("Depth: "))
                crawler.crawlURL(url, depth)
            elif (option == "2"):
                word = input("Search for links that contain the word: ")
                word = word.split()
                word = word[0]
                word = word.lower()
                crawler.searchWords(word)
            elif (option == "3"):
                crawler.printURLS()
            elif (option == "4"):
                crawler.printWords()
            elif (option == '5'):
                crawler.delete()
            elif (option == '6'):
                crawler.printErrors()
            elif (option == '7'):
                crawler.printStats()
            elif (option == '8'):
                print("Exiting WebCrawler, Goodbye...")
                exit()
            elif (option == "9"):
                crawler.printDoubles()
            else:
                print("Invalid Input, try again with a number.")
Example #11
    for o, a in OPTS:
        if o in ("-d", "-depth"):
            DEPTH = int(a)
        elif o in ("-o", "-outside"):
            GO_OUTSIDE = True
        elif o in ("-u", "-output"):
            OUTPUT = a
            print "Output : ", a
        else:
            print "Error : -d for depth, -o for go_outside, -u for output"

    if GO_OUTSIDE == '':
        GO_OUTSIDE = False
    if DEPTH == '':
        DEPTH = 2
    if OUTPUT == '':
        OUTPUT = "results"

    CRAWLER = WebCrawler(URL, DEPTH, GO_OUTSIDE, OUTPUT)
    CRAWLER.crawl()

    #print("Dictionary : ", CRAWLER.dictionary)

    print "Save the crawling ? (y/[n]) : "
    if raw_input() == 'y':
        CRAWLER.save()

    print "Load the crawling ? (y/[n]) : "
    if raw_input() == 'y':
        print("Dictionary: ", CRAWLER.load())
Example #12
import logging
from WebCrawler import WebCrawler
import controller

sgxCrawler = WebCrawler()
config = controller.configurations()

logging.basicConfig(filename='logOutput.log', level=config.logging_level)


def menuOptions(menu):
    if (menu == 'main'):
        print('{} Download SGX Data'.format(1))
        print('{} Get Failed Downloads'.format(2))
        print('{} Initiate Automation Script'.format(3))
        print('{} Show Configurations '.format(4))
        print('{} Help'.format(5))
        print('{} Exit'.format(0))


ASK_OPTION = 'Please Select an option: '

if __name__ == '__main__':

    options = 1

    while (options != 0):
        print('Welcome to SGX Derivatives Downloader \n')

        menuOptions('main')
        try:
Example #13
    def setUp(self):
        self.crawler = WebCrawler()
Example #14
from WebCrawler import WebCrawler

startLink = raw_input("Please enter a starting web address: ")
keyword = raw_input("Please enter a keyword to search for: ")
crawler = WebCrawler(keyword, startLink)

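# Follow links from the start page until a page containing the keyword is found.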
while True:
	print "Getting a web page, please wait: ", crawler.currentWebAddress
	crawler.getCurrentPage()
	if crawler.findKeyword():
		break
	crawler.nextPage()

print "Keyword found on the following page:", crawler.currentWebAddress
Example #15
        self.automated_scheduled_time = configParser.get(
            'Head', 'AUTOMATED_SCHEDULED_TIME')

    def printConfigurations(self):
        print('________Configurations________')
        print('LOGGING LEVEL: {}'.format(self.logging_level))
        print('AUTO SCHEDULE TIME: {}'.format(self.automated_scheduled_time))
        print('LOWER LIMIT FOR AUTOMATED DAYS: {}'.format(
            self.automated_days_download))
        print('RETRY_AUTOMATED_DOWNLOADS: {}'.format(
            self.retry_Automated_downloads))
        print('NUMBER_RETRY: {}'.format(self.number_retry))


config = configurations()
sgxCrawler = WebCrawler()


def AutoMode():
    print('Fetching Data Automatically')
    schedule.every().day.at(config.automated_scheduled_time).do(AutoUpdate)
    while True:
        try:
            schedule.run_pending()
            sys.stdout.write('\r')
            sys.stdout.write('automation standby .')
            time.sleep(1)
            sys.stdout.write('\r')
            sys.stdout.write('automation standby ..')
            time.sleep(1)
            sys.stdout.write('\r')
Example #16
from WebCrawler import WebCrawler
from IndexEngine import IndexEngine

if __name__ == '__main__':
    webCrawler = WebCrawler(10, engine=IndexEngine())
    webCrawler.addSeeds('http://www.gnulinuxmag.com', 'http://www.linux-pratique.com')
    webCrawler.start()
Example #17
if __name__ == "__main__":
    try:
        OPTS, ARGS = getopt.getopt(sys.argv[1:], "i:k:", ["input=", "keyword="])
    except getopt.GetoptError as err:
        print err
        sys.exit(2)

    for o, a in OPTS:
        if o in ("-i", "--input"):
            INPUT = a
        elif o in ("-k", "--keyword"):
            KEYWORD = a
        else:
            print "Error : -i for input, -k for keyword"

    CRAWLER = WebCrawler(URL, None, None, None, INPUT, KEYWORD)
    CRAWLER.load()

    print("Dictionary : ", CRAWLER.dictionary)

    print "Save the crawling ? (o/[n]) : "
    if raw_input() == 'o':
        print "Folder : "
        CRAWLER.save()

    print "Load the crawling ? (o/[n]) : "
    if raw_input() == 'o':
        print "Folder : "
        CRAWLER.load()
Example #18
def main():
    web_crawler = WebCrawler('Google', 'http://www.google.com')
    web_crawler.run()
Example #19
from WebCrawler import WebCrawler

# Press the green button in the gutter to run the script.
if __name__ == '__main__':

    landmark = '白馬塔'
    searchEngine = 'Bing'

    if searchEngine == '百度':
        # Baidu
        url = 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=' \
              + landmark
        xpath = '//div[@id="imgid"]/div/ul/li/div/a/img'
    elif searchEngine == '搜狐':
        # Sohu
        url = 'https://pic.sogou.com/pics?query=' + landmark + '&di=2&_asf=pic.sogou.com&w=05009900'
        xpath = '//div[@class="figure-result"]/ul/li/div/a/img'
    elif searchEngine == 'Google':
        # Google
        url = 'https://www.google.com.tw/search?q=' + landmark + '&tbm=isch&hl=zh-TW&tbs&sa=X&ved=0CAEQpwVqFwoTCKj3nbCZm-0CFQAAAAAdAAAAABAC&biw=1279&bih=977'
        xpath = '//img[@class="rg_i Q4LuWd"]'
    else:
        # Bing
        url = 'https://www.bing.com/images/search?q=' + landmark + '&form=HDRSC2&first=1&tsc=ImageBasicHover&scenario=ImageBasicHover'
        xpath = '//*[@id="mmComponent_images_2"]/ul/li/div/div/a/div/img'

    var = WebCrawler(url, xpath, landmark)
class TestWebCrawler(unittest.TestCase):
	def setUp(self):
		self.spider = WebCrawler("robot", "http://en.wikipedia.org/wiki/Robots_exclusion_standard")

	def tearDown(self):
		del self.spider

	def testInstantiateKeyWord(self):
		self.assertEquals(self.spider.keyWord, "robot")

	def testInstantiateWebAddress(self):
		self.assertEquals(self.spider.currentWebAddress, "http://en.wikipedia.org/wiki/Robots_exclusion_standard")

	def testGetCurrentPage(self):
		self.spider.getCurrentPage()
		self.assertEquals(self.spider.currentPage.getcode(), 200)

	def testFindKeyWord(self):
		self.spider.getCurrentPage()
		self.assertEquals(self.spider.findKeyword(), True)

	def testParseLinks(self):
		self.spider.getCurrentPage()
		self.spider.findKeyword()
		self.assertEquals(len(self.spider.links), 98)

	def testGetNextWebpage(self):
		self.spider.getCurrentPage()
		self.spider.findKeyword()
		self.spider.nextPage()
		self.assertEquals(self.spider.currentWebAddress, "http://en.wikipedia.org/wiki/MediaWiki:Robots.txt")
		self.assertEquals(len(self.spider.links), 97)
		self.spider.getCurrentPage()
		self.assertEquals(self.spider.currentPage.getcode(), 200)

	def testParseInternetArchive(self):
		self.spider = WebCrawler("robot", "http://en.wikipedia.org/wiki/internet_archive")
		self.spider.getCurrentPage()
		self.spider.findKeyword()

	def testFollowRobotDotTxt(self):
		testLinks = list()
		testLinks.append("/wiki/Special:Search")
		testLinks.append("/wiki/computers")
		self.spider.links.extend(testLinks)
		self.spider.nextPage()
		self.assertEquals(self.spider.currentWebAddress, "http://en.wikipedia.org/wiki/computers")

	def testDontParseDuplicatePage(self):
		testLinks = list()
		testLinks.append("/wiki/computers")
		testLinks.append("/wiki/computers")
		testLinks.append("/wiki/computers_hard_drives")
		self.spider.links.extend(testLinks)
		self.spider.nextPage()
		self.spider.nextPage()
		self.assertEquals(self.spider.currentWebAddress, "http://en.wikipedia.org/wiki/computers_hard_drives")

	def testParseTheUnready(self): 
		testLinks = list()
		testLinks.append("/wiki/%C3%86thelred_the_Unready")
		self.spider.links.extend(testLinks)
		self.spider.nextPage()
		self.assertEquals(self.spider.currentWebAddress, "http://en.wikipedia.org/wiki/%C3%86thelred_the_Unready")
    def getStockData(self, baseURL, endpoint, ticker, credentials, item):
        sequence = (baseURL, ticker, '&page=', endpoint, '&range=24')
        url = ''.join(map(str, sequence))
        
        wc = WebCrawler()
        wc.setDriverPath('chromedriver')
        wc.createDriver()
        wc.briefingLogin([credentials[0], credentials[1]])

        if wc.connectToURL(url):
            sleep(2)
            html = wc.getDriver().page_source
        else:
            raise Exception('Unable to connect to Briefing.com')
        
        wc.briefingLogout()
        wc.killDriver()
        
        return html
Example #24
from lxml import html
from WebCrawler import WebCrawler
import os

# This URL should point to the root directory page of the desired data dump.
DUMP_URL = 'https://wikileaks.org/ciav7p1/cms/index.html'
URL_PREFIX = '/'.join(DUMP_URL.split('/')[:-1])
header = {'User-Agent': 'web-crawler'}

wc = WebCrawler(header, 10)
tree = wc.requestHTML(DUMP_URL)

fileCategories = tree.xpath('//div[@id="uniquer"]//h3/text()')
fileURLs = tree.xpath('//div[@id="uniquer"]/ul//table//td//div//a/@href')
fileTitles = tree.xpath('//div[@id="uniquer"]/ul//table//td//div//a/text()')

# Format file titles so they can be file names.
for i in range(0, len(fileTitles)):
    fileTitles[i] = fileTitles[i].replace('/', '-')
    fileTitles[i] = fileTitles[i].replace('"', '')
    fileTitles[i] = fileTitles[i].replace('\'', '')

skipExistingData = input(
    'Would you like to skip scraping files that already exist in output?(y/n)')

if skipExistingData == 'y':
    # Check to see if any of the files found on dump page have already been
    # scraped.
    pEF = os.listdir('output/')  # pEF = preExistingFiles

    for i in range(0, len(fileTitles)):