def scrapeURLs():

    # convert webpage to soup object
    r = Render('http://stats.nba.com/teams/')
    result = str(r.frame.toHtml().toAscii())
    del r
    soup = BeautifulSoup(result, 'lxml')
    del result

    # identify links to each team's stats and game logs
    team_index = soup.find_all('div', {'class': 'team-block__links'})
    soup.decompose()

    game_log_URLs = []
    stats_URLs = []

    # record urls for each team and each game log
    for team_num in range(0, len(team_index)):
        stats_URLs.append('http://stats.nba.com' +
                          team_index[team_num].contents[3]['href'])
        game_log_URLs.append('http://stats.nba.com' +
                             team_index[team_num].contents[5]['href'])

    # pickle urls
    pickle.dump(game_log_URLs, open('../../Data/gamelogURLs.pickle', 'wb'))
    pickle.dump(stats_URLs, open('../../Data/statsURLs.pickle', 'wb'))
Example #2
    def get_url_by_sid(self, sids):
        """
        Accepts a string of song IDs.
        :param sids: string of song IDs
        :return: the real download URLs of the songs
        """
        search_url = 'http://play.baidu.com/data/music/songlink?songIds=%s' % sids
        buffers = StringIO()
        curl = pycurl.Curl()
        curl.setopt(pycurl.URL, search_url)
        curl.setopt(pycurl.USERAGENT, user_agent)
        curl.setopt(pycurl.WRITEDATA, buffers)
        curl.perform()
        curl.close()

        body = buffers.getvalue()
        soup = BeautifulSoup(body)

        song_lists = self.serialization.json_to_data(soup.text)['data']['songList']
        soup.decompose()
        urls = []
        for l in song_lists:
            link = l['songLink']
            if not link:
                url = 'zzz'
            else:
                url = pattern.sub('', link)
            urls.append(url)
        return urls
Example #3
def get_dlinks(source_url):
    """
    Scrape the video download links from a page URL.
    :param source_url: source URL
    :return: the real download links of the video
    """
    buffers = StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, source_url)
    curl.setopt(pycurl.USERAGENT, user_agent)
    curl.setopt(pycurl.REFERER, refer_path)
    curl.setopt(pycurl.WRITEDATA, buffers)
    curl.perform()
    curl.close()

    # get the response body as a str
    body = buffers.getvalue()
    soup = BeautifulSoup(body)

    # grab the target div
    content = soup.findAll('div', {'id': 'hi_addtab_1'})[1]
    soup.decompose()

    # collect the download links
    result = []
    tables = content.findAll('td', {'class': 'td_thunder'})
    for td in tables:
        d_link = td.find('a')
        href = d_link['href']
        target = d_link.text
        result.append((target, href))

    return result
Example #4
async def getWorkInfo(session: dict):
    allWorkInfo = []
    tasks = []

    s = requests.session()
    s.cookies.update(session)
    s.headers.update(ua)

    lock = asyncio.Lock()

    courseListURL = 'https://mooc2-ans.chaoxing.com/visit/courses/list?rss=1&start=0&size=500&catalogId=0&searchname='

    html = s.get(url=courseListURL).content.decode()
    htmlBS = BeautifulSoup(html, 'lxml')
    for singleCourse in htmlBS.find_all(class_='course-info'):  # the link points to the old version, but course and teacher names are easier to parse there
        tasks.append(parseOneCourse(singleCourse, s, allWorkInfo, lock))
    htmlBS.decompose()

    print(tasks)

    await asyncio.wait(tasks)

    print("After every Task")
    print(allWorkInfo)

    return allWorkInfo
Example #5
def get_total_pages(url):
    try:
        print 'Fetching the total number of pages for link: ' + url
        try:
            req = get(url)
        except Exception as er:
            print 'error: ' + str(er)
            time.sleep(5)
            return get_total_pages(url)

        soup = BeautifulSoup(req.text, 'html.parser')
        ultimo = soup.select('#pagination-flickr > li')[-1].getText() # grab the last element
        req.close()

        if ultimo == "Next": # if the last element is "Next", grab the second-to-last and crawl again
            possivel_ultimo = soup.select('#pagination-flickr > li')[-2].getText()
            soup.decompose()
            if 'page' in url:
                return int(get_total_pages(mother_of_urls.format(penultimo='page', ultimo=possivel_ultimo)))
            else:
                return int(get_total_pages(adjust_url(url, possivel_ultimo)))
        else:
            return int(ultimo)
    except Exception as e:
        print 'Exception: ' + str(e)
        time.sleep(5)
        return get_total_pages(url)
Example #6
def getPageTitle(node_url, s):
    """
    Retrieve the title (from html) of a IFB node page.

    @param node_url String The url of the node
    @param s requests.Session Object corresponding to the connection session

    @return String The title of the web page
    """
    while True:
        try:
            # An authorised request.
            r = s.get(node_url, timeout=TIMEOUT)
            # print(s.get(url).status_code)
            # etc...
            break
        except requests.exceptions.ConnectionError as e:
            # print(str(e) + "\nRetrying...")
            time.sleep(10)
        except requests.exceptions.ReadTimeout as e:
            # print(str(e) + "\nRetrying...")
            time.sleep(10)

    soup = BeautifulSoup(r.text, 'html.parser')
    title = str(soup.title.string)
    soup.decompose()
    return title
Example #7
def download_imgs():
    page_urls = get_page_urls()
    download_count = 0
    for page_url in page_urls:
        source = requests.get(page_url).text
        soup = BeautifulSoup(source, 'lxml')

        flower_name = soup.find('h1').contents[0]
        if flower_name.endswith(' '):
            flower_name = flower_name[:-1]

        aTags = soup.find_all('a', {'data-fancybox': 'gallery'})
        for i, a in enumerate(aTags):
            file_name = f"{flower_name}-{i+1}.jpg"
            img_url = a['href']
            img_request = requests.get(img_url)

            if not os.path.exists('./img'):
                os.mkdir('./img')
            with open(f'./img/{file_name}', 'wb') as file:
                file.write(img_request.content)

            download_count += 1
            print(f"#{download_count} - Downloading {img_url}")

        soup.decompose()
    print(f"{download_count} images downloaded")
Example #8
def exciteTranlate01(string00, loop_time=0):
    dataBefore = {'before': string00}
    tranlationUrl = 'http://www.excite.co.jp/world/chinese/'
    # initialise these so the references below cannot raise NameError if the request fails
    result00 = None
    tranlationHtml = None

    sec01 = random.randrange(5, 15)
    print('Waiting ' + str(sec01) + ' seconds')
    for time_sec in range(sec01, 0, -1):
        time.sleep(1)
        print('Countdown: ' + str(time_sec) + ' seconds')
    print(str(time_sec) + ' second countdown finished')
    print('Sending text for translation')

    try:
        tranlationResponse = requests.post(tranlationUrl, data=dataBefore).content
        print('Received translated data')
        tranlationHtml = BeautifulSoup(tranlationResponse, 'html.parser')
        result00 = tranlationHtml.find(id="after").string
    except:
        if loop_time == 10:
            print('Translation failed')
        else:
            print('Retrying after request error, attempt:', loop_time)
            result00 = exciteTranlate01(string00, loop_time + 1)
    if result00 is None:
        print('Retrying after empty result, attempt:', loop_time)
        result00 = exciteTranlate01(string00, loop_time + 1)
    elif loop_time == 10:
        print('Translation failed')

    if tranlationHtml is not None:
        tranlationHtml.decompose()
    print('Partial translation result:')
    print(result00)
    return result00
Example #9
    def scrape_google_docs_html(self, text: str):
        strainer = SoupStrainer(property=["og:title", "og:description"])
        soup = BeautifulSoup(text, "lxml", parse_only=strainer)  # Use lxml parser to speed things up

        if soup is None:
            return None

        meta_tags = soup.find_all("meta")

        if not meta_tags:
            return None

        try:
            title = meta_tags[0]['content']
            description = meta_tags[1]['content']
        except (IndexError, KeyError):
            return None

        if title.endswith(' - Google Docs'):
            title = title[:-14]

        if description.endswith('...'):
            description = description[:-3]

        soup.decompose()  # Garbage collection
        return {'title': title, 'description': description}
Example #10
 def get_link_from_iframe(self, html_parser, default_value, iframe_id, class_name):
     if html_parser.select(iframe_id):
         iframe_response = requests.get(html_parser.select(iframe_id)[0].attrs['src'])
         iframe_parser = BeautifulSoup(iframe_response.content, "html.parser")
         link = iframe_parser.select(class_name)[0].get('href')
         print(link)
         iframe_parser.decompose()
Example #11
def worker():
    global linkPool

    while terminator.is_alive():
        HTML = None
        page = None
        newTask = PagesToCrawl.get()  # Get next work task.
        node = createNodeFromTuple(newTask)  # Create new node.
        HTML = openURLAsHTML(node)  # Open URL.
        if node['dead'] == 0:  # If URL is live...
            page = BeautifulSoup(HTML.read().decode('utf-8', 'ignore'),
                                 "lxml")  # ...scrape data.
            scrapeNodeData(node, page)

        # If the page is live and gave a valid text/html response, it should be an option for the next parent.
        if node['dead'] == 0 and HTML.info().get_content_type() == "text/html":
            with pool_lock:
                linkPool.append(copy.deepcopy(newTask))

        # Garbage Collection
        if HTML != None: HTML.close()
        if page != None: page.decompose()

        # With lock in place, append result to temporary data set.
        with tier_lock:
            tierResults.append(copy.deepcopy(node))

        PagesToCrawl.task_done()  # Tell manager that task is complete.
    sys.exit()
Example #12
def get_roster_advanced_stats(team, season_end_year):
    r = get(
        f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2Fteams%2F{team}%2F{season_end_year}.html&div=div_advanced'
    )
    new_df = None
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, 'html.parser')
        table = soup.find('table')
        df = pd.read_html(str(table))[0]
        soup.decompose()
        columns = [
            'Rk', 'Unnamed: 1', 'Age', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr',
            'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
            'Unnamed: 17', 'OWS', 'DWS', 'WS', 'WS/48', 'Unnamed: 22', 'OBPM',
            'DBPM', 'BPM', 'VORP'
        ]
        empty = ['' for _ in range(len(df))]
        new_df = pd.DataFrame(columns=columns)
        for i in range(len(columns)):
            if columns[i] in df.columns:
                new_df[columns[i]] = df[columns[i]]
            else:
                new_df[columns[i]] = empty
        new_df['YEAR'] = [
            str(season_end_year - 1) + "-" + str(season_end_year)[-2:]
            for _ in range(len(df))
        ]
    return new_df
Example #13
def get_roster(team, season_end_year):
    r = get(
        f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2Fteams%2F{team}%2F{season_end_year}.html&div=div_roster'
    )
    df = None
    if r.status_code == 200:
        encoding = r.encoding if 'charset' in r.headers.get(
            'content-type', '').lower() else None
        soup = BeautifulSoup(r.content, 'html.parser', from_encoding=encoding)
        print("Finding table for: " + str(season_end_year) + " " + str(team))
        table = soup.find('table')
        print("Reading html")
        df = pd.read_html(str(table))[0]
        print("Done")
        soup.decompose()
        print("Transforming df")
        df.columns = [
            'NUMBER', 'PLAYER', 'POS', 'HEIGHT', 'WEIGHT', 'BIRTH_DATE',
            'NATIONALITY', 'EXPERIENCE', 'COLLEGE'
        ]
        df['BIRTH_DATE'] = df['BIRTH_DATE'].apply(lambda x: pd.to_datetime(x))
        df['NATIONALITY'] = df['NATIONALITY'].apply(
            lambda x: x.upper() if type(x) == str else "N/A")
        df['YEAR'] = [
            str(season_end_year - 1) + "-" + str(season_end_year)[-2:]
            for _ in range(len(df))
        ]
        print("Done")
    return df
Example #14
def scrape_data_yahoo(url):
    '''
    scrape data from Yahoo finance
    '''
    data = []
    try:
        conn = urlopen(url)
        soup = BeautifulSoup(conn, "html.parser")
        locate_tag = soup.find("div", {"id": "Main"})
        table_tag = locate_tag.find_next("table", {"data-test": "historical-prices"})
        tbody_tag = table_tag.find_next("tbody")
        # scrape the data
        for row in tbody_tag.find_all('tr'):
            col = row.find_all('td')
            # the first is date, the other are numbers
            data.append([col[0].find_next('span').text]+
                        [float(td.find_next('span').text.replace(',','')) for td in col[1:]])
        return to_pandas_data_frame(np.array(data))
    finally:
        # properly release the resources
        try:
            if soup:
                soup.decompose()
            if conn:
                conn.close()
        except NameError:
            pass
Example #15
def main():
	global args, celsius
	# Default parameter values
	args={'dia' : 0,'min': False, 'max' : False, 'now' : False, 'temp' : True, 'como' : True}

	# Validate the received arguments and configure variables
	parseArgs()

	## HTML parsing variables
	# File where the AccuWeather page is saved temporarily
	html_doc = "/tmp/clima.html"
	# IDs of the important divs
	tiempodiv_id = "feed-tabs"
	gradosdiv_id = "bt-menu-settings"
	div_hoy_id = "detail-now"
	div_hoy_mM_id = "feature-history"

	## Logic
	# Fetch the AccuWeather web page for Santiago
	if DEBUG: print "Fetching data from AccuWeather..."
	with open(os.devnull, "wb") as devnull:
		subprocess.check_call(["wget","http://www.accuweather.com/<AJUSTAR PARA ZONA DESEADA>", "-O",html_doc], stdout=devnull, stderr=subprocess.STDOUT)
	# Extract the section of interest
	soup = BS(open(html_doc))
	# Check the measurement unit
	celsius = inCelsius(soup,gradosdiv_id)
	# Store the temperatures and conditions for the 5 days
	forecast = parseForecast(soup,tiempodiv_id,div_hoy_id,div_hoy_mM_id)
	soup.decompose()
	# print the forecast result according to the supplied arguments
	print getForecast(forecast)
	with open(os.devnull, "wb") as devnull:
		subprocess.check_call(["rm",html_doc], stdout=devnull, stderr=subprocess.STDOUT)
Example #16
def makeappdx(page):
    srd = SoupStrainer('div', id='container')
    div = BeautifulSoup(page, parse_only=srd).div
    nav = div.find('div', id='navigation')
    nav.decompose()
    title = div.center.get_text(strip=True)
    div.center.decompose()
    font = div.find_all('font', size='2', color=None)
    for f in font:
        f.unwrap()
    for p in div.find_all('p'):
        p['class'] = 'ZFY'
        p.name = 'div'
    blank = div.find('div', class_='blank')
    if blank:
        blank.decompose()
    ft = div.find('div', id='footer')
    if ft:
        ft.decompose()
    div.attrs.clear()
    div['class'] = 'oH1'
    formatcontent(div)
    text = cleansp(div.encode('iso-8859-1'))
    div.decompose()
    return ''.join(['<div class="xsv">', title, '</div>', text])
Example #17
def formatabbr(page):
    srd = SoupStrainer('div', id='container')
    div = BeautifulSoup(page, parse_only=srd).div
    nav = div.find('div', id='navigation')
    nav.decompose()
    tbl = div.find('table')
    tbl.name = 'div'
    tbl.attrs.clear()
    tbl['class'] = 'oH1'
    tdr = div.find_all(name=re.compile(r't[dr]', re.I))
    for t in tdr:
        t.unwrap()
    for p in div.find_all('p'):
        p['class'] = 'ZFY'
        p.name = 'div'
    blank = div.find('div', class_='blank')
    if blank:
        blank.decompose()
    ft = div.find('div', id='footer')
    if ft:
        ft.decompose()
    formatcontent(div)
    div.attrs.clear()
    div['class'] = 'RmY'
    text = cleansp(div.encode('iso-8859-1'))
    div.decompose()
    return ''.join(['<link rel="stylesheet"href="ety.css"type="text/css">', text])
Example #18
    def load_gamelog_stats(self, year):
        # Check if it's already cached on disk.
        if not os.path.isfile(self.get_filename_path(year)):
            content = urllib2.urlopen(self.get_game_log_url(year)).read()
            with open(self.get_filename_path(year), 'w') as f:
                f.write(content)

        # Drink the soup.
        soup = BeautifulSoup(open(self.get_filename_path(year)), 'html.parser')

        game_log = []

        for table in soup.find_all('table'):
            # Only load regular season stats.
            if table.get('id', '') == 'pgl_basic':
                for game in table.find_all('tr'):
                    game_data = {}
                    for stat in game.find_all('td'):
                        data_stat = stat.get('data-stat', None)
                        if data_stat == self.DATE_STAT:
                            game_data[data_stat] = datetime.strptime(
                                stat.get_text(), self.DATE_FORMAT)
                        elif data_stat and data_stat in self.STATS_TO_COLLECT:
                            game_data[data_stat] = int(stat.get_text())

                    if game_data:
                        game_log.append(game_data)

                # Break out once we've found and parsed the regular season stats.
                break

        soup.decompose()
        self.stats[year] = pd.DataFrame(game_log).set_index(self.DATE_STAT)
Example #19
    def parse_product(self, response):
        soup = BeautifulSoup(response.body, 'lxml')

        p = Product()

        for element, path in self.selectors.viewitems():
            node = soup.select_one(path)

            if not node:
                continue
            if element == 'image':
                p[element] = url_fix(urljoin(response.url, node['src']))
            else:
                p[element] = text(node)

        if 'name' in p and 'number' in p:
            p['url'] = response.url
            p['pricing'], p['discountcode'] = get_prices(soup)
            soup.decompose()
            yield p
        else:
            # Only follow links on non-product pages
            soup.decompose()
            for link in self.link_extractor.extract_links(response):
                yield Request(url=link.url)
Example #20
    def get_all_links(self, link):
        global SCANNED_LINKS
        global n_requests
        global n_requests_lock

        try:
            req = self.req_ses.get(link)
            with n_requests_lock:
                n_requests += 1
                if not self.silent:
                    print("[%s] T-ID: %s scanning -> %s" %
                          (str(len(SCANNED_LINKS)), str(self.threadID), link))
        except requests.exceptions.MissingSchema:
            #print('invalid url %s' % link)
            return None

        html_soup = BeautifulSoup(req.text, 'html.parser')

        links = [i.get("href") for i in html_soup.find_all('a')]
        links = [
            e for e in links
            if e not in SCANNED_LINKS and e is not None and len(e) > 5
        ]
        # file in list? search it and remove from crawling list (if present)
        html_soup.decompose()  #THIS DID THE TRICK, NO MORE RAM WASTED!

        # add the schema and base URL to links like "/something"
        for i in range(len(links)):
            if links[i][0] == "/":
                links[i] = self.URL + links[i]

        return links
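The all-caps comment in the example above captures why every snippet on this page ends with decompose(): BeautifulSoup keeps the whole parse tree alive, so a long-running crawler grows in memory unless each tree is torn down after the needed values have been copied out. A minimal sketch of that pattern, assuming the requests library and a made-up URL list (not taken from any example above):

import requests
from bs4 import BeautifulSoup

def collect_titles(urls):
    """Parse each page, copy out its title, then free the tree (illustrative sketch only)."""
    titles = []
    for url in urls:  # urls is a hypothetical list of page addresses
        resp = requests.get(url, timeout=10)
        soup = BeautifulSoup(resp.text, 'html.parser')
        if soup.title and soup.title.string:
            titles.append(str(soup.title.string))  # copy the text out before destroying the tree
        soup.decompose()  # destroy the parse tree so it can be garbage collected
    return titles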
Example #21
def run(dates):
    logging.debug(f'Got {len(dates)} dates')

    for date in dates:
        logging.info(f"Starting to scrape matches {url}/en/results/{date}")

        page = requests.get(url + "/en/results/" + date, timeout=TIMEOUT)

        bs = BeautifulSoup(page.text, 'html.parser')
        match_links = add_lineup_to_match_link(
            get_match_links_for_league(bs, league_id))
        bs.decompose()

        valid_links = []
        for match_url, match_id in match_links:
            if match_id not in match_ids:
                valid_links.append([match_url, match_id])

        results = pool.map(get_lineup_for_match, valid_links)

        for data_obj in results:
            data_obj["date"] = date

        with open(f'lineups/data_{date}.json', 'w') as outfile:
            json.dump(results, outfile)

        for _, match_id in valid_links:
            match_ids.append(match_id)

    pool.close()
Example #23
    def findcompanyregno(self):
        soup = None
        try:
            for l in self.sitelinks:
                r = requests.get(l, {"User-Agent": ua.random})
                soup = BeautifulSoup(r.text, 'html.parser')
                pagetext = soup.findAll(text=True)

                output = ''
                for t in pagetext:
                    if t.parent.name not in blacklist:
                        output += '{} '.format(t)

                crns = re.findall(crn, output)
                crn_list = []
                if crns:
                    crn_list.extend(crns)
                for i in crn_list:
                    if i not in self.crns:
                        self.crns.append(i)
        except Exception as e:
            print("siteclassifier.findcompanyregno : " + str(e))
        finally:
            if soup:
                soup.decompose()
        return
Example #25
 def run(self) -> Counter:
     if self.use_lxml:
         elem_iter = lxml.etree.iterparse(self.xml_file, ["start", "end"],
                                          load_dtd=True)
     else:
         elem_iter = ET.iterparse(self.xml_file, ["start", "end"])
     root = None
     for (event, element) in elem_iter:
         if root is None and event == "start":  # remember the root element once so it can be cleared later
             root = element
             continue
         if not (element.tag in self.record_tags and event == "end"):
             continue
         if self.use_lxml:
             soup = BeautifulSoup(lxml.etree.tostring(element), "xml")
         else:
             soup = BeautifulSoup(ET.tostring(element), "xml")
         for record in soup.find_all():
             if record.name not in self.record_tags:
                 continue
             self.importer.push_record(record)
             record.decompose()
         soup.decompose()
         element.clear()
         if root is not None:
             root.clear()
     counts = self.importer.finish()
     print(counts, file=sys.stderr)
     return counts
Example #26
 def generate_rss_item(self, url):
     """
     :param url:
     :return:
     """
     print('fetch: %s' % url)
     html = self.request_html(url)
     if not html:
         self.url = None
         return None
     soup = BeautifulSoup(html, 'html.parser')
     title = soup.find('title').text
     div = soup.find('div', {'class': 'entry-location'})
     if not div:
         soup.decompose()
         self.url = None
         return None
     self.url = div.find('a')['href']
     soup.decompose()
     rss_item = PyRSS2Gen.RSSItem(
         title=title,
         link=url,
         description=title,
         pubDate=datetime.datetime.now()
     )
     return rss_item
Example #27
def getPret_BD(url):
    content = urlopen(url).read()

    # fisier = open(url, "r", encoding = "utf8")
    # content = fisier.read()

    soup = BeautifulSoup(content, "html.parser")
    #soup = BeautifulSoup(url, "html.parser")

    pret_oferta = soup.find('span', {'class': 'price'})
    pret_vechi = soup.find('span', {'class': 'retail-value'})
    titlu = soup.find('div', {'class': 'title'})

    soup.decompose()
    #fisier.close()

    #for preturi in spans
    print("Titlu: " + titlu.text)
    print("Pret Oferta: " + pret_oferta.text)
    print("Pret Vechi: " + pret_vechi.text)

    pret_ofer_filt = re.match('[0-9]+', pret_oferta.text)
    pret_vech_filt = re.match('[0-9]+', pret_vechi.text)

    return titlu.text, str(pret_ofer_filt.group(0)), str(
        pret_vech_filt.group(0))
Example #28
def addAllFollowers(start_username: str, depth: int):
	if depth > MAX_DEPTH:
		return
	link = getFollowersLink(start_username)
	hasNext = True
	page = 1
	
	while hasNext:

		r = urllib.request.urlopen(link + str(page) + '/').read()
		soup = BeautifulSoup(r, "html.parser")
		person_summary = soup.findAll('div', class_='person-summary')
		
		page += 1
		
		for person in person_summary:
			username_html = str(person.find('a', class_='name'))
			username = re.search('(?<=href="/).*(?=/">)', username_html).group(0)
			username_html = None
			gc.collect()
			users.add(username)
			# use a regex to find a string that starts with href="/ (dont capture it) 
			# and ends with \"> (also dont capture)
			# group(0) returns the first matched object

			if depth < MAX_DEPTH:
				addAllFollowers(username, depth+1)
		if soup.find('a', class_='next') is None:
			hasNext = False

		for a in person_summary:
			a.decompose()
		soup.decompose()
		gc.collect()
	print("done gathering followers: ", start_username)
Example #29
def write(input, output, wordlist):
    fp = open(input, 'r', encoding="utf-8")
    soup = BeautifulSoup(fp, "html.parser")
    for div in soup.body.find_all('div', recursive=False):
        # print(div.a.text)
        hasword = False
        # print('\n')
        # print('\n')
        for word in wordlist:
            if word in str(div.a.text):
                hasword = True
                break
        if not hasword:
            div.decompose()

            # except Exception as e:
            #     print(e)
            #     print(div)

    fw1 = open(output, 'w', encoding="utf-8")
    fw1.write(soup.prettify())
    fw1.close()
    fp.close()

    soup.decompose()
    gc.collect()
Example #30
    def SpiderPageWeb(self, start):
        pageURL = self.GetPageURL(start)
        headers = Util.GetHeaders(Host=self.HostURL, Referer=self.GetRefererURL(start))
        req = Util.RequestURLByGet(pageURL, headers, {})
        if not req is None:
            soup = BeautifulSoup(req.text, 'lxml')
            elements = soup.findAll(href=self.re_pattern_bloglink)
            coroutines = []
            for index, ele in enumerate(elements):
                blogLink = ele.get('href')
                blogTitle = ele.get('title')
                msg = '---------- SpiderBlogWeb:' + ('[%d/%d]' % (index+1,len(elements))) + '['+blogTitle+']'
                coroutines.append(gevent.spawn(self.SpiderBlogWeb,blogLink,pageURL,msg))
                #self.SpiderBlogWeb(blogLink, pageURL)
            gevent.joinall(coroutines)
            soup.decompose()
            
            if len(elements) <= 0:
                print '!!!!!!!!!! No blog link found !!!!!!!!!!'
                print req.text
            
#             self.filePage.write(pageURL + '\n')
#             self.filePage.flush()
        else:
            print '!!!!!!!!!! SpiderPageWeb Request is None !!!!!!!!!!'
Example #31
 def extract(self):
     self.assets = None
     try:
         with self.fsal.open(self.path, 'r') as html_file:
             dom = BeautifulSoup(html_file, self.PARSER)
     except Exception:
         msg = (u"Metadata extraction failed, error opening: "
                u"{}".format(self.path))
         logging.exception(msg)
         raise self.MetadataError(msg)
     else:
         data = {}
         for meta in dom.find_all('meta'):
             if all(key in meta.attrs for key in ('name', 'content')):
                 key = meta.attrs['name']
                 value = meta.attrs['content']
                 data[key] = value
             # Old style html files may have the language set via
             # <meta http-equiv="content-language">
             pragma = meta.get('http-equiv', '').lower()
             if pragma == 'content-language':
                 data['language'] = meta.get('content')
         if dom.html:
             lang = dom.html.get('lang') or data.get('language', '')
             data['language'] = lang
         if dom.title:
             data['title'] = dom.title.string
         # assets are not directly part of the metadata, but are needed
         # to be accessed from within the processor, so it's kept as an
         # instance attribute only
         self.assets = self.extract_asset_paths(dom)
         dom.decompose()
         return data
Example #32
 def tokener(xmldata):
     import corpkit
     """print word, using good lemmatisation"""
     from bs4 import BeautifulSoup, SoupStrainer
     import gc
     open_classes = ['N', 'V', 'R', 'J']
     result = []
     just_good_deps = SoupStrainer('tokens')
     soup = BeautifulSoup(xmldata, parse_only=just_good_deps)   
     for token in soup.find_all('token'):
         word = token.word.text
         query = re.compile(r'.*')
         if re.search(query, word):
             if lemmatise:
                 word = token.lemma.text
                 if just_content_words:
                     if not token.pos.text[0] in open_classes:
                         continue        
             result.append(word)
     # attempt to stop memory problems. 
     # not sure if this helps, though:
     soup.decompose()
     soup = None
     data = None
     gc.collect()
     return result
Example #33
def get_news_articles():
    url = "https://hk.news.appledaily.com/realtime/realtimelist/all?page=local"
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    r.encoding = "utf-8"
    only_div = SoupStrainer("div", {"class": "text"})
    soup = BeautifulSoup(r.content,
                         features="html.parser",
                         parse_only=only_div)
    elements = list(soup.find_all("div", {"class": "text"}))
    soup.decompose()
    links = []
    for element in elements:
        a = element.find('a')
        href = a['href']
        links.append(href)
    links = links
    pool = multiprocessing.Pool(2, maxtasksperchild=1)
    result = pool.map_async(retrieve_url, links).get()
    pool.close()
    pool.join()
    result = [
        l for l in result if "/local/" in l or "/international/" in l
        or "/china/" in l or "/breaking/" in l
    ]
    return result
Example #34
def parse_repository(username, url, filename):
    user_cache = CACHE.get(username)
    if not user_cache:
        user_cache = Set()
        CACHE[username] = user_cache
    if url in user_cache:
        return []
    user_cache.add(url)
    url = GITHUB_URL + url
    text = read_page(url)
    soup = BeautifulSoup(text)
    commits = soup.find_all(class_='gobutton')
    urls = []
    for a in commits:
        urls.append(a['href'])
    soup.decompose()
    results = []
    for url in urls:
        data = parse_commit(username, url)
        results.append(data)
    with contextlib.closing(open(filename, 'ab')) as csvfile:
        writer = csv.writer(csvfile,
                            delimiter='\t',
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)
        for commit in results:
            writer.writerow(commit)
        csvfile.close()
Example #35
def main(max_page):
    for i in range(1, max_page):
        conn = sqlite3.connect("../dogDrip.db")
        cur = conn.cursor()
        address = cur.execute(
            "select address from dogDrip where id={ID}".format(
                ID=i)).fetchall()[0][0]
        conn.close()
        print(i)
        url = address
        request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            global bs_obj
            global html
            global title
            dog_drip_html = urlopen(request)
            bs_obj = BeautifulSoup(dog_drip_html.read(), "html.parser")
            html = str(bs_obj.find_all('div', class_="ed clearfix margin-vertical-large")[0])\
                .replace('href="', 'href="https://www.dogdrip.net')\
                .replace('src="', 'src="https://www.dogdrip.net')\
                .replace("'", '"')
            title = str(bs_obj.find_all('h4')[0])\
                .replace("'", '"')
            conn = sqlite3.connect("../dogDrip.db")
            cur = conn.cursor()
            cur.execute(
                """update dogDrip set HTTP = '{HTTP}' where id={ID}""".format(
                    HTTP='%s %s' % (title, html), ID=i))
            bs_obj.decompose()
            dog_drip_html.close()
            conn.commit()
            conn.close()
        except urllib.error.HTTPError:
            print('Oops! The post is deleted!')
            conn.close()
Example #36
def remove_invalid_xml_chars2(html_string):
    soup = BeautifulSoup(html_string, 'html5lib')
    text = soup.get_text()
    soup.decompose()
    return re.sub(
        u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]', '',
        text)
Example #37
def getPret_B24(url):
    content = urlopen(url).read()

    # fisier = open(url, "r", encoding = "utf8")
    # content = fisier.read()

    soup = BeautifulSoup(content, "html.parser")

    pret_oferta = soup.find('span', {'class': 'text-value js-price-value'})
    pret_vechi = soup.find('span', {'class': 'text-rrp js-text-rrp'})
    titlu = soup.find('h1', {'class': 'col-md-14 col-lg-14'})

    soup.decompose()
    #fisier.close()

    #for preturi in spans
    print("Titlu: " + titlu.text)
    print("Pret Oferta: " + pret_oferta.text)
    print("Pret Vechi: " + pret_vechi.text)

    pret_ofer_filt = re.match('[0-9]{3}', pret_oferta.text)
    pret_vech_filt = re.match('.+([0-9]{3}).+', pret_vechi.text)

    return titlu.text, str(pret_ofer_filt.group(0)), str(
        pret_vech_filt.group(1))
Example #38
def remove_invalid_xml_chars(html_string):
    soup = BeautifulSoup(html_string, 'html5lib')
    text = soup.get_text()
    soup.decompose()
    # return re.sub(r'[\xE4C6\x00-\x1F\x7F-\x9F%&<>]+','', text)
    # https://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python
    return ''.join(c for c in text if _valid_xml_char_ordinal(c))
Example #39
def iterate_pages(url):
    try:
        total_pages = get_total_pages(url)
        url_splited_len = len(url.split('/'))
        print 'preparing to iterate over url ' + url
        for value in range(1, total_pages + 1):
            print 'page %i of %i' % (value, total_pages)
            # grab all the elements of page 'value'

            if 'page' in url:
                req = get(mother_of_urls.format(penultimo='page', ultimo=str(value)))
            else:
                req = get(adjust_url(url, str(value)))

            soup = BeautifulSoup(req.text, 'html.parser')
            print 'fetching the elements of url ' + req.url
            page_elements = soup.findAll('div', { 'class': 'mainBox' })
            iterate_page_elements(page_elements, req.url)
            soup.decompose()
            req.close()
        anime_list.clear_list()
        anime_list.serialize_list()
    except Exception as e:
        print 'Exception: ' + str(e)
        time.sleep(5)
        iterate_pages(url)
Example #40
def get_wikipedia_links(input_text):
    """Gets en.wikipedia.org link in input_text. If it can't be found, returns []"""
    
    soup = BeautifulSoup(input_text, "lxml")
    
    fixed_urls = []
    urls = re.findall(r'(https?://[^\s]+)', input_text)
    
    for url in soup.findAll('a'):
        try:
            fixed_urls.append(url['href'])
        except Exception:
            pass
    
    """Deletes duplicates"""
    done_urls = []
    for i in fixed_urls:
        if i not in done_urls:
            done_urls.append(i)
            
    """Deletes urls that contain a file extension"""
    fixed_urls = []
    for url in done_urls:
        # keep the url only if it contains none of the media extensions
        if not any(extension.lower() in url.lower() for extension in media_extensions):
            fixed_urls.append(url)
                
    soup.decompose()

    return fixed_urls
Example #41
def process_user(username, fullname):
    filename = 'github/{}.csv'.format(username)
    filename_tmp = '{}.tmp'.format(filename)
    with open(filename_tmp, 'a'):
        os.utime(filename_tmp, None)
    uri_param = httplib2.iri2uri(fullname.replace(' ', '+'))
    url = u'{}/search?q={}&type=Users'.format(GITHUB_URL, uri_param)
    text = read_page(url)
    soup = BeautifulSoup(text)
    user_info = soup.find(class_='user-list-info')
    if not user_info:
        os.rename(filename_tmp, filename)
        soup.decompose()
        return
    a = user_info.find('a')
    github_username = a['href'][1:]
    with open(filename_tmp, 'w') as f:
        f.write(github_username + '\n')
        f.close()
    print "link stackoverflow '{}' to github '{}'".format(
        username, github_username)
    soup.decompose()
    commits = process_days(github_username, filename_tmp)
    os.rename(filename_tmp, filename)
    if github_username in CACHE:
        del CACHE[github_username]
Example #42
    def feed(self, data):
        # truncate first to save memory
        self.dom.truncate(0)
        # for python3 compatibility
        self.dom.seek(0)
        soup = BeautifulSoup(data, 'html5lib')

        # since soup() is not a generator, it should be fine to iterate and
        #  edit

        # handle code block
        for div in soup.select('div[class^=highlight-]'):
            self.handle_highlight(div, soup)

        # kill useless navigation
        for div in soup.select('div.related'):
            for child in div.children:
                if child.name in ['h1', 'h2', 'h3', 'h4', 'h5']:
                    if child.text == 'Navigation':
                        div.decompose()
                        break

        # kill table of content navigation
        for div in soup.select('div.sphinxsidebarwrapper'):
            for child in div.children:
                if child.name in ['h1', 'h2', 'h3', 'h4', 'h5']:
                    if child.text == 'Table Of Contents':
                        div.decompose()
                        break

        # filter and edit tags
        for tag in soup():
            if tag.name not in ALLOWED_TAGS:
                if tag.name in MERCiFUL_TAG:
                    tag.replace_with_children()
                    continue
                tag.decompose()
                continue
            final_attr = dict()
            if tag.has_attr('id'):
                final_attr['id'] = tag['id']
            if tag.has_attr('style'):
                final_attr['style'] = tag['style']

            result = getattr(self, 'handle_' + tag.name,
                             self.handle_default)(tag, final_attr)
            if result is False:
                tag.decompose()
                continue
            elif result is True:
                continue
            tag.attrs.clear()
            tag.attrs.update(final_attr)

        self.output_dom(soup.html)
        self.dom.seek(0)
        soup.decompose()
Example #43
def scrape_bfi_films(voters_list, filmid_manual_dict):

    # initialize lists of films with header labels
    film_list = [['filmid', 'title', 'director', 'country', 'year', 'genre', 'type', 'category']]

    # add manual filmids to list
    for k,v in filmid_manual_dict.items():
        film_list.append([v[0], k.encode('UTF-8'), v[2].encode('UTF-8'), '', v[1].encode('UTF-8'), '', '', ''])

    # get list of unique filmids from voter_list
    filmid_list = []
    for i in voters_list:
        for j in i[5:-1]: filmid_list.append(j)
    filmid_list = set(filmid_list)

    # visit each of the film webpages
    for filmid in filmid_list:
        if str(filmid)[0] != '4': continue
        film_soup = BeautifulSoup(requests.get(film_url+str(filmid)).content, 'lxml')

        # extract film title and append with film id
        film_info = [filmid, film_soup.find('title').contents[0].split('(')[0].strip().encode('UTF-8')]

        # extract director(s)
        try:
            film_info.append(" & ".join([director.text for director in film_soup.find('p', text=re.compile('Director.*'), attrs={'class':'row-label'}).findNext('p').findAll('a')]).encode('UTF-8'))
        except:
            film_info.append('')

        # extract country(ies)
        try:
            film_info.append(" & ".join([country.text for country in film_soup.find('p', text=re.compile('Countr.*'), attrs={'class':'row-label'}).findNext('p').findAll('span')]).encode('UTF-8'))
        except:
            film_info.append('')

        # extract year, genre, type, and category
        for k in ['Year', 'Genre', 'Type', 'Category']:
            try:
                film_info.append(film_soup.find('p', text=k, attrs={'class':'row-label'}).findNext('p').find('span').contents[0].encode('UTF-8'))
            except:
                film_info.append('')

        # append info on this single film to the list of all films
        film_soup.decompose()
        film_list.append(film_info)
        print(film_info)

    # write film info to csv
    
    with open(csv_dir+'/bfi-films.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(film_list)
        f.close()

    return film_list
Example #44
    def get_rows(self, html_path):
        """Return rows for locations."""
        html_file = open(html_path, 'rb')
        soup = BeautifulSoup(html_file.read(), "html.parser")

        rows = self.parse_rows(soup)

        soup.decompose()
        html_file.close()

        return rows
Example #45
    def get_top_movie_list(self, source_url):
        """
        Fetch the list of the top 250 recommended classic movies
        :param source_url: source URL
        :return:
        """
        curl = pycurl.Curl()
        curl.setopt(pycurl.USERAGENT, user_agent)
        curl.setopt(pycurl.REFERER, refer_url)

        page = 0
        extra_url = ""
        target_list = []

        while 1:
            print 'Processing page %d' % (page + 1)
            target_url = source_url + extra_url
            buffers = StringIO()
            curl.setopt(pycurl.URL, target_url)
            curl.setopt(pycurl.WRITEDATA, buffers)
            curl.perform()

            body = buffers.getvalue()
            buffers.close()
            soup = BeautifulSoup(body, "html.parser")
            content = soup.find('div', {'id': 'content'})
            soup.decompose()
            clear_fix = content.find('div', {'class': 'article'})
            subject_list = clear_fix.find('ol', {'class': 'grid_view'}).findAll('li')
            # probe the number of pages by trial
            if not subject_list:
                break
            for item in subject_list:
                # get the movie name and rating
                name = replace_pattern.sub('', item.find('div', {'class': 'hd'}).find('a').text)
                # handle the special case of fewer than 10 ratings
                rate_0 = item.find('div', {'class': 'star'}).find('span', {'class': 'rating_num'})
                if not rate_0:
                    continue
                rates = replace_pattern.sub('', rate_0.text)
                target_list.append((name, rates))

            page += 1
            extra_url = "?start=%d&filter=" % (25 * page)

        print('Finished processing the last page')
        curl.close()
        file_name = '%s.txt' % u"top250电影集合"
        self.write_to_file(target_list, u"top250电影/", file_name)
Example #46
    def get_book_list(self, category):
        """
        Fetch the book list for the given category
        :param category: category name
        :return:
        """
        curl = pycurl.Curl()
        curl.setopt(pycurl.USERAGENT, user_agent)
        curl.setopt(pycurl.REFERER, refer_url)

        page = 0
        while 1:
            print 'Processing page %d' % (page + 1)
            url = 'http://book.douban.com/tag/%s?start=%d' % (category, page * 20)
            buffers = StringIO()
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.WRITEDATA, buffers)
            curl.perform()

            body = buffers.getvalue()
            buffers.close()
            soup = BeautifulSoup(body, "html.parser")

            content = soup.find('div', {'id': 'subject_list'})
            soup.decompose()
            # probe the number of pages by trial
            clear_fix = content.find('div', {'class': 'clearfix'})
            if not clear_fix:
                break
            subject_list = content.find('ul', {'class': 'subject-list'}).findAll('li', {'class': 'subject-item'})
            target_list = []
            for item in subject_list:
                # get the book title, rating, and publication info
                name = replace_pattern.sub('', item.find('h2').find('a').text)
                pub = replace_pattern.sub('', item.find('div', {'class': 'pub'}).text)
                # handle the special case of fewer than 10 ratings
                rate_0 = item.find('div', {'class': 'star clearfix'}).find('span', {'class': 'rating_nums'})
                if not rate_0:
                    continue
                rates = replace_pattern.sub('', rate_0.text)
                # keep books rated above 8.5
                if float(rates) > 8.5:
                    target_list.append((name, rates, pub))
            file_name = '%s%d.txt' % (category, page + 1)
            page += 1
            self.write_to_file(target_list, '%s/' % category, file_name)

        curl.close()
        print('Finished processing the last page')
Example #47
    def parse_html_(self, text):
        """
        Helper function for dealing with an HTML document
        """
        soup  = BeautifulSoup(text, 'lxml')
        title = soup.find('title').text
        body  = self.parse_body_(soup)

        # Get rid of the soup
        soup.decompose()
        del soup

        return {
            'title': title,
            'body': body
        }
Example #48
def crawl_cate(cate_url, f):
    request = urllib2.Request(cate_url, "", header)
    cate_page = urllib2.urlopen(request)
    shoplist_ss = SoupStrainer('a', attrs={"class": "BL", "href": re.compile('shop')})
    shoplist = BeautifulSoup(cate_page, parseOnlyThese=shoplist_ss)
    i = 0
    for a in shoplist:
        i = i + 1
        print "shop:%d" % i
        shop_url = "http://www.dianping.com/%s" % a.get('href')
        if shop_url in shop_list:
            continue
        else:
            shop_list[shop_url] = 1
        f.write("%s\t%s\t" % (a.string.encode('utf-8'), shop_url.encode('utf-8')))
        crawl_shop(shop_url, f)
    shoplist.decompose()
Example #49
    def get_rows(self, html_path):
        """
        Return rows for details table HTML.

        :param html_path: A path to a sale file. Ex. '/path/OPR123456789.html'
        :type html_path: string
        :returns: A list of the rows in the details table.
        """
        html_file = open(html_path, 'rb')
        soup = BeautifulSoup(html_file.read(), "html.parser")

        rows = self.parse_rows(soup)

        soup.decompose()
        html_file.close()

        return rows
Example #50
def crawlPage(site, title, maxDepth, pages, links, restricted = False, siteBase = ""):
    global titles
    
    try:
        print("Crawling " + site + ", with maxDepth = " + str(maxDepth))
        http = httplib2.Http()
        status, response = http.request(site)

        soupPage = BeautifulSoup(response, "html.parser", parse_only=SoupStrainer('a'))
        for link in soupPage:
            if link.has_attr('href'):
                linkedPage = link['href']
                linkedPage = urljoin(site, linkedPage)
                print("Getting title for " + linkedPage)
                
                try:
                    if not linkedPage in titles:
                        soup = BeautifulSoup(urllib2.urlopen(linkedPage), "html.parser")
                        linkTitle = soup.title.string
                        soup.decompose()
                        #titles[linkedPage] = linkTitle
                        
                    else:
                        linkTitle = titles[linkedPage]

                    links.add((title, linkTitle))
                    if not linkTitle in pages and not "youtube" in linkedPage and not (restricted and not siteBase in linkedPage):
                        pages.add(linkTitle)
                        if (maxDepth > 1):
                            crawlPage(linkedPage, linkTitle, maxDepth-1, pages, links, restricted, siteBase)

                except Exception as e:
                    print("Error parsing " + linkedPage + "! {0}".format(e))
                    links.add((title, linkedPage[linkedPage.find("http://")+7:]))
                    if not linkedPage[linkedPage.find("http://")+7:] in pages and not (restricted and not siteBase in linkedPage):
                        pages.add(linkedPage[linkedPage.find("http://")+7:])
                        if (maxDepth > 1):
                            crawlPage(linkedPage, linkTitle, maxDepth-1, pages, links, restricted, siteBase)

                #pages.add(linkedPage)
        soupPage.decompose()
    except Exception as e:
        print ("Error on site " + site + ": {0}".format(e))
    gc.collect()
Example #51
    def SpiderBlogWeb(self, blogURL, refererWeb, msg):
        headers = Util.GetHeaders(Host=self.HostURL, Referer=refererWeb)
        req = Util.RequestURLByGet(blogURL, headers, self.cookies)
        print msg
        if req:
            soup = BeautifulSoup(req.text, 'lxml')
            elements= soup.findAll(src=self.re_pattern_blogimg)
            threads = []
            for ele in elements:
                photoURL = ele.get('src')
#                 needDownload = Util.DownloadFile(photoURL), self.PhotoPath)
                threads.append(gevent.spawn(Util.DownloadFile, photoURL, self.PhotoPath, True))
#                 print str(needDownload),  photoURL
            gevent.joinall(threads)
            soup.decompose()
#             self.fileBlog.write(blogURL + '\n')
#             if len(elements) > 0:
#                 print ''
        time.sleep(random.uniform(self.SleepTimeMin, self.SleepTimeMax))
Example #52
  def parse(self):
    """ The main parse method. Need to call me before getting output. """
    self.result_storage = []

    for xml_file in self.xml:
        # read in the nessus xml file, turn it into a soup obj, and validate
        xml_obj = open(xml_file, 'r')
        xml_parser = BeautifulSoup(xml_obj,'xml')
        self.ValidateXML(xml_parser)

        # iterate through hosts found in report
        for target in xml_parser.find_all('ReportHost'):

          # skip hosts that are caught in the filter
          if self.FilterThisHost(target) == True: continue

          # turn host into a host object
          target_obj = HostObject(target, self.troll_user_check, 
                                  self.troll_link)
          
          # iterate over vulnerabilities of that host
          for vuln in target.find_all('ReportItem'):

              # filter out open ports and insert into HostObject 
              if ( vuln['pluginFamily'] == "Port scanners" 
                  and vuln['severity'] == "0"):
                target_obj.insert_report_item(vuln)

              # if the vulnerability is not filtered, feed into Hostobj
              elif self.FilterThisVuln(vuln) == False:
                target_obj.insert_report_item(vuln)
          # if any vulnerabilities exist in that host, append that to
          # a list of host objects
          if target_obj.vulns: 
              self.result_storage.append(target_obj)

        # dismantle the parser object and close the xml file to free memory
        xml_parser.decompose()
        xml_obj.close()

    # close out the link to the inventory system lookup DB
    self.troll_link.Close()
Example #53
def crawlpage(url,key,currdepth):
    currUrls = []
    append= currUrls.append
    join=urllib.parse.urljoin
    try:
            htmltext = urllib.request.urlopen(url)
            # 1 second delay as courtesy 
            time.sleep(1) 
            the_text = htmltext.read()
            # converting the text in lowercase for keyphrase search
            textStr = str(the_text).lower()                
    except:
            print('no idea what is happening ',  sys.exc_info())
         
    soup = BeautifulSoup(the_text)
    soup.prettify('utf-8')
    # get canonical link from the document
    canon = soup.find("link", {"rel":"canonical"})
    canonicalurl = canon['href']
    # we will be dealing with canonical url and not with the url given in document
    if canonicalurl not in visited and key.lower() in textStr:
        visited.add(canonicalurl)
        file.write(canonicalurl + '\n')
        # Design decision :
        # We will not need any links from pages at depth 3
        # so I will be skipping them.
        # This increases the speed of the program by at least 10%
        # Not a good idea for an actual crawler
        if currdepth !=2:
            for tag in soup.findAll('a', href=True):
                x= tag['href']
                link = join(url,x)
                # trim string from #
                # actually not needed because of canonical urls
                # used as an added performance enhancement
                link = link.split('#')[0]
                if validLink(link):
                    append(link)
    soup.decompose()
    htmltext.close()
    return currUrls            
Example #54
def fetch(url):
    items = []

    req = Request(url)
    req.add_header('User-Agent', UserAgent().random)

    conn = urlopen(req)
    document = BeautifulSoup(conn, 'html.parser')
    conn.close()

    for table in document.findAll('table', {'class': 'wikitable'}):
        for row in table.findAll('tr', {'id': True}):
            columns = row.findAll('td')

            item = {}

            item['name'] = columns[0]['data-sort-value']
            item['type'] = columns[1]['data-sort-value']
            item['level'] = columns[2].text

            try:

                description = []

                for modifier in columns[len(columns) - 1].div.findAll('span'):
                    for text_line in modifier.stripped_strings:
                        description.append(text_line)

                item['description'] = description

            except AttributeError:
                print('ERROR parsing item', item)
                item['description'] = []

            finally:
                items.append(item)

    document.decompose()

    return items
Example #55
class ReviewExtractor(object):
    """
    Wraps an XML parser to extract data particularly from each review in
    a Goodreads review.xml file. Uses BeautifulSoup to simplify parsing.
    """

    def __init__(self, path):
        self.path   = path
        self.stream = None
        self.soup   = None

    def open(self):
        self.stream = open(self.path, 'rb')
        self.soup   = BeautifulSoup(self.stream, 'xml')

    def close(self):
        if self.stream: self.stream.close() # Release file handle
        if self.soup: self.soup.decompose() # Drop the XML out of memory
        self.stream = None                  # Force garbage collection
        self.soup   = None                  # Force garbage collection

    def __enter__(self):
        """
        Open a stream to the wrapped xml file and return the extractor for
        use in contextual with ... as statements (and ensure close).
        """
        self.open()
        return self

    def __exit__(self, type, value, tb):
        """
        Ensure any open streams are closed before exiting a context block.
        """
        self.close()

    def __iter__(self):
        if not self.soup: raise Exception("No handle to an xml soup object!")
        for review in self.soup.find_all('review'):
            yield Review(review)
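# A minimal usage sketch for ReviewExtractor, assuming a Goodreads-style
# review.xml dump at a hypothetical path ('data/review.xml'); Review is the
# wrapper class yielded by __iter__ above.
with ReviewExtractor('data/review.xml') as extractor:
    for review in extractor:
        print(review)  # replace with real per-review processing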
Example #56
0
    def get_newznab_categories(self):
        """
        Uses the newznab provider url and apikey to get the capabilities.
        Makes use of the default newznab caps param. e.a. http://yournewznab/api?t=caps&apikey=skdfiw7823sdkdsfjsfk
        Returns a tuple with (succes or not, array with dicts [{"id": "5070", "name": "Anime"},
        {"id": "5080", "name": "Documentary"}, {"id": "5020", "name": "Foreign"}...etc}], error message)
        """
        return_categories = []

        if not self._check_auth():
            return False, return_categories, "Provider requires auth and your key is not set"

        params = {"t": "caps"}
        if self.needs_auth and self.key:
            params['apikey'] = self.key

        url = posixpath.join(self.url, 'api?') + urlencode(params)
        data = self.get_url(url)
        if not data:
            error_string = u"Error getting xml for [%s]" % url
            logger.log(error_string, logger.WARNING)
            return False, return_categories, error_string

        data = BeautifulSoup(data, 'html5lib')
        if not (self._checkAuthFromData(data) and data.caps and data.caps.categories):
            data.decompose()
            error_string = u"Error parsing xml for [%s]" % self.name
            logger.log(error_string, logger.DEBUG)
            return False, return_categories, error_string

        for category in data.caps.categories.find_all('category'):
            if category.attrs and 'TV' in category.attrs.get('name', '') and category.attrs.get('id', ''):
                return_categories.append({'id': category.attrs['id'], 'name': category.attrs['name']})
                for subcat in category.find_all('subcat'):
                    if subcat.attrs and subcat.attrs.get('name', '') and subcat.attrs.get('id', ''):
                        return_categories.append({'id': subcat.attrs['id'], 'name': subcat.attrs['name']})

        data.decompose()
        return True, return_categories, ""
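# A rough caller-side sketch; `provider` is an assumed, already-configured
# newznab provider instance and is not defined in the snippet above.
success, categories, error = provider.get_newznab_categories()
if success:
    tv_category_ids = ','.join(cat['id'] for cat in categories)
else:
    print('Could not fetch newznab categories: %s' % error)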
Example #57
0
    def get_newznab_categories(self):
        """
        Uses the newznab provider url and apikey to get the capabilities.
        Makes use of the default newznab caps param. e.a. http://yournewznab/api?t=caps&apikey=skdfiw7823sdkdsfjsfk
        Returns a tuple with (succes or not, array with dicts [{"id": "5070", "name": "Anime"},
        {"id": "5080", "name": "Documentary"}, {"id": "5020", "name": "Foreign"}...etc}], error message)
        """
        return_categories = []

        if not self._check_auth():
            return False, return_categories, "Provider requires auth and your key is not set"

        params = {"t": "caps"}
        if self.needs_auth and self.key:
            params["apikey"] = self.key

        url = ek(os.path.join, self.url, "api?") + urllib.urlencode(params)
        data = self.get_url(url)
        if not data:
            error_string = u"Error getting xml for [%s]" % url
            logger.log(error_string, logger.WARNING)
            return False, return_categories, error_string

        data = BeautifulSoup(data, "html5lib")
        if not (self._checkAuthFromData(data) and data.caps and data.caps.categories):
            data.decompose()
            error_string = u"Error parsing xml for [%s]" % self.name
            logger.log(error_string, logger.DEBUG)
            return False, return_categories, error_string

        for category in data.caps.categories.findAll("category"):
            if hasattr(category, "attrs") and "TV" in category.attrs["name"]:
                return_categories.append({"id": category.attrs["id"], "name": category.attrs["name"]})
                for subcat in category.findAll("subcat"):
                    return_categories.append({"id": subcat.attrs["id"], "name": subcat.attrs["name"]})

        data.decompose()
        return True, return_categories, ""
Example #59
0
def process(f):
	p = ET.Element('paper')
	fire = open(f)
	soup = BeautifulSoup(fire)
	dirname = os.path.dirname(f) + '/'
	outdir = dirname.replace('Nature', 'Nature_Processed')
	outfile = os.path.join(outdir, f.replace(dirname, '') + '.xml')
	
	try:
		t = ET.SubElement(p, 'title')
		t.text = soup.find(partial(get, "citation_title"))['content']
		
		dt = ET.SubElement(p, 'date')
		dt.text = soup.find(partial(get, "citation_date"))['content']
		
		d = ET.SubElement(p, 'doi')
		d.text = soup.find(partial(get, "citation_doi"))['content'][4:]
		
		a = ET.SubElement(p, 'authors')
		authors = None
		authors = soup.find_all(partial(get, "DC.creator"))
		if authors == []:
			authors = soup.find_all(partial(get, "dc.creator"))
		if authors == []:
			authors = soup.find(partial(get, "citation_authors"))['content']
			authors = [x.strip() for x in authors.split(',')]
		for aut in authors:
			au = ET.SubElement(a, 'author')
			# DC.creator matches are tags; citation_authors entries are plain strings
			au.text = aut if isinstance(aut, basestring) else aut['content']
	except:
		fire.close()
		total_failures.append(f)
		print "Basics wrong."
		return
	
	k = ET.SubElement(p, 'keywords')
	try:
		keywords = soup.find(partial(get, 'keywords'))['content'].split(',')
		keywords = map(lambda x: x.strip(), keywords)
		for keyword in keywords:
			if 'nature' in keyword:
				continue
			kwd = ET.SubElement(k, 'keyword')
			kwd.text = keyword
	except:
		pass
	
	k2 = ET.SubElement(p, 'article-keywords')
	keywords = soup.find(class_='article-keywords')
	if keywords is None:
		keywords = soup.find(class_="category")
	if keywords is not None:
		for keyword in [x for x in re.split(r'\s+', keywords.text) if x != '']:
			if ':' in keyword:
				continue
			kdw = ET.SubElement(k2, 'keyword')
			kdw.text = keyword.strip()

	ab = ET.SubElement(p, 'abstract')
	try:
		ab.text = soup.find(id="abs").text
	except:
		try:
			ab.text = soup.find(id="abstract").text
		except:
			ab.text = ''

	refs = soup.find_all(is_bib)
	r_tag = ET.SubElement(p, 'has-references')
	if refs != []:
		r_tag.text = 'Y'
	else:
		r_tag.text = 'N'
	r = ET.SubElement(p, 'references')
	for ref in refs:
		rf = ET.SubElement(r, 'reference')
		rft = ET.SubElement(rf, 'title')
		rfa = ET.SubElement(rf, 'authors')
		rfj = ET.SubElement(rf, 'journal')
		rfy = ET.SubElement(rf, 'year')
		rfd = ET.SubElement(rf, 'doi')
		rfu = ET.SubElement(rf, 'url')
		
		ref_sp = ref.text.strip()
		if ref_sp[0] == '.':
			ref_sp = ref_sp[1:].strip()
		authors_0 = []
		next_index = 0
		try:
			while True:
				if ref_sp[next_index] in [',','.',':',';']:
					authors_0.append(ref_sp[0:next_index])
					if ref_sp[next_index] in ['.',':',';']:
						ref_sp = ref_sp[(next_index + 1):].strip()
						break
					ref_sp = ref_sp[(next_index + 1):].strip()
					next_index = 0
				else:
					next_index += 1
		except:
			failures_0.append(f)
			continue
		# at this point we have removed the names
		journal_0 = ref.find(class_='journal')
		if journal_0 is None:
			failures.append(f)
			continue
		journal_0 = journal_0.text
		ind = ref_sp.find(journal_0)
		title_0 = ref_sp[0:ind]
		ref_sp = ref_sp[ind:]
		mtch = year_regex.search(ref_sp)
		try:
			year_0 = mtch.group(0)[:-1]
		except:
			year_0 = "-1"

		doi_0 = ''
		url_0 = ''
		links = ref.find_all(class_='reftxt')
		for link in links:
			mtch = doi_regex.search(link['href'])
			if mtch is None:
				continue
			else:
				doi_0 = mtch.group(0)
				break
		if doi_0 == '':
			try:
				url_0 = links[0]['href']
			except:
				pass

		rft.text = title_0
		rfj.text = journal_0
		rfy.text = year_0
		rfd.text = doi_0
		rfu.text = url_0
		for a_0 in authors_0:
			if 'et al' in a_0:
				continue
			rfau = ET.SubElement(rfa, 'author')
			rfau.text = a_0

	tree = ET.ElementTree(p)
	if not os.path.exists(outdir):
		os.makedirs(outdir)
	tree.write(outfile, pretty_print=True)
	fire.close()
	soup.decompose()
	tree = soup = None
	gc.collect()
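# The character-by-character author split in process() above could also be
# written as a single regex pass; this is a rough, hedged equivalent (not the
# original author's code), assuming names are comma-separated and the list
# ends at the first '.', ':' or ';'.
import re

def split_authors(ref_sp):
    m = re.match(r'(.*?)[.:;]', ref_sp)
    if m is None:
        raise ValueError('no author delimiter found')
    authors = [a.strip() for a in m.group(1).split(',') if a.strip()]
    rest = ref_sp[m.end():].strip()
    return authors, rest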
Example #60
0
def getwords(page, mdict, words, dref):
    pgc = SoupStrainer('dl')
    dl = BeautifulSoup(page, parse_only=pgc).dl
    formatcontent(dl)
    for a in dl.find_all('a', href=re.compile('http://www.etymonline.com/[^\.]+\.php$')):
        href = a['href']
        p = re.compile(r'/([^\.]+)\.php', re.I)
        m = p.search(href)
        assert m
        word = ''.join(['appendix-', m.group(1)])
        a['href'] = ''.join(['entry://', word])
        if word not in dref:
            print href
            dref[word] = None
            worddef = makeappdx(getpage(href, ''))
            words.append([word, worddef])
    dts = dl.find_all('dt')
    l = len(dts)
    dds = dl.find_all('dd')
    assert l==len(dds)
    for i in xrange(0, l):
        word = dts[i].a.string.strip()
        dd = dds[i]
        dd.name = 'div'
        dd['class'] = 'FRe'
        worddef = cleansp(dd.encode('utf8'))
        pos = word.find('(')
        prop = None
        if pos > 0:
            p = re.compile(r'\(((?:[a-zA-Z \,\.]+?)?)[\.,]?(\d*)\.?\)', re.I)
            m = p.search(word[pos:].replace('./', '., '))
            assert m
            prop = m.group(1).rstrip()
            if prop:
                prop += '.'
            worddef = [m.group(2), worddef]
            word = word[:pos].rstrip()
        if word in mdict:
            idx = mdict[word]
            df = words[idx][1]
            if isinstance(df, OrderedDict):
                if prop in df:
                    df[prop].append(worddef)
                else:
                    df[prop] = [worddef]
            else:
                if prop:
                    od = OrderedDict()
                    od[''] = [['', '<div class="tHO"></div>'.join([df, ''])]]
                    od[prop] = [worddef]
                    words[idx][1] = od
                else:
                    words[idx][1] = '<div class="tHO"></div>'.join([df, worddef])
        else:
            mdict[word] = len(words)
            if prop is not None:
                od = OrderedDict()
                od[prop] = [worddef]
                words.append([word, od])
            else:
                words.append([word, worddef])
    dl.decompose()
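# For illustration only (an assumption about the shapes involved, not code from
# the snippet): after getwords() has merged two senses of the same headword, the
# corresponding entry in `words` holds an OrderedDict keyed by part of speech,
# roughly like this.
from collections import OrderedDict

example_entry = ['bear', OrderedDict([
    ('v.', [['1', '<div class="FRe">...verb sense...</div>']]),
    ('n.', [['2', '<div class="FRe">...noun sense...</div>']]),
])]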