Example #1
    def grab_all(self):
        self._local_setup()
        self.next_url = 'http://portal.ruc.edu.cn/cas/login?service=http%3A%2F%2Fportal.ruc.edu.cn%2Fidc%2Feducation%2Fselectcourses%2Fresultquery%2FResultQueryAction.do%3Fmethod%3DforwardAllQueryXkjg'
        self._login()

        r_cookies = requests.post(self.next_url,
                                  cookies=self.cookies,
                                  verify=False)
        content = r_cookies.content.decode(self.charset)
        self.cookies = r_cookies.cookies
        '''parser, start.'''
        ''' - get colleges'''
        strainer_colleges = SoupStrainer("select", id="condition_yx")
        soup_colleges = BeautifulSoup(r_cookies.content.decode('gbk'),
                                      parse_only=strainer_colleges)
        colleges = [
            option['value'] for option in soup_colleges.select("option")
            if option['value']
        ]
        colleges_name = [
            option.get_text() for option in soup_colleges.select("option")
            if option['value']
        ]
        pretty_print(colleges_name)
        print "{0} colleges.".format(len(colleges))
        ''' - iter colleges'''
        total_courses = 0
        for i, college in enumerate(colleges):
            courses = []
            url_courses = 'http://portal.ruc.edu.cn/idc/education/selectcourses/resultquery/ResultQueryAction.do'
            '''get courses'''
            for j in range(1, 15):
                data = {
                    'method': "allJxb",
                    'condition_xnd': "2012-2013",
                    'condition_xq': "1",
                    'condition_yx': college.encode('gbk'),
                    'isNeedInitSQL': "true",
                    'ksj1': j,
                    'ksj2': j,
                }
                r_courses = requests.post(url_courses,
                                          data=data,
                                          cookies=self.cookies)
                content = r_courses.content.decode('gbk')

                soup_courses = BeautifulSoup(content)
                rows = soup_courses.find_all("row")

                if len(rows) == 1:
                    continue

                for r in rows:
                    teacher = r.select("xm")[0].get_text(strip=True).replace(
                        '/', ',')
                    time_and_location_texts = r.select("sksj > tagbr")

                    lessons = self.get_lessons(time_and_location_texts)

                    course = {
                        'original_id':
                        r.select("jxbh")[0].get_text(strip=True),
                        'name':
                        r.select("kcmc")[0].get_text(strip=True),
                        'credit':
                        str(float(r.select("xf")[0].get_text(strip=True))),
                        'teacher':
                        teacher,
                        'lessons':
                        lessons,
                    }
                    courses.append(course)

            print "#{0} {1}: {2} courses.".format(
                i, colleges_name[i].encode('utf8'), len(courses))
            if len(courses) == 0:
                continue
            total_courses += len(courses)
            output_dir = os.path.join(os.path.dirname(__file__), 'ruc')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if courses != []:
                with open(os.path.join(output_dir, colleges_name[i] + '.yaml'),
                          'w') as yaml_file:
                    yaml_file.write(pretty_format(courses))
        print "Done! Totally exported {0} courses.".format(total_courses)
Example #2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup, SoupStrainer
# from selenium import webdriver
import requests, re

##########
## http://stackoverflow.com/questions/25539330/speeding-up-beautifulsoup
session = requests.Session()
response = session.get(
    "https://www.treasury.gov/resource-center/sanctions/OFAC-Enforcement/Pages/OFAC-Recent-Actions.aspx"
)
# strainer = SoupStrainer("table")
strainer = SoupStrainer("table", {"class": "ms-rteTable-default"})

soup = BeautifulSoup(response.content, "lxml", parse_only=strainer)

# print(soup.get_text)
row_data = []
for row in soup.find_all("tr"):
    temp = []
    cols = row.find_all("td")
    cols = [ele.text.strip() for ele in cols]
    row_data.append(cols)
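A small hedged follow-up to the loop above: the header row holds <th> cells, so its cols list comes back empty and can be dropped before the data is used.

# Drop rows that yielded no <td> cells (e.g. the <th> header row) and preview the rest.
cleaned = [cols for cols in row_data if cols]
for cols in cleaned[:5]:
    print(" | ".join(cols))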
Example #3
 def __init__(self, page):
     only_body = SoupStrainer('body')
     self.dom = BeautifulSoup(page, 'html.parser', parse_only=only_body)
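A hedged standalone version of that constructor call, showing that only the <body> subtree survives the straining:

from bs4 import BeautifulSoup, SoupStrainer

page = '<html><head><title>t</title></head><body><p>hello</p></body></html>'
only_body = SoupStrainer('body')
dom = BeautifulSoup(page, 'html.parser', parse_only=only_body)
print(dom)  # <body><p>hello</p></body>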
Example #4
import os
from typing import Set
from urllib.parse import urljoin

import matplotlib
import requests
from PIL import Image, ImageDraw, ImageFont
from bs4 import BeautifulSoup, SoupStrainer

matplotlib.use('Agg')

from wordcloud import STOPWORDS, WordCloud

LINK_URL = "https://wikis.nyu.edu/plugins/pagetree/naturalchildren.action?decorator=none&excerpt=false&sort=position" \
           "&reverse=false&disableLinks=false&expandCurrent=true&hasRoot=true&pageId=20608012&treeId=0&startDepth=0" \
           "&mobile=false&ancestors=68296313&ancestors=20608012&treePageId=68296315&_=1504714430704"

only_wiki_links = SoupStrainer('div', id='children68296313-0')
only_main_content = SoupStrainer('div', id="main-content")
only_comments = SoupStrainer('div', id='comments-section')

with open(os.path.join(os.path.dirname(__file__),
                       'stopwords')) as stopwords_file:
    STOPWORDS |= set(x.strip() for x in stopwords_file.readlines())

__all__ = ['save_word_cloud']


def get_links(session: requests.Session) -> Set[str]:
    link_page = session.get(LINK_URL)

    link_soup = BeautifulSoup(link_page.content,
                              'lxml',
Example #5
from bs4 import BeautifulSoup

soup_take = BeautifulSoup(this_is_html, 'html.parser')

# If I want to scrape only small portions of the data
print("'''''''''''''''''''''''''''''''")


print('soup_take.get_text(): ', soup_take.get_text())

print("__________________SoupStrainer_____________________")

# Now import the SoupStrainer class


from bs4 import SoupStrainer

give_only = SoupStrainer(id="google")
print(BeautifulSoup(this_is_html, 'html.parser', parse_only=give_only))

print("_______________________________________")


give_only = SoupStrainer(id="lohit")

print(BeautifulSoup(this_is_html, 'html.parser', parse_only=give_only))

print("_______________________________________")

give_only = SoupStrainer(id="match")

print(BeautifulSoup(this_is_html, 'html.parser', parse_only=give_only))
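this_is_html is never defined in this excerpt; a hypothetical stand-in containing the three ids used above makes the snippet runnable end to end:

from bs4 import BeautifulSoup, SoupStrainer

# Hypothetical markup; the real this_is_html is whatever page the author was experimenting with.
this_is_html = """
<html><body>
  <p id="google">Google paragraph</p>
  <p id="lohit">Lohit paragraph</p>
  <p id="match">Match paragraph</p>
</body></html>
"""

for target_id in ("google", "lohit", "match"):
    give_only = SoupStrainer(id=target_id)
    print(BeautifulSoup(this_is_html, 'html.parser', parse_only=give_only))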
Example #6
def insert_anchor():

    paths = [
        "globals.html", "filemanfiles.html", "Packages_Namespace_Mapping.html",
        "filemansubfiles.html", "routines.html", "packages.html"
    ]
    i = 0
    while i < len(paths):
        stype = ''
        name = ''
        path_ = ''
        entry = OrderedDict()
        entries = OrderedDict()
        jsonEntries = []
        validate = True
        header = {"name": "Methods", "isHeader": validate}
        jsonEntries.append(header)
        page = open(os.path.join(output, paths[i]), 'r').read()
        from bs4 import SoupStrainer
        if paths[i] == 'packages.html':
            stype = 'Package'
        elif paths[i] == 'routines.html':
            stype = 'Method'
        elif paths[i] == 'globals.html':
            stype = 'Global'
        elif paths[i] == 'filemanfiles.html':
            stype = 'File'
            list_fileman = []
            bsFile = bs(page, parse_only=SoupStrainer('td'))
            for a in bsFile.find_all('a'):
                text_ = ''
                entry = OrderedDict()
                add_path = '//apple_ref/cpp/Method/'
                print('Running filemanfiles')
                name = urllib.unquote(a.get('href')).encode('utf-8')
                text_ = text_ + a.text
                text = text_
                path_ = urllib.unquote(name).encode('utf-8')
                if path_ in list_fileman: continue
                add_path += text
                list_fileman.append(path_)
                entry['name'] = text_
                entry['path'] = add_path
                entry['entryType'] = stype
                jsonEntries.append(entry)
                entries['entries'] = jsonEntries
                with open(os.path.join(output, paths[i]) + ".dashtoc",
                          "w") as json_file:
                    json.dump(entries, json_file)
                json_file.close()
                try:
                    if name != '':
                        cur.execute(
                            'INSERT INTO searchIndex(type, name, path) values(?, ?, ?)',
                            (stype, text_, name + '#' + add_path))
                        print 'index already uploaded'
                except sqlite3.IntegrityError as err:
                    print(err)
                    # sqlite3.IntegrityError: column bar is not unique
        elif paths[i] == 'filemansubfiles.html':
            stype = 'File'
            list_fileman = []
            bsFile = bs(page, parse_only=SoupStrainer('td'))
            for a in bsFile.find_all('a'):
                entry = OrderedDict()
                add_path = '//apple_ref/cpp/Method/'
                print('Running filemansubfiles')
                name = urllib.unquote(a.get('href')).encode('utf-8')
                path_ = urllib.unquote(name).encode('utf-8')
                if path_ in list_fileman: continue
                add_path += path_[:-len('.html')]
                list_fileman.append(path_)
                entry['name'] = name[:-len('.html')]
                entry['path'] = add_path
                entry['entryType'] = stype
                jsonEntries.append(entry)
                entries['entries'] = jsonEntries
                with open(os.path.join(output, paths[i]) + ".dashtoc",
                          "w") as json_file:
                    json.dump(entries, json_file)
                json_file.close()
                try:
                    if name != '':
                        cur.execute(
                            'INSERT INTO searchIndex(type, name, path) values(?, ?, ?)',
                            (stype, name[:-len('.html')],
                             name + '#' + add_path))
                        print 'index already uploaded'
                except sqlite3.IntegrityError as err:
                    print(err)
        else:
            stype = 'Namespace'
            list_fileman = []
            bsFile = bs(page, parse_only=SoupStrainer('td'))
            for a in bsFile.find_all('a'):
                entry = OrderedDict()
                add_path = '//apple_ref/cpp/Method/'
                print('Running Namespaces_Packages_Mapping')
                name = urllib.unquote(a.get('href')).encode('utf-8')
                path_ = urllib.unquote(name).encode('utf-8')
                if path_ in list_fileman: continue
                add_path += path_[:-len('.html')]
                list_fileman.append(path_)
                entry['name'] = name[:-len('.html')]
                entry['path'] = add_path
                entry['entryType'] = stype
                jsonEntries.append(entry)
                entries['entries'] = jsonEntries
                with open(os.path.join(output, paths[i]) + ".dashtoc",
                          "w") as json_file:
                    json.dump(entries, json_file)
                json_file.close()
                try:
                    if name != '':
                        cur.execute(
                            'INSERT INTO searchIndex(type, name, path) values(?, ?, ?)',
                            (stype, name[:-len('.html')],
                             name + '#' + add_path))
                        print 'index already uploaded'
                except sqlite3.IntegrityError as err:
                    print(err)
        bsFile = bs(page, 'html5lib')
        for a in bsFile.find_all('a', attrs={'class': 'el'}):
            entry = OrderedDict()
            add_path_global = ''
            name = urllib.unquote(a.get('href')).encode('utf-8')
            text = a.text
            add_path = '//apple_ref/cpp/Method/'
            without_html = name[:-len('.html')]
            add_path_global += add_path + text
            add_path += urllib.unquote(without_html).encode('utf-8')
            entry["name"] = name[:-len('.html')]
            entry["path"] = add_path
            if stype == 'Global':
                entry['name'] = text
                entry['path'] = add_path_global
            entry["entryType"] = stype
            jsonEntries.append(entry)
            entries["entries"] = jsonEntries
            with open(os.path.join(output, paths[i]) + ".dashtoc",
                      "w") as json_file:
                json.dump(entries, json_file)
            json_file.close()
            jsonEntries_index = []
            entry = OrderedDict()
            entries_index = {}  #each entry type node of main sub-html file
            header_index = {"name": stype, "isHeader": validate}
            jsonEntries_index.append(header_index)
            raw_data = open(os.path.join(output, name), 'r')
            dom = fromstring(raw_data.read())
            indexmumps = dom.xpath('//p//span//a/@href')
            if not indexmumps: pass
            else:
                item = indexmumps[0]
                entry["path"] = indexmumps[0]
                entry["name"] = item[:-len('.html')]
                entry["entryType"] = stype
                entries_index["entries"] = entry


# #
# #     ##################################################################################
# #     ############# Validating the xpath to DOM by each href tag into each html ########
# #     ############# This should query DOM and validate stype for each Entry type #######
# #     ##################################################################################

            list_index = []
            for link in dom.xpath(
                    '//td//a/@href'
            ):  # select the url in href for all a tags(links)
                print('Index from ' + name + ' page: ' + link)
                entry = OrderedDict()
                if link in list_index: continue
                entry["name"] = link[:-len('.html')]
                entry["path"] = link
                list_index.append(link)
                if '#' in link: continue
                entry["entryType"] = stype
                jsonEntries_index.append(entry)
                entries_index["entries"] = jsonEntries_index
                with open(os.path.join(output, path_) + ".dashtoc",
                          "w") as json_index:
                    json.dump(entries_index, json_index)
                json_index.close()

    # ##################################################################################
    # ############# I want to parse and change the html with this anchor tag to ########
    # ############# identify query and Tree Explorer Interface from Zeal ###############
    # ############# Application how??????? #############################################
    # ##################################################################################

    ##################################################################################
    ############# Also I want to complete the html with the rest anchor tag   ########
    ############# and also the rest of the html document including   #################
    ############# with the "name", "path", and "entryType" included    ###############
    ############# also save the databases ############################################
    ##################################################################################

            try:
                if stype == 'Global':
                    cur.execute(
                        'INSERT INTO searchIndex(type, name, path) values(?, ?, ?)',
                        (stype, text, name + '#' + add_path_global))

                    continue
                cur.execute(
                    'INSERT INTO searchIndex(type, name, path) values(?, ?, ?)',
                    (stype, name[:-len('.html')], name + '#' + add_path))
                print 'index already uploaded'
            except sqlite3.IntegrityError as err:
                print(err)
        i += 1
Example #7
 def parse_html_links(self, html):
     return BeautifulSoup(html,
                          parse_only=SoupStrainer('a'),
                          features='html.parser')
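A hedged usage sketch of the method above, rewritten as a standalone function so it can be run directly:

from bs4 import BeautifulSoup, SoupStrainer

def parse_html_links(html):
    # Same call as the method above, minus the enclosing class.
    return BeautifulSoup(html, parse_only=SoupStrainer('a'), features='html.parser')

links_soup = parse_html_links('<p>intro</p><a href="/one">one</a> <a href="/two">two</a>')
print([a['href'] for a in links_soup.find_all('a')])  # ['/one', '/two']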
Example #8
#(ii) soup = BeautifulSoup(html_markup, "lxml")
#(iii) soup = BeautifulSoup(html_markup, "lxml", parse_only=SoupStrainer("a"))

#The BeautifulSoup constructor plays an important part and we will explore some of its important parameters
#here:
#(i) markup: The first parameter passed to the constructor accepts a string or object to be parsed
#(ii) features: The name of the parser or type of markup to be used. The parser can be lxml,
#lxml-xml, html.parser, or html5lib. If we just want to parse some HTML, we can simply pass the markup to
#BeautifulSoup and it will use the appropriate parser installed accordingly.
#(iii) parse_only: Accepts a bs4.SoupStrainer object, that is, only parts of the document matching the
#SoupStrainer object will be used to parse.

#In this example we will create the soupA object using lxml as the parser, along with the SoupStrainer
#object tagsA -> parsing only <a>, that is, the anchor (<a>) tags of the HTML

tagsA = SoupStrainer("a")
soupA = BeautifulSoup(html_doc, 'lxml', parse_only=tagsA)
soup = BeautifulSoup(html_doc, 'lxml')

#The .prettify() function returns a Unicode string and presents the string in a clean, formatted structure
#that is easy to read

soupA  #print
soupA.prettify()  #print

#Document-based elements (such as HTML tags) in a parsed tree can have various attributes with predefined
#values. Verifying whether the element contains certain attributes can be handy when traversing the tree
#Remember->soupA.a returns the first <a> element or tag found in the html_doc

soupA.a.has_attr("class")
soupA.a.has_attr("name")
Example #9
File: ly.py Project: 19manu98/Ly
        else:
            # initialise logging configs; note: only the latest lyric's log is saved since filemode='w'
            logging.basicConfig(
                level=logging.DEBUG,
                format='%(asctime)s %(levelname)-8s\n\n%(message)s\n',
                datefmt='%a, %d %b %Y %H:%M:%S',
                filename='/tmp/ly.log',
                filemode='w')

            # get the first 7 links from DuckDuckGo search engine.
            res = urllib.request.urlopen('https://duckduckgo.com/html/?q=' +
                                         '+'.join(sys.argv[1:]) +
                                         '+lyrics azlyrics').read()
            soup = BeautifulSoup(res,
                                 'html.parser',
                                 parse_only=SoupStrainer(
                                     'a', {'class': 'result__snippet'}))
            results = soup.find_all('a', limit=7)
            visited = []
            # get the reconstructed 'https://www.azlyrics.com*' url if available.
            url_info = None
            for tag in results:
                parsed = urllib.parse.urlparse(tag['href'])
                temp = urllib.parse.parse_qs(parsed.query)['uddg'][0]
                visited.append(temp)  # appending visited url for logging
                match = re.search('azlyrics..*\/lyrics', temp)
                if match:
                    url_info = temp, URLS[match.group()]
                    break

            if url_info:
                lyrics = get_lyrics(url_info)
Example #10
    def sources(self, data, hostDict, hostprDict):
        try:
            isMovie = (data['type'] == 'movie')
            episode = data.get('episode', '')
            pageURL = data['pageURL']
            stringConstant = data['sConstant']

            session = self._createSession(data['UA'], data['cookies'])

            xbmc.sleep(1200)
            r = self._sessionGET(pageURL, session)
            if not r.ok:
                self._logException('%s Sources page request failed' %
                                   data['type'].capitalize())
                return None
            pageHTML = r.text
            timeStamp = self._getTimeStamp(pageHTML)

            # Get a HTML block with a list of host names and internal links to them.

            session.headers[
                'Referer'] = pageURL  # Refer to this page that "we're on" right now to avoid suspicion.
            pageID = pageURL.rsplit('.', 1)[1]
            token = self._makeToken({'ts': timeStamp}, stringConstant)
            xbmc.sleep(200)
            serversHTML = self._getServers(pageID, timeStamp, token, session)

            # Go through the list of hosts and create a source entry for each.

            sources = []
            tempTokenData = {
                'ts': timeStamp,
                'id': None,
                'server': None,
                'update': '0'
            }
            baseInfoURL = self.BASE_URL + self.INFO_PATH

            soup = BeautifulSoup(serversHTML,
                                 'html.parser',
                                 parse_only=SoupStrainer('div', {
                                     'class': 'server row',
                                     'data-id': True
                                 },
                                                         recursive=False))
            for serverDIV in soup:
                tempTokenData['server'] = serverDIV['data-id']
                hostName = serverDIV.label.text.strip().lower()
                hostName = self.DEBRID_HOSTS.get(hostName, hostName)

                for a in serverDIV.findAll('a', {'data-id': True}):
                    # The text in the <a> tag can be the movie quality ("HDRip", "CAM" etc.) or for TV shows
                    # it's the episode number with a one-zero-padding, like "09", for each episode in the season.
                    label = a.text.lower().strip()
                    hostID = a[
                        'data-id']  # A string identifying a host embed to be retrieved from putlocker's servers.

                    if isMovie or episode == str(int(label)):
                        if isMovie:
                            if 'hd' in label:
                                quality = 'HD'
                            else:
                                quality = 'SD' if ('ts' not in label and 'cam'
                                                   not in label) else 'CAM'
                        else:
                            quality = 'SD'

                        tempTokenData['id'] = hostID
                        tempToken = self._makeToken(tempTokenData,
                                                    stringConstant)

                        # Send data for the resolve() function below to use later, when the user plays an item.
                        # We send the CF cookies from the session (instead of reusing them from data['cfCookies'])
                        # because they might've changed.
                        unresolvedData = {
                            'url':
                            baseInfoURL % (timeStamp, tempToken, hostID,
                                           tempTokenData['server']),
                            'UA':
                            data['UA'],
                            'cookies':
                            session.cookies.get_dict(),
                            'referer':
                            pageURL + '/' + hostID
                        }
                        sources.append({
                            'source': hostName,
                            'quality': quality,
                            'language': 'en',
                            'url':
                            unresolvedData,  # Doesn't need to be a string, just repr()-able.
                            'direct': False,
                            'debridonly': False
                        })
            return sources
        except:
            self._logException()
            return None
Example #11
__author__ = "Samaun Ibna Faiz"

import json
from urllib import request
from bs4 import BeautifulSoup, SoupStrainer

################################################
# Important conference event dates/deadlines   #
################################################

source = 'https://acl2020.org/'

page_content = SoupStrainer('section', class_='page__content')
soup = BeautifulSoup(request.urlopen(source),
                     'html.parser',
                     parse_only=page_content)

important_dates = [{
    'Event': (c := r.find_all('td'))[0].text,
    'day': c[1].text.replace('\u2013', '-'),
    'date': c[2].text.replace('\u2013', '-')
} for r in soup.find('h2', {
    'id': 'dates'
}).find_next_sibling('center').select('table tbody tr')]

print(json.dumps(important_dates, indent=4))
################################################
# Accepted tutorials list                      #
################################################

source = 'https://acl2020.org/program/tutorials/'
Example #12
    def handle_one_page(self, driver):
        """重载父类方法,实现具体的爬虫操作"""

        url = self.entrance_url
        keyword = self.product_type
        driver.get(url)
        time.sleep(10)
        print "Inittial Page:", url
        # driver = self.submit_initial_url(driver, "//input[@type='text']", "gh-btn", keyword) # ebay
        #driver = self.submit_initial_url(driver, "//input[@type='search']", "search-button", keyword)

        driver.find_element_by_xpath("//input[@type='search']").clear()
        driver.find_element_by_xpath("//input[@type='search']").send_keys(
            keyword)

        # Get the search button element and click it
        # elem = driver.find_element_by_id(submit_key) # this pattern is used for ebay / ebuyer
        elem = driver.find_element_by_xpath(
            '//*[@id="hFull"]/div[2]/div[1]/button')
        elem.click()

        time.sleep(20)  # pause briefly so the page has time to navigate before continuing
        print "Get Crawler Home Page:", driver.current_url

        i = 0
        while i < 500:
            # Fetch the HTML document of the current page
            response_html = self.get_htmlcontent(driver.current_url)
            try:
                if response_html.status_code != 200:
                    print "Get status_code, but Exception:response_html.status_code=", response_html.status_code
                    break
            except:
                print "Exception:response_html.status_code=", response_html.status_code
                break

            # Strain only the content section of the document to speed up parsing
            html_part_id_value = "lpBloc"
            # only_content_tags = SoupStrainer("ul", id=html_part_id_value)
            only_content_tags = SoupStrainer(id=html_part_id_value)
            html_part_content = BeautifulSoup(
                response_html.text,
                "html.parser",
                parse_only=only_content_tags).prettify()

            # Parse all the links we need
            soup = BeautifulSoup(html_part_content,
                                 "html.parser",
                                 from_encoding="utf-8")
            # links = soup.find_all('a', class_='jsQs', href=re.compile(self.product_type, re.I))
            links = soup.find_all('a',
                                  class_='jsQs')  # , href=re.compile("Phone")
            for link in links:
                new_url = link['href']
                self.handle_result_url(new_url, keyword, i)
                time.sleep(10)

            i = i + 1

            # current_page = "a.pg  curr"
            # print "The ", driver.find_element_by_css_selector(current_page).text, " Has Finished"

            try:
                # nextPage = "a.gspr.next"
                # driver.find_element_by_css_selector(nextPage).click() #ebay

                # driver.find_element_by_xpath("//*[@id='main-content']/div/div[1]/div[2]/div[1]/ul/li[6]/a").click()  # Cdicount

                nextPage = "a.jsNxtPage.pgNext"
                driver.find_element_by_css_selector(nextPage).click()
                print driver.current_url
                time.sleep(20)
            except:
                print "Exception:Get Next page Fail", response_html.status_code
                break

        driver.quit()
        self.db.close()
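A hedged note on the double parse in handle_one_page: straining, prettifying, and re-parsing can be collapsed into a single strained parse that is searched directly. A self-contained sketch with a hypothetical page fragment standing in for response_html.text:

from bs4 import BeautifulSoup, SoupStrainer

# Hypothetical fragment of the listing page; only the id="lpBloc" block is kept by the strainer.
html_text = '<ul id="lpBloc"><li><a class="jsQs" href="/item1">item 1</a></li></ul><a href="/other">other</a>'

only_content_tags = SoupStrainer(id="lpBloc")
soup = BeautifulSoup(html_text, "html.parser", parse_only=only_content_tags)
links = soup.find_all('a', class_='jsQs')
print([link['href'] for link in links])  # ['/item1']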
Example #13
    bibtex = bibtex.strip()

    # figure out which Journal (if any) this is
    PDFURL = None
    match = re.search(r'[jJ]ournal\s*=\s*\{(.*?)\}', bibtex)
    Journal = ''
    if match:
        Journal = match.group(1)

    # get PDF for Wind Energy
    if Journal == 'Wind Energy' or Journal == 'Wind Energ.':
        PDFURL = 'http://onlinelibrary.wiley.com/doi/' + doi + '/pdf'

        # need to do additional parsing to get directly link to PDF
        r = requests.get(PDFURL)
        only_iframe = SoupStrainer('iframe', {'id': 'pdfDocument'})
        webpage = BeautifulSoup(r.text, parse_only=only_iframe)
        if webpage.iframe is not None:
            PDFURL = webpage.iframe['src']

    # [INSERT HERE: if you want to try to auto link a PDF from some other journal
    #   follow the example above for Wind Energy.  I've already parsed out the
    #   journal name.  You could potentially parse out other bits of info from the
    #   BibTeX as search criteria. ]

    # show bibtex
    sys.stdout.write(bibtex)

elif action == 'url':

    call(['open', 'http://dx.doi.org/' + doi])
Example #14
    def __init__(self, markup='lxml', is_async=True):
        self.is_async = is_async

        parser = self.get_parser()
        parser.add_argument('--include_comments',
                            help='include comments',
                            action='store_true')
        parser.add_argument('--comments_per_page',
                            help='comments per page to be crawled',
                            default=40,
                            type=int)
        parser.add_argument('--gallery_id',
                            help='specify gallery id such as: cat, dog',
                            default='cat',
                            type=str)
        parser.add_argument('--init_post_id',
                            help='initial post_id to start crawling',
                            default=0,
                            type=int)
        parser.add_argument('--final_post_id',
                            help='final post_id to stop crawling',
                            default=10000,
                            type=int)
        parser.add_argument('--forever',
                            help='try crawling for forever',
                            action='store_true')
        parser.add_argument('--timeout',
                            help='crawling timeout per request',
                            default=5,
                            type=float)
        parser.add_argument(
            '--interval',
            help='crawling interval per request to prevent blocking',
            default=0.5,
            type=float)
        parser.add_argument(
            '--metadata_to_dict',
            help='return metadata into dictionary type',
            action='store_true',
        )
        parser.add_argument('--filename',
                            help="filename to be saved.",
                            default="gallery.txt")

        self.options, _ = parser.parse_known_args()
        self._session = requests.Session()
        self._markup = markup
        self._view_url = 'http://gall.dcinside.com/board/view'
        self._comment_view_url = 'http://gall.dcinside.com/board/view'
        self._current_post_id = self.options.init_post_id

        self._strainer = SoupStrainer(
            'div',
            attrs={
                'class': [
                    're_gall_top_1',  # title, author, timestamp
                    'btn_recommend',  # upvote / downvote buttons
                    'gallery_re_title',  # comments
                    's_write',  # post body
                ]
            })
        # Custom header is required in order to request.
        self.header = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
        }
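A hedged check of the class-list strainer used above: passing a list of class names to SoupStrainer matches a div carrying any one of them.

from bs4 import BeautifulSoup, SoupStrainer

# Minimal hypothetical markup exercising two of the four class names listed above.
html = ('<div class="s_write">post body</div>'
        '<div class="unrelated">skipped</div>'
        '<div class="btn_recommend">up / down</div>')

strainer = SoupStrainer('div', attrs={'class': ['s_write', 'btn_recommend']})
print(BeautifulSoup(html, 'html.parser', parse_only=strainer))
# -> <div class="s_write">post body</div><div class="btn_recommend">up / down</div>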
Example #15
fifthDeclensionEndingsMFSg = ['es', 'ei', 'ei', 'em', 'e', 'es', 'e']
fifthDeclensionEndingsMFPl = ['es', 'erum', 'ebus', 'es', 'ebus', 'es', 'ebus']

client = MongoClient()
db = client.LATIN_DICT
#db.words.delete_many({})
#result = db.words.insert_one({ "puella" :["puella, puellae F"]})

for character in letters:
    http = httplib2.Http()
    status, response = http.request(
        'http://latin-dictionary.net/list/letter/' + character)
    for counter, word in enumerate(
            BeautifulSoup(response,
                          parseOnlyThese=SoupStrainer('li',
                                                      {'class': 'word'}))):
        if counter % 200 == 0:
            print counter
        tmp = word.contents
        link = word.contents[0]['href']
        #print link
        if len(tmp) == 2:
            words = re.split(
                '\s+', tmp[0].get_text().strip().replace(',', '').replace(
                    '(', '').replace(')', '').replace('.', ''))
            wordType = tmp[1].strip()

            # handle nouns
            if wordType == 'n':
                if len(words) != 2:
                    print words
Example #16
            'Connection attempted a redirect (fcrov_data.py). Trying again in 2 minutes...'
        )

        t = time.ctime()
        print(t)

        time.sleep(120.0 - ((time.time() - starttime) % 120.0))

    else:

        print('Connection successful (fcrov_data.py).')

        ########## Begin processing response from HTTP request

        # Create filter with SoupStrainer to limit parsing to main div | This id may change, watch out
        res_filter = SoupStrainer('div', {'id': 'gems_results'})

        # Grab the strained soup
        soup = BeautifulSoup(res.content, 'lxml', parse_only=res_filter)

        ########### Cooking soup / breaking down content

        soup.p.wrap(soup.new_tag("table"))
        soup.p.wrap(soup.new_tag("tr"))
        soup.p.wrap(soup.new_tag("td"))

        # Create an array of tag attributes to remove
        REMOVE_ATTRIBUTES = [
            'style', 'style', 'class', 'border', 'align', 'valign',
            'cellpadding', 'cellspacing', 'colspan', 'width'
        ]
Example #17
    def list_contents2(self):
        if DEBUG:
            self.log('content_list2()')
        if self.parameters('key') == 'showing':
            page_data = fetch(SHOWING_URL).text
            tlink = SoupStrainer('div', {'id': 'main'})
        else:
            year, month, _ = datetime.date.today().isoformat().split('-')
            page_data = ''
            nyear = int(year)
            for i in range(4):
                nmonth = int(month) + i
                if nmonth > 12:
                    nmonth = nmonth - 12
                    nyear = int(year) + 1
                url = COMING_URL.format(nyear, nmonth)
                page_data += fetch(url).text
            tlink = SoupStrainer('div', {'class': 'list detail'})

        mdiv = BeautifulSoup(page_data, "html.parser", parse_only=tlink)
        videos = mdiv.find_all('table')
        h = html_parser.HTMLParser()

        for video in videos:
            vdiv = video.find('a', {'itemprop': 'trailer'})
            if vdiv:
                videoId = vdiv.get('href').split('?')[0].split('/')[-1]
                plot = h.unescape(video.find(class_='outline').text).strip()
                tdiv = video.find(class_='image')
                icon = tdiv.find('img')['src']
                title = tdiv.find('img')['title']
                # imdb = tdiv.find('a')['href'].split('/')[-2]
                poster = icon.split('_')[0] + 'jpg'
                infos = video.find_all(class_='txt-block')
                director = []
                directors = infos[0].find_all('a')
                for name in directors:
                    director.append(name.text)
                cast = []
                stars = infos[1].find_all('a')
                for name in stars:
                    cast.append(name.text)
                labels = {'title': title,
                          'plot': plot,
                          # 'imdbnumber': imdb,
                          'director': director,
                          'cast': cast}
                try:
                    year = int(re.findall(r'\((\d{4})', title)[0])
                    title = re.sub(r'\s\(\d{4}\)', '', title)
                    labels.update({'title': title, 'year': year})
                except IndexError:
                    pass

                listitem = xbmcgui.ListItem(title)
                listitem.setArt({'thumb': poster,
                                 'icon': icon,
                                 'poster': poster,
                                 'fanart': _fanart})

                listitem.setInfo(type='video', infoLabels=labels)

                listitem.setProperty('IsPlayable', 'true')
                url = sys.argv[0] + '?' + urllib.parse.urlencode({'action': 'play',
                                                                  'videoid': videoId})
                xbmcplugin.addDirectoryItem(int(sys.argv[1]), url, listitem, False)

        # Sort methods and content type...
        xbmcplugin.setContent(int(sys.argv[1]), 'movies')
        xbmcplugin.addSortMethod(int(sys.argv[1]), xbmcplugin.SORT_METHOD_UNSORTED)
        xbmcplugin.addSortMethod(int(sys.argv[1]), xbmcplugin.SORT_METHOD_VIDEO_TITLE)
        if force_mode:
            xbmc.executebuiltin('Container.SetViewMode({})'.format(view_mode))
        # End of directory...
        xbmcplugin.endOfDirectory(int(sys.argv[1]), True)
Example #18
import requests
from bs4 import BeautifulSoup,SoupStrainer

url = 'http://en.wikipedia.org/wiki/Category:Crimes'

strain = SoupStrainer(id='mw-pages')
soup = BeautifulSoup(requests.get(url).text, parse_only=strain)
links = soup.find_all('a')

weird_shit = list(set([u'L\xe8se-majest\xe9',
              u'learn more',
              u"1788 Doctors' Riot",
              u'EAFCT',
              u'Qatl',u'TWOC',]))

crimes = sorted([ link.text for link in links if len(link.text) > 0
                    and link.text not in weird_shit ])


with open('crimes.txt', 'w') as f:
    f.write('\n'.join(crime for crime in crimes))
Example #19
                url=
                'https://shop.tcgplayer.com/productcatalog/product/getpricetable?'
                'captureFeaturedSellerData=True&pageSize=100&productId={0}'.
                format(product['productId']),
                headers={
                    'User-Agent': 'Mozilla/5.0',
                    'Authorization': "Bearer {0}".format(token)
                }).text
        except Exception:
            continue
    # Creates a BeautifulSoup object with the retrieved HTML, then does find to get result set
    listings = BeautifulSoup(
        response,
        'html.parser',
        parse_only=SoupStrainer("script",
                                attrs={'type':
                                       'text/javascript'})).find_all("script")

    if listings:
        product_listings = []
        listings.pop(0)
        for listing in listings:
            try:
                result = listing.contents[0].split('\r\n')
                this_listing = {}
                # the string manipulation of these items assumes standard format where the desired item appears after a colon
                # and is formatted as "<desired item>", html unescape takes care of escape sequences, however since the
                # content is in a string format it leaves behind the leading \\, so this also assumes that no strings will
                # purposefully have a \\ in them, and removes all instances of \\ from strings
                for item in result:
                    if item.find('"set_name":') > 0:
Example #20
def main(OutputFileName="DITCourseList.csv", FileDelimiter=";", GetCoursesFromURL='http://www.dit.ie/catalogue/Programmes/Search', BaseURL='http://www.dit.ie', WebPageLoadDelay=10):
    #
    # Create a file to store the output in (w)rite mode, using the FileDelimiter specified in the function parameters
    MyCSVFile = open(OutputFileName, "wb")    
    CourseList = csv.writer(MyCSVFile, delimiter=FileDelimiter)
    # This strainer is used to only import the table in the search page
    TableStrainer = SoupStrainer("table")
    # This strainer is used to only import the div containing the programme/module details on the individual pages 
    ProgModDetailsStrainer = SoupStrainer("div",id="progmod_detail")
    ProgContentStrainer = SoupStrainer("div", class_="progmod_content")
    URLToParse = GetCoursesFromURL
    #Create a dictionary for the programme tabs
    ProgTabs = []
    ProgTabsContent=""
    ModuleText =''
    # Open the webpage using 
    WebContent = requests.get(URLToParse,timeout=WebPageLoadDelay)
    #Parse the content using soup but only parse the table tags
    DITTable = BeautifulSoup(WebContent.text, "html.parser",parse_only=TableStrainer)
    #print DITTable.prettify(formatter="html")
    CourseList.writerow(['Dept', 'link', 'CourseName','CourseAward', 'CourseCode','CourseLevel', 'CourseDelivery', 'Duration', 'CourseNFQLevel'])
    
    #Get the rows in the table
    rows = DITTable.find_all('tr')
    
    for row in rows:
        data = row.find_all("td")
        # Var = data[index].get_text() returns the Unicode text of the cell i.e the contents wrapped in a unicode string
        CourseTitle = str(data[0].get_text())
        CourseLink = BaseURL + str(data[0].find('a').get('href'))
        CourseCode = data[1].get_text()
        CourseLevel = data[2].get_text()
        CourseAward= data[3].get_text()
        #Replace Level with a blank string, then strip all the extra whitespace from the string leaving just the NQAI number value
        CourseNQAI = replace(str(data[4].get_text()),"Level",'').strip()
        CourseMode = data[5].get_text()
        CourseLength = data[6].get_text()
        CourseSchool = data[7].get_text()
        #print("Writing to file ",CourseSchool,CourseLink,CourseTitle,CourseAward,CourseCode,CourseLevel,CourseMode,CourseLength,CourseNQAI) 
        CourseList.writerow([CourseSchool,CourseLink,CourseTitle,CourseAward,CourseCode,CourseLevel,CourseMode,CourseLength,CourseNQAI])
        #Push the changes from buffer to disk for the csv file so the csv file will always be up to date even if the file hasn't been parsed already
        MyCSVFile.flush()
        FileNameToWrite = CourseCode+".html"
        #If the file doesn't exist already in the current directory then build it
        if not os.path.isfile(FileNameToWrite):
            #Get the text data for the programme
            with requests.Session() as WebSession:
                ProgContent = WebSession.get(CourseLink,timeout=WebPageLoadDelay)
                #Parse the contents of the programme page but strain it so only the relevant details are left
                ProgSoup = BeautifulSoup(ProgContent.text,"html.parser",parse_only=ProgModDetailsStrainer)
                #print(ProgSoup.prettify(formatter="html"))
                print("Processing ",CourseLink, " now...")
                #Open the file where the text will be saved
                MyHTMLFile = codecs.open(FileNameToWrite, "w",encoding='utf-8')
                HeaderText = "<h1>Text for Course "+CourseCode +" "+ CourseTitle +" </h1>"
                MyHTMLFile.write(HeaderText)
                MyHTMLFile.write(CourseLink)
                #If the tab dictionary is empty
                if not ProgTabs:
                    #Get the programme tabs urls 
                    ProgTabs = get_navi_tabs(ProgSoup)
                #Get the separate tabs for this programme
                print(ProgTabs)
                for Tab in ProgTabs:
                    #print(Tab)
                    TabUrl = CourseLink + str(Tab)
                    response = WebSession.get(TabUrl)
                    print("TabURL----",response," for ", TabUrl)
                    print(response)
                    #ProgContentTabs = urllib2.urlopen(TabUrl)
                    print("Processing ", Tab ," for course", CourseTitle)
                    ProgContent = BeautifulSoup(response.text,"html.parser",parse_only=ProgContentStrainer)
                    #Create a header based off the tab value and write it to the file
                    HeaderText = str(Tab).replace("?tab=", '').strip()
                    print("Adding ",HeaderText,"to the file for ",CourseTitle)
                    HeaderText = "<h2>" + HeaderText + "</h2>"
                    MyHTMLFile.write(HeaderText)
                    #If the tab is the Programme Structure tab
                    if "Programme Structure" in TabUrl:
                            print("Getting the module contents for ",CourseTitle, "on ",TabUrl)
                            #ModuleText = ParseModulePages(ProgContent, TabUrl,ProgModDetailsStrainer,BaseURL)
                            #ProgTabsContent ="<div id=" +"moduleContent" +" >" + ModuleText + "</div>"
                            #get the module urls and parse them 
                            for Modulelink in ProgContent.findAll('a'):
                                FullLink = str(BaseURL + Modulelink.get('href'))
                                print("Processing the module url by calling a function..", FullLink)
                                ModuleText = ModuleText + ParseModulePages(FullLink,ProgModDetailsStrainer,ProgContentStrainer,WebPageLoadDelay, BaseURL='http://www.dit.ie')
                                # Now outside the loop write the module text to the file
                                MyHTMLFile.write(ModuleText)
                    else:                        
                            #print(ProgContent.prettify(formatter="html"))
                            ProgTabsContent = ProgContent.prettify(formatter="html") 
                            #Write the tab contents to the file after wrapping them in a div
                            ProgTabsContent = "<div id="+str(CourseCode)+" >" + ProgTabsContent +"</div>"
                            MyHTMLFile.write(ProgTabsContent)
                            MyHTMLFile.close()
                            #Clear the module text and ProgTabsContent before the next iteration of the loop
                            ModuleText =''
                            ProgTabsContent =''
        else:
        # The file by that name already exists (Used to overcome the timeouts for requests after about 250 files were downloaded and lets me build up the documents in batches
            print(FileNameToWrite," already exists so not processing it again")
                    
    # Close the csv file
    print('File', MyCSVFile.name, 'closed')
    MyCSVFile.close()
    #MyHTMLFile.close()    
        
    # Exit successfully
    sys.exit(0)
Example #21
def _scrape_xratescom_exchange_rates(url: str) -> Dict[Asset, Price]:
    """
    Scrapes x-rates.com website for the exchange rates tables

    May raise:
    - RemoteError if we can't query x-rates.com
    """
    log.debug(f'Querying x-rates.com stats: {url}')
    prices = {}
    try:
        response = requests.get(url=url, timeout=DEFAULT_TIMEOUT_TUPLE)
    except requests.exceptions.RequestException as e:
        raise RemoteError(f'x-rates.com request {url} failed due to {str(e)}') from e

    if response.status_code != 200:
        raise RemoteError(
            f'x-rates.com request {url} failed with code: {response.status_code}'
            f' and response: {response.text}',
        )

    soup = BeautifulSoup(
        response.text,
        'html.parser',
        parse_only=SoupStrainer('table', {'class': 'tablesorter ratesTable'}),
    )
    if soup is None:
        raise RemoteError('Could not find <table> while parsing x-rates stats page')
    try:
        tr = soup.table.tbody.tr
    except AttributeError as e:
        raise RemoteError('Could not find first <tr> while parsing x-rates.com page') from e

    while tr is not None:
        secondtd = tr.select('td:nth-of-type(2)')[0]
        try:
            href = secondtd.a['href']
        except (AttributeError, KeyError) as e:
            raise RemoteError('Could not find a href of 2nd td while parsing x-rates.com page') from e  # noqa: E501

        parts = href.split('to=')
        if len(parts) != 2:
            raise RemoteError(f'Could not find to= in {href} while parsing x-rates.com page')

        try:
            to_asset = Asset(parts[1])
            if not to_asset.is_fiat():
                raise ValueError
        except (UnknownAsset, ValueError):
            log.debug(f'Skipping {parts[1]} asset because its not a known fiat asset while parsing x-rates.com page')  # noqa: E501
            tr = tr.find_next_sibling()
            continue

        try:
            price = deserialize_price(secondtd.a.text)
        except DeserializationError as e:
            log.debug(f'Could not parse x-rates.com rate of {to_asset.identifier} due to {str(e)}. Skipping ...')  # noqa: E501
            tr = tr.find_next_sibling()
            continue

        prices[to_asset] = price
        tr = tr.find_next_sibling()

    return prices
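A hedged usage sketch of the function above; the exact x-rates.com URL layout is an assumption, and Asset/Price come from the surrounding project, so this is illustrative only:

# Hypothetical caller; real callers build the URL elsewhere in the project.
usd_prices = _scrape_xratescom_exchange_rates('https://www.x-rates.com/table/?from=USD&amount=1')
for asset, price in usd_prices.items():
    print(f'{asset.identifier}: {price}')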
Example #22
    worker_statement,
    [worker_values[0], worker_values[1], worker_values[2], worker_values[3]])

#Begin parsing Hits

#Find dates to update
cur.execute(
    """SELECT DISTINCT date FROM hitdb WHERE status NOT IN ('Paid','Rejected') AND workerID = %s ORDER BY date;""",
    [worker_ID])
pending_hits_list = cur.fetchall()
pending_date_list = []
pending_link_list = []

pending_status = br.open('https://www.mturk.com/mturk/status')
status_soup = pending_status.read()
status_soup = BeautifulSoup(status_soup, parse_only=SoupStrainer('a'))


def gather_status_links():
    for pending_date in pending_hits_list:
        pending_date = str(pending_date[0])
        hitattr = pending_date.split("-")
        dateswap = hitattr[1] + hitattr[2] + hitattr[0]
        pending_date_list.append(dateswap)

    for pending_link in status_soup:
        if pending_link.has_attr('href'):
            if "statusdetail?encodedDate" in pending_link[
                    'href'] and pending_link['href'].split('=')[-1] > max(
                        pending_date_list):
                pending_link_list.append(pending_link['href'])
Example #23
if search_type not in ('index', 'url', 'fix', 'all'):
    print('Search type must be index, url, fix, or all')
    sys.exit(1)

if search_type in ('index', 'url') and len(sys.argv) < 3:
    print('Input url')
    sys.exit(1)


# -- globals

domain_base = 'https://tvtropes.org'
uri_base = '/pmwiki/pmwiki.php/'

atoz = re.compile('Tropes(.|No)(To.)*$')
strainer = SoupStrainer('div', {'id': 'main-article'})

wanted_groups = (
    "Animation", "Anime", "AudioPlay", "ComicBook", "ComicStrip", "Disney", "Film", "Franchise", "LetsPlay", "LightNovel", "Literature", "Machinima", "Manga", "Manhua", "Manhwa",
    "Music", "Podcast", "Radio", "Series", "Theatre", "VideoGame", "VisualNovel", "WebAnimation", "Webcomic", "WebOriginal", "WebVideo", "WesternAnimation"
)

sleep_delay = 0.5

# --

lower_wanted_groups = tuple([g.lower() for g in wanted_groups])

cp1252 = {
    # from http://www.microsoft.com/typography/unicode/1252.htm
    u"\x80": u"\u20AC",  # EURO SIGN
Example #24
import requests
from bs4 import BeautifulSoup, SoupStrainer
import alfred
import sys

# from common import waitForPeriodInQuery

# get query from user
# query = waitForPeriodInQuery('Search AIAA Aerospace Research Central', 'aiaa.png')
query = sys.argv[1]

# grab search data
params = {'searchText': query, 'pageSize': 10}
r = requests.get('http://arc.aiaa.org/action/doSearch', params=params)

only_table = SoupStrainer('table', 'articleEntry')
articles = BeautifulSoup(r.text, 'html.parser', parse_only=only_table)

# soup = BeautifulSoup(r.text)
# articles = soup.find_all('table', {'class': 'articleEntry'})

results = []

for art in articles:

    # get title
    title = art.find('div', {'class': 'art_title'}).contents[0]

    # get authors
    authorblock = art.find_all('a', {'class': 'entryAuthor'})
    authorString = ''
Example #25
 def get_coin_des(self):
     if self.page_type == 'coin_des':
         self.filter = SoupStrainer("div", class_="artBox")
         self.renew_soup(self.filter)
         self.coin_des = self.soup.text
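renew_soup() is not shown in this excerpt; a hedged sketch of what it presumably does (re-parse the stored page markup with the given strainer), with self.page_html as a hypothetical attribute name:

from bs4 import BeautifulSoup

def renew_soup(self, strainer):
    # Hypothetical helper: re-parse the page HTML kept on the instance so that
    # self.soup only contains the parts matched by the strainer (here the "artBox" div).
    self.soup = BeautifulSoup(self.page_html, 'html.parser', parse_only=strainer)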
Example #26
threads = list()
for i in range(len(obs_list)):
    x = threading.Thread(target=crawlThread, args=(i,))
    threads.append(x)
    x.start()

for i in range(8):
    x = threading.Thread(target=chartThread, args=(i,))
    threads.append(x)
    x.start()

for t in threads:
    t.join()

only_tables = SoupStrainer("table")

for i in range(len(obs_list)):
    soup = BeautifulSoup(source[i], "lxml",parse_only=only_tables)
    #table_div = soup.find(id="content_weather")
    tables = soup.find_all("table")
    wt_table = tables[1]
    trs = wt_table.find_all('tr')
    currTr = trs[row]
    tds = currTr.find_all('td')
    if (i < len(obs_list)-1):
        tmp = tds[col].text
        if (tmp=="\xa0"):
            tmp = "0"
        res.append(tmp)
    else:
Example #27
print('Enter genres of wallpaper you like')
while True:
    genere = input()
    generes[genere] = [1, 1]
    print('add more(yes/no)')
    choice = input()
    if choice != 'yes':
        break
file = open('temp.pickle', 'wb')
pickle.dump(generes, file)
file.close()
file = open('genere_count_data.pickle', 'wb')
print('counting the number of wallpapers in each genre')
for genere in generes:
    url = 'https://wall.alphacoders.com/search.php?search=' + genere
    only_h1_tags = SoupStrainer('h1')
    source_code = urllib.request.urlopen(url)
    source_code = source_code_shortner(source_code, 700, 800)
    soup = BeautifulSoup(source_code, 'html.parser', parse_only=only_h1_tags)
    re_str = str(soup.contents[0])
    result = re.search(' [0-9]* ', re_str)
    genere_wallpic_count[genere] = int(result.group())
    print(str(result.group()) + ' wallpapers found in ' + genere)
pickle.dump(genere_wallpic_count, file)
file.close()
print('Do you want to download initial wallpapers(It might take time)(yes/no)')
answer = input()
if answer == 'yes':
    # Download 10 wallpapers
    print('downloading initial wallpapers')
    for _ in range(10):
Example #28
 def __init__(self, *args, **kwargs):
     super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
                                                             **kwargs)
     from bs4 import SoupStrainer
     self._strainer = SoupStrainer('table')
Example #29
 def __init__(self, page):
     only_main_content = SoupStrainer(id='main-content')
     self.dom = BeautifulSoup(page,
                              'html.parser',
                              parse_only=only_main_content)
Example #30
import requests
from bs4 import BeautifulSoup, SoupStrainer
import utils
import re

game = 'League'
url = 'https://leagueoflegends.fandom.com/wiki/Special:AllPages'
baseurl = 'https://leagueoflegends.fandom.com'

#%% Table of sections of all page list
# Get links to sections of all pages list to comb through for page links

# request page with target data
page = requests.get(url)

# filter the HTML content for the sections of page lists
allpagesStrain = SoupStrainer(class_="allpageslist")
allpagesSoup = BeautifulSoup(page.content, 'html.parser', parse_only=allpagesStrain)

allpagesList = []
# add page directories to a list
for link in allpagesSoup.find_all('a'):
    linkString = link.get('href')    
    allpagesList.append(linkString)
allpagesList = list(dict.fromkeys(allpagesList))

#%% Comb through the sections of the all page list to collect the pages
pageList = []
utils.printProgressBar(0,len(allpagesList),"parsing {} of {}".format(0, len(allpagesList)))
for idx,link in enumerate(allpagesList):    
    utils.printProgressBar(idx,len(allpagesList),"parsing {} of {}".format(idx+1, len(allpagesList)))
	# request page