Example #1
def get_tiki_product_detail(self, data):
    logging.warning("Getting product data...")
    html_doc = data.get('raw_data')
    if html_doc:
        soup = BeautifulSoup(html_doc, 'html.parser')
        categories_container = soup.findChild("ul", {"class": "breadcrumb"})
        categories = []
        for category in categories_container.find_all("a"):
            categories.append(category.text)

        title = soup.findChild("h1", {"id": "product-name"})
        gross_price = soup.findChild("span", {"id": "span-list-price"})
        net_price = soup.findChild("span", {"id": "span-price"})
        try:
            item_info = {
                "url": data.get('url'),
                "title": title.text.replace('\n', ''),
                "gross_price": get_price(gross_price.text) if gross_price else get_price(net_price.text),
                "net_price": get_price(net_price.text),
                "categories": categories,
                "type": "tiki",
                'task_id': self.request.root_id
            }
            elk_logger.info(msg="Saved item " + data.get('url'),
                            extra=item_info)
            return item_info
        except AttributeError as exc:
            print("Error while parsing data, crawl again...")
            celery_app.send_task("crawl_url",
                                 queue='priority.high',
                                 kwargs={
                                     'url': data.get('url'),
                                     'required_class': 'item-name',
                                     'label': 'tiki_crawling_product_detail',
                                 },
                                 countdown=30,
                                 link=get_tiki_product_detail.s(),
                                 expires=datetime.now() + timedelta(days=1))

    else:
        celery_app.send_task("crawl_url",
                             queue='priority.high',
                             kwargs={
                                 'url': data.get('url'),
                                 'required_class': 'item-name',
                                 'label': 'tiki_crawling_product_detail',
                             },
                             countdown=30,
                             link=get_tiki_product_detail.s(),
                             expires=datetime.now() + timedelta(days=1))
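Note: throughout these snippets, findChild and findChildren appear to be the legacy BeautifulSoup method names that bs4 keeps as aliases for find and find_all, so each lookup can be read either way. A minimal sketch, assuming that aliasing:

from bs4 import BeautifulSoup

doc = BeautifulSoup('<h1 id="product-name">Widget</h1>', 'html.parser')
print(doc.findChild("h1", {"id": "product-name"}))  # <h1 id="product-name">Widget</h1>
print(doc.find("h1", {"id": "product-name"}))       # same tag, modern spelling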
Example #2
def get_table_header(soup: BeautifulSoup) -> List[str]:
    """ Table 의 Header 를 반환하는 함수

    Parameters
    ----------
    soup: BeautifulSoup
        Table의 BeautifulSoup Object

    Returns
    -------
    list of str
        Table Header
    """
    thead = soup.findChild('thead')
    if thead:
        thead_row = thead.findAll('tr')
        tags = 'th'
    else:
        thead = soup.findChild('tbody')
        thead_row = thead.findAll('tr')
        first_row = thead_row[0]
        max_row_span = 1
        for col in first_row.findAll('td'):
            row_span = int(col.attrs.get('rowspan', 1))
            if row_span > max_row_span:
                max_row_span = row_span
        thead_row = thead_row[0:max_row_span]
        tags = 'td'

    columns_name = []
    is_empty = []

    for idx, row in enumerate(thead_row):
        for col in row.findAll(tags):
            row_span = int(col.attrs.get('rowspan', 1))
            col_span = int(col.attrs.get('colspan', 1))

            if idx == 0:
                for _ in range(col_span):
                    columns_name.append(col.text)
                    if 1 < row_span:
                        is_empty.append(False)
                    else:
                        is_empty.append(True)
            else:
                start_index = 0
                for jdx, _ in enumerate(is_empty):
                    if is_empty[jdx] is True:
                        start_index = jdx
                        break

                for jdx in range(start_index, start_index + col_span):
                    columns_name[jdx] += col.text
                    is_empty[jdx] = False

    columns_name = [col.replace('\n', ' ') for col in columns_name]
    return columns_name
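A minimal usage sketch for get_table_header, with hypothetical table markup (the rowspan/colspan layout is made up for illustration):

from bs4 import BeautifulSoup

html = """
<table>
  <thead>
    <tr><th rowspan="2">Name</th><th colspan="2">Price</th></tr>
    <tr><th>Gross</th><th>Net</th></tr>
  </thead>
</table>
"""
table = BeautifulSoup(html, "html.parser")
print(get_table_header(table))  # expected: ['Name', 'PriceGross', 'PriceNet']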
Example #3
    def parse(self, response):
        sel = Selector(response)
        profile = {'url': response.url, 'skills': [], 'experience': []}

        # Parse current page URL (public profile URL)

        # Read Skills section
        skills_list = sel.xpath('//a[@class="endorse-item-name-text"]').extract()

        for skill in skills_list:
            skill = self.remove_tag('a', skill)
            profile['skills'].append(skill)

        # List of experience items
        exp_items = []

        # Read Companies and Titles
        exp_entries = sel.xpath('//div[contains(@id, "experience-") and contains(@id, "-view")]').extract()
        for exp_entry in exp_entries:
            b_soup = BeautifulSoup(exp_entry)

            #Get company name
            exp_company_matches = b_soup.findChildren('a', href=re.compile(r'prof-exp-company-name'))
            exp_company = exp_company_matches[len(exp_company_matches) - 1].get_text()\
                if len(exp_company_matches) > 0 else None

            # Get title within company
            exp_title = b_soup.findChild('a', {'name': 'title'}).get_text()

            # Get work description
            exp_desc_match = b_soup.findChild('p', {'class': 'description'})
            exp_desc = exp_desc_match.get_text() if exp_desc_match is not None else None

            # Get work date-locale
            exp_date_loc = b_soup.findChild('span', {'class': 'experience-date-locale'})

            exp_duration_items = exp_date_loc.findChildren('time')
            exp_is_current = 'Present' in exp_duration_items[1].get_text()
            exp_duration = re.sub(r'[^a-zA-Z0-9 ]', '', exp_duration_items[2].get_text()).strip()

            exp_location_item = exp_date_loc.findChild('span', {'class': 'locality'})
            exp_location = None
            if exp_location_item is not None:
                exp_location = re.sub(r'^[^"]*"', '', exp_location_item.get_text())
                exp_location = exp_location.replace("\"", "").strip()

            exp_items.append(ExperienceItem(exp_is_current, exp_title, exp_company,
                                            exp_location, exp_duration, exp_desc))

        profile['experience'] = exp_items

        # Sleep to appease LinkedIn rate limiting
        time.sleep(5)

        self.profile_map[response.url] = profile
        return LinkedInItem(profile)
Example #4
def get_shopee_product_detail(self, data):
    logging.warning("Getting product data...")
    html_doc = data.get('raw_data')
    if html_doc:
        soup = BeautifulSoup(html_doc, 'html.parser')
        categories_html = soup.findAll("a", {"class": "JFOy4z _20XOUy"})
        categories = []
        for category_html in categories_html:
            categories.append(category_html.text)
        title = soup.findChild("span", {"class": "OSgLcw"})
        gross_price = soup.findChild("div", {"class": "_3_ISdg"})
        net_price = soup.findChild("div", {"class": "_3n5NQx"})
        try:
            item_info = {
                "url": data.get('url'),
                "title": title.text.replace('\n', ''),
                "gross_price": get_price(gross_price.text) if gross_price else get_price(net_price.text),
                "net_price": get_price(net_price.text),
                "categories": categories,
                "type": "shopee",
                'task_id': self.request.root_id
            }
            elk_logger.info(msg="Saved item " + data.get('url'), extra=item_info)
            return item_info
        except AttributeError as exc:
            print("Error while parsing data, crawl again...")
            celery_app.send_task(
                "crawl_url",
                queue='priority.high',
                kwargs={
                    'url': data.get('url'),
                    'required_class': '_3n5NQx',
                    'label': 'crawling_product_detail',
                },
                countdown=30,
                link=get_shopee_product_detail.s(),
                expires=datetime.now() + timedelta(days=1)
            )
    else:
        celery_app.send_task(
            "crawl_url",
            queue='priority.high',
            kwargs={
                'url': data.get('url'),
                'required_class': '_3n5NQx',
                'label': 'crawling_product_detail',
            },
            countdown=30,
            link=get_shopee_product_detail.s(),
            expires=datetime.now() + timedelta(days=1)
        )
Example #5
 def get_title_and_problem_list(self):
     url = 'http://acm.hdu.edu.cn/contests/contest_show.php?cid=%d' % \
           self.contest_id
     r = self.session.get(url)
     soup = BeautifulSoup(r.text, 'html.parser')
     title = soup.findChild('h1').text
     if '- Team' in title:  # For unification, replace "- Team X" with "X"
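         # e.g. "ACM Contest 2024 - Team 7" -> "ACM Contest 2024 7" (hypothetical title)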
         word = title.split(' ')
         title = ' '.join(word[:-3]) + ' ' + word[-1]
     print(title)
     table = soup.findChild('table')
     name_list = []
     for row in table.findChildren('tr'):
         name_list.append(row.findChildren('td')[-2].text)
     return title, name_list[1:]
Example #6
	def getpdfByID(self,rgid):
		r=requests.get("https://www.researchgate.net/publication/"+str(rgid),headers=browserhdr,timeout=timeout_setting)
		if (r.status_code == 200):
			soup=BeautifulSoup(r.text, "html.parser")
			out=soup.findChild(name="a",attrs={"class":"blue-link js-download rf btn btn-promote"})
			link=''
			if (out):
				link=out['href']
			out=soup.findChild(attrs={'name':"citation_doi"})
			doi=""
			if (out):
				doi=out['content']
			filename=quotefileDOI(doi.lower().strip())
			return self.getpdfByLink(link,filename)
		return False
Example #7
def get_dic_url():
    '''Get the business-area pinyin slugs under the Shanghai listing that have fewer than 100 pages (used to build request URLs)'''
    req_url = raw_url.format(busi_area="")
    req = CheatRequests([[req_url]])
    content = req.get_cheat_first_content[0].decode("utf-8")
    bs = BeautifulSoup(content, "lxml")
    dic_list = bs.findChild("div", {
        "class": "filter-box"
    }).findChild("div", {
        "id": "filter-options"
    }).findChild("dl", {
        "class": "dl-lst clear"
    }).findChild("dd").findChild("div", {
        "class": "option-list"
    }).findChildren("a")
    dic_list = [
        re.findall("href=\"/zufang/(.+)/\"", str(dic))[0]
        for dic in dic_list[1:]
    ]

    # Make sure each area contains fewer than 100 result pages
    busi_list_result = []
    dic_list_result = dic_list[:]
    for dic in dic_list:
        pages = get_pages(dic)
        if pages > 100:
            # Remove entries with more than 100 pages
            dic_list_result.remove(dic)
            req_url = raw_url.format(busi_area=dic)
            req = CheatRequests([[req_url]])
            content = req.get_cheat_first_content[0].decode("utf-8")
            bs = BeautifulSoup(content, "lxml")
            busi_list = bs.findChild("div", {
                "class": "filter-box"
            }).findChild("div", {
                "id": "filter-options"
            }).findChild("dl", {
                "class": "dl-lst clear"
            }).findChild("dd").findChild("div", {
                "class": "option-list sub-option-list"
            }).findChildren("a")
            busi_list = [
                re.findall("href=\"/zufang/(.+)/\"", str(busi))[0]
                for busi in busi_list[1:]
            ]
            busi_list_result += busi_list
    dic_list_result += busi_list_result
    return dic_list_result
Example #8
def get_highlight(search, index=0, list_index=False):
    response = json.loads(get(highlights_url.format(search=search)).text)
    urls = []
    title_index = {}
    idx = 1
    for doc in response['docs']:
        title = doc['title']
        urls.append((title, doc['url']))
        title_index[idx] = title
        idx = idx + 1
    if list_index:
        if title_index:
            body = ""
            for i, title in title_index.items():
                body += f"{i} - {title}" + "\n"
            return discord.Embed(title="Highlight Index", description=body)
        else:
            raise NoResultsError(f'No results for {search}')
    else:
        try:
            title = '**' + urls[index][0] + '**'
            video_url = urls[index][1]
        except IndexError:
            raise NoResultsError(f'No results for {search}')
        soup = BeautifulSoup(get(video_url).text, 'html.parser')
        video = soup.findChild(lambda tag: tag.name == 'meta' and tag.get(
            'itemprop') == 'contentURL' and tag.get('content').endswith('.mp4')
                               ).get('content')
        if video:
            return title, video
        else:
            raise NoResultsError(f'Error parsing video url for {search}')
Example #9
def retrieveData(api_url):
    try:
        response = requests.get(api_url)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
        exit(1)

    #the whole html of website
    html = response.content

    # print(html)

    #initialize bsoup html parser
    soup = BeautifulSoup(html, 'html.parser')

    containerhtml = soup.findChild('div', class_='mainbody').findChild('div', class_='container')
    print(containerhtml)

    signupValidStr = containerhtml.find('h1').get_text()
    # print(signupValidStr)

    slotAvailability = containerhtml.find('h1').get_text()
    # print(slotAvailability)

    #if the signupgenius website no longer exists
    if "The Sign Up Was Not Found" in signupValidStr:
        return False

    #if the website has available spots for covid vaccine sign up
    if "NO SLOTS AVAILABLE. SIGN UP IS FULL." in slotAvailability:
        return False

    return True
Example #10
    def download_icon(self):
        response = requests.get(self.url_for())
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        icon_url = soup.findChild("div", attrs={"class": "avatar-wrapper"}).findChild("img").get("src")

        return self._client.downloader.download(icon_url)
Example #11
def scrap_lemonde(recherche):
    url = "https://www.lemonde.fr/recherche/?search_keywords={}".format(recherche)
    lemonde = requests.get(url)
    html_lemonde = BeautifulSoup(lemonde.text,'html.parser')
    contenu_recherche = html_lemonde.findChild("section",attrs={"class":"js-river-search"})
    #print(dir(contenu_recherche))
    list_titles = list()
    list_des = list()
    for section in contenu_recherche.children:

        if section.name == "section":
            if section.a:
                print("----------------------")
                title = section.a.h3.get_text()
                description = section.a.p.get_text()
                list_titles.append(title)
                list_des.append(description)
                # print(title)
                # print("***")
                # print(description)

    streamlit.sidebar.header("Scrapy le monde")
    streamlit.sidebar.info("Exos pour extraction d'information concernant le coronavirus")
    streamlit.error("errorrrrr")
    streamlit.title("Exo scrapy le monde coronavirus title et description")
Example #12
def open_mobility_file():
    TEMPORAL_FILE = '/tmp/movement-data.csv'
    MOVEMENT_BASE_URL = 'https://data.humdata.org'
    MOVEMENT_URL = MOVEMENT_BASE_URL + '/dataset/movement-range-maps'

    if not os.path.exists(TEMPORAL_FILE):
        cdata = requests.get(MOVEMENT_URL)
        cdata = BeautifulSoup(cdata.text)

        links = cdata.findChild('div', {
            'id': 'data-resources-0'
        }).find_all('a')
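        # Pick the first resource link whose href points at a .zip archive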
        data_link = next(
            _ for _ in links
            if 'href' in _.attrs and _.attrs['href'].endswith('zip'))
        data_link = data_link.attrs['href']
        data_link = MOVEMENT_BASE_URL + data_link if data_link.startswith(
            '/') else data_link

        data_container = requests.get(data_link, stream=True)
        data_container = ZipFile(io.BytesIO(data_container.content))

        data_file = next(_ for _ in data_container.filelist
                         if _.filename.startswith('movement-range'))
        data_file = data_container.open(data_file.filename)

        with open(TEMPORAL_FILE, 'wb') as disk_file:
            shutil.copyfileobj(data_file, disk_file)

        data_file.close()

    return open(TEMPORAL_FILE)
Example #13
def find_data_login(session, url, user, password):
    form = {'action': '', 'data':{}}
    data = form.get('data')
    req = session.get(url)
    soup = BeautifulSoup(req.content,'html.parser')
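    # Only treat the page as a login form if it contains a password input; otherwise the empty form skeleton is returned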
    if soup.findChild('input',{'type':'password'}):
        form_action = soup.find('form')
        action = form_action.get('action', '')
        method = form_action.get('method', '')
        form['action'] = action
        form['method'] = method

        for input_tag in soup.find_all('input'):
            if input_tag.has_attr('name'):
                name = str(input_tag.get('name'))
                input_type = (input_tag.get('type') or '').lower()
                if input_type == 'password':
                    data[name] = password
                elif input_type == 'text' and input_tag.get('value') is None:
                    data[name] = user
                else:
                    # 'submit' and any other input type: keep the pre-filled value
                    data[name] = str(input_tag.get('value'))
    return form
Example #14
 def _parse_request_to_movie(request):
     soup = BeautifulSoup(request.text, "html.parser")
     title = soup.findChild("h1", {"class": "filmCoverSection__title"}).text
     description_pl = soup.find("div", {
         "class": "filmPosterSection__plot"
     }).text
     premiere_year = soup.find("span", {
         "class": "filmCoverSection__year"
     }).text
     movie_time = soup.find("span", {
         "class": "filmCoverSection__filmTime"
     }).text
     movie_rating_value = soup.find("span", {"class": "filmRating__rateValue"}).text
     movie_rating_count = soup.find("span", {
         "class": "filmRating__count"
     }).text
     movie_details = {
         "title": title,
         "description_pl": description_pl,
         "premiere_year": premiere_year,
         "movie_time": movie_time,
         "movie_rating_value": movie_rating_value,
         "movie_rating_count": movie_rating_count
     }
     movie = Movie.parse(movie_details)
     return movie
Example #15
def get_current_AMI_info():
    url = "https://aws.amazon.com/amazon-linux-ami/"
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page)
    tables = soup.findChild('table')
    rows = tables.findChildren(['th', 'tr'])
    return rows
Example #16
def searchForClothesStockX(query):
    logger = logging.getLogger('mylogger')

    url = "https://stockx.com/search?s=" + query

    driver = webdriver.Firefox(
        executable_path='/mnt/c/GeckoDriver/geckodriver.exe')
    results = []
    try:
        driver.get(url)

        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="tile browse-tile"]')))
        scrollDownWindow(driver)

        #start extracting listings
        soup = BeautifulSoup(driver.page_source)
        soup = soup.find("div", {"class": "browse-grid"})
        listings = soup.findChild().contents

        for listing in listings:
            listingToAdd = extractInfo(listing)
            if listingToAdd:
                results.append(listingToAdd)
        return results
    except TimeoutException:
        raise ParseError
Example #17
def get_blurb(first, last, sport, player_url=None):
    # for some weird reason its actually better to omit the first name in the search form
    response = get(player_url if player_url else blurb_search_url.format(first="", last=last, sport=sport))
    soup = BeautifulSoup(response.text, 'html.parser')
    # did we land a result page?
    if not soup.findChild('div', class_='RW_pn'):
        name_map = {}
        results_table = soup.find('table', attrs={'id':'cp1_tblSearchResults'})
        # filter results, omitting duplicate "position" links that don't include the player's name
        filtered_results = results_table.findChildren(lambda tag: tag.name == 'a' and 'player' in tag['href'] and len(tag.text) > 3)
        if not filtered_results:
            raise NoResultsError("No results for %s %s" % (first, last))
        else:
            for result in filtered_results:
                name = " ".join(result.text.split())
                name_map[result] = SequenceMatcher(None, first + " " + last, name).ratio()
        # sort names by similarity to search criteria
        sorted_names = sorted(name_map, key=name_map.get, reverse=True)
        return get_blurb(first, last, sport, player_url='http://www.rotoworld.com' + sorted_names[0].get('href'))
    else:
        news = soup.findChildren('div', class_='playernews')
        if news:
            recent_news = news[0]
            report = recent_news.find('div', class_='report')
            impact = recent_news.find('div', class_='impact')
            blurb = report.text + '\n\n' + impact.text
            return blurb
        else:
            raise NoResultsError("No recent player news for %s %s" % (first, last))
Example #18
def get_tiki_products_url(self, data):
    # Return list product url from result page crawled
    html_doc = data.get('raw_data')
    soup = BeautifulSoup(html_doc, 'html.parser')
    items_container = soup.findChild("div", {"class": "product-box-list"})
    time.sleep(5)
    item_urls = items_container.find_all('a')[:10]
    urls = []
    for item_url in item_urls:
        if re.match(r"^https?:\/\/(w{3}\.)?tiki.vn\/.*?$",
                    item_url.get('href')):
            urls.append(item_url.get('href'))
    logging.warning("end loop...")

    from celery import chord
    chord(
        craw_tiki_url.subtask(queue='priority.high',
                              kwargs={
                                  'url': url,
                                  'required_class': 'item-name',
                                  'label': 'tiki_crawling_product_detail',
                              },
                              countdown=30,
                              link=get_tiki_product_detail.s(),
                              expires=datetime.now() + timedelta(days=1))
        for url in urls)(on_finish_crawl_tiki.s())

    response = {'search_url': data.get('url'), 'prods_urls': urls}
    elk_logger.info(response)

    return response
Example #19
def get_postion(position_id, postion_dir):
    # Build the request data
    url = "https://www.lagou.com/jobs/%s.html" % (position_id)
    headers = {
        'Cookie':
        'TG-TRACK-CODE=index_search; SEARCH_ID=e8222f3471a44abf85093f79d876f759; JSESSIONID=ABAAABAABEEAAJA080B57268659EBB1C73E65E8835B1D1D; WEBTJ-ID=20181111160721-16701cf96c8949-0fc6f60feec025-48183706-1024000-16701cf96c94d2; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541944837; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541923641,1541944111; LGRID=20181111215853-ec514a2a-e5b9-11e8-9aa0-525400f775ce; LGSID=20181111214647-3bdcc9f7-e5b8-11e8-9a9f-525400f775ce; _ga=GA1.2.1318630155.1541923641; _gat=1; _gid=GA1.2.1216768844.1541923642; PRE_HOST=www.baidu.com; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xf57dfecd000124aa%26issp%3D1%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26rsv_enter%3D0%26oq%3Dpython%252520%2525E6%252595%2525B0%2525E6%25258D%2525AE%2525E6%25258C%252596%2525E6%25258E%252598%26rsv_t%3De111J%252FhqMM1XxboP3SPnY%252Fw6ah3WItaAjhCUX2DgoGHa814Syn2DSmf%252F0Kh31gQZTnH%252B%26inputT%3D6259%26rsv_pq%3D918956f400061b60%26rsv_sug3%3D280%26rsv_sug1%3D254%26rsv_sug7%3D100%26bs%3Dpython%2520%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598; PRE_UTM=m_cf_cpt_baidu_pc; index_location_city=%E5%85%A8%E5%9B%BD; LGUID=20181111160538-9328c957-e588-11e8-9a22-525400f775ce; user_trace_token=20181111160538-9328c039-e588-11e8-9a22-525400f775ce',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Host': 'www.lagou.com',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
        'Accept-Language': 'zh-cn',
        'Referer': 'https://www.lagou.com/',
        'Connection': 'keep-alive'
    }
    session = requests.Session()
    session.headers = headers
    response = session.get(url)
    doc = response.content.decode()
    soup = BeautifulSoup(doc, 'html.parser')
    # soup = BeautifulSoup(response.content,'html.parser',from_encoding = 'utf-8')
    try:
        job_desc = soup.findChild(name='dd', class_='job_bt')  # the job description block
        rr = job_desc.find("div").find_all("p")
        if not os.path.exists(postion_dir + "/"):
            # print("不存在")
            os.mkdir(postion_dir + "/")
        with open(postion_dir + "/%s.txt" % (position_id), 'w') as fd:
            for p in rr:
                # print(p)
                fd.write(p.text + "\n")
    except:
        pass
Example #20
    def test_only_the_custom_region_is_created(self):
        caption_set = DFXPReader().read(
            SAMPLE_DFXP_TO_RENDER_WITH_ONLY_DEFAULT_POSITIONING_INPUT)

        new_region = Layout(
            alignment=Alignment(
                HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP
            )
        )

        dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)
        # Using a different parser, because this preserves letter case
        # The output file is ok, but when parsing it, the "regular" parser
        # loses letter case.
        layout = BeautifulSoup(dfxp, features='xml').findChild('layout')

        self.assertEqual(len(layout.findChildren('region')), 1)

        region = layout.findChild('region')
        text_align = region['tts:textAlign']
        display_align = region['tts:displayAlign']

        internal_alignment = _create_internal_alignment(text_align, display_align)  # noqa
        self.assertEqual(internal_alignment.horizontal, HorizontalAlignmentEnum.LEFT)  # noqa
        self.assertEqual(internal_alignment.vertical, VerticalAlignmentEnum.TOP)  # noqa
Example #21
def retrieveData(api_url):
    try:
        response = requests.get(api_url)
        #print("--- response headers: ",response.headers)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
        exit(1)

    html = response.content
    # print("--- html: ", html)

    # parsing html with BS
    soup = BeautifulSoup(html, 'html.parser')

    elements = soup.findChild(
        "ol", id="accordion-rankings-184224").findChildren("li")
    datalist = []
    for i in range(0, len(elements)):
        row = []
        univ_name = elements[i].find('span').getText()
        # print (univ_name)
        row.append(univ_name)
        location = elements[i].find('p').getText()
        row.append(location)
        row.append(i+1)  # append ranking, which is the iteration
        desc = elements[i].findChild(
            "div", class_="inner-content").find('p').getText()
        row.append(desc)
        url = elements[i].findChild('a')["href"]
        row.append(url)  # url of university
        row.append(api_url)  # url of the site
        # print(row)
        datalist.append(row)
    return datalist    
Example #22
    def __init__(
        self,
        video_game: str,
        event: str = "",
        player1: str = "",
        player2: str = "",
        character1: str = "",
        character2: str = "",
        caster1: str = "",
        caster2: str = "",
        num_workers: int = 10,
        num_page_workers: int = 2,
        verbose: bool = False,
    ):
        self.base_url = self.URL(video_game, event, player1, player2,
                                 character1, character2, caster1, caster2)

        self.num_workers = num_workers
        self.num_page_workers = min(num_page_workers, self.num_workers)
        self.session = FuturesSession(max_workers=self.num_workers)

        page_content = self.request(str(self.base_url)).result().content
        page_soup = BeautifulSoup(page_content, "lxml")

        self.num_pages = 1
        last_page_tag = page_soup.findChild("a", title="Go to last page")
        if last_page_tag:
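            # e.g. a "Go to last page" link whose href ends in "?page=12" gives num_pages = 12 (hypothetical href format)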
            self.num_pages = int(
                re.search(r"page=([\d]+)", last_page_tag["href"]).group(1))

        self.verbose = verbose
Example #23
    def scrape_vod_page(
            self, vod_id: str,
            vod_request: Future) -> Tuple[List[str], List[Vod.Caster]]:
        vod_content = vod_request.result().content
        vod_strainer = SoupStrainer("div", class_="region-inner clearfix")
        vod_soup = BeautifulSoup(vod_content, "lxml", parse_only=vod_strainer)
        content = vod_soup.findChild(recursive=False)

        try:
            video_ids = [
                re.search(r"^([^?]*)", v["data-vod"]).group(1)
                for v in content.findChildren(
                    "div", class_="js-video widescreen", recursive=False)
            ]
            if len(video_ids) == 0:
                raise InvalidVideoError(vod_id)

            casters = []
            casters_tag = content.findChild("div", class_="field-items")
            if casters_tag:
                casters = [
                    Vod.Caster(c.getText())
                    for c in casters_tag.findChildren(recursive=False)
                ]
            return (video_ids, casters)
        except KeyError:
            raise InvalidVideoError(vod_id)
Example #24
    def test_only_the_custom_region_is_created(self):
        caption_set = DFXPReader().read(
            SAMPLE_DFXP_TO_RENDER_WITH_ONLY_DEFAULT_POSITIONING_INPUT)

        new_region = Layout(alignment=Alignment(HorizontalAlignmentEnum.LEFT,
                                                VerticalAlignmentEnum.TOP))

        dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)
        # Using a different parser, because this preserves letter case
        # The output file is ok, but when parsing it, the "regular" parser
        # loses letter case.
        layout = BeautifulSoup(dfxp, features='xml').findChild('layout')

        self.assertEqual(len(layout.findChildren('region')), 1)

        region = layout.findChild('region')
        text_align = region['tts:textAlign']
        display_align = region['tts:displayAlign']

        internal_alignment = _create_internal_alignment(
            text_align, display_align)  # noqa
        self.assertEqual(internal_alignment.horizontal,
                         HorizontalAlignmentEnum.LEFT)  # noqa
        self.assertEqual(internal_alignment.vertical,
                         VerticalAlignmentEnum.TOP)  # noqa
Example #25
def main():
    # BOROUGH OF JAMESBURG
    def ToGetItIn_manner(datasets):
        for data in datasets:
            p = str(data)
            p = p.splitlines()
            p = filter(None, p)
            st = str(p)
            st1 = st.strip('[')
            st2 = st1.strip(']')
            st3 = st2.strip("'")
            st4 = str(st3.replace("'", ""))
            print " ".join(st4.split())

    datasets = []
    url = raw_input("enter the url")
    # url = "http://www.southrivernj.org/elected_officials.html"
    opener = urllib2.build_opener()
    print "Built opener"
    opener.addheaders = [('user_agent', 'Chrome/41.0.2243.0')]
    print "added headers"
    response = opener.open(url)
    print "Took Response"
    soup = BeautifulSoup(response)
    BoroughOfJamesburg = soup.findChild("div", {'id': 'content'})
    for row in BoroughOfJamesburg.find_all(['p']):
        # print row.get_text().encode('utf-8')
        dataset = row.get_text().encode('utf-8')
        datasets.append(dataset)
    for row in BoroughOfJamesburg.find_all('div'):
        dataset = row.get_text().encode('utf-8')
        datasets.append(dataset)

    # print datasets
    ToGetItIn_manner(datasets)
Example #26
def load_annotation(xmlFile):
    """
    Read annotations from for a image from xml file and return a dictionary of
    the objects and their locations
    """
    with open(xmlFile) as f:
        xml = f.readlines()
    xml = ''.join([line.strip('\t') for line in xml])
    anno = BeautifulSoup(xml, "html5lib")
    anno_dic = {}
    fname = anno.findChild('filename').contents[0]
    anno_dic['filename'] = fname
    objs = anno.findAll('object')
    # print('Number of objects:', len(objs))
    objects = []
    for obj in objs:
        obj_name = obj.findChild('name').contents[0]
        bbox = obj.findChildren('bndbox')[0]
        xmin = int(bbox.findChildren('xmin')[0].contents[0])
        ymin = int(bbox.findChildren('ymin')[0].contents[0])
        xmax = int(bbox.findChildren('xmax')[0].contents[0])
        ymax = int(bbox.findChildren('ymax')[0].contents[0])
        obj_dic = {'object_name': obj_name,
                   'location': np.array([xmin, ymin, xmax, ymax])}
        objects.append(obj_dic)
    anno_dic['annotation'] = objects
    return anno_dic
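A minimal usage sketch for load_annotation with a hypothetical Pascal-VOC-style annotation file (the path and tag values are made up; numpy and html5lib are assumed available, as the function itself requires):

xml_snippet = """<annotation>
  <filename>dog.jpg</filename>
  <object>
    <name>dog</name>
    <bndbox><xmin>48</xmin><ymin>240</ymin><xmax>195</xmax><ymax>371</ymax></bndbox>
  </object>
</annotation>"""

with open("/tmp/dog.xml", "w") as f:
    f.write(xml_snippet)

anno = load_annotation("/tmp/dog.xml")
print(anno["filename"])                   # dog.jpg
print(anno["annotation"][0]["location"])  # [ 48 240 195 371]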
Example #27
    def get_page_links(self, date: str) -> List[str]:
        """
        Page Links method
        =================
        function to get the page link for date

        Arguments
        ---------
        date: str
            date such as "%Y-%m-%d"("2021-05-03")

        Returns
        -------
        page_links: list of str
            ex. ['url1', 'url2', ...]
        """
        req = requests.get(self.main_url + date)  # HTTP GET Request
        soup = BeautifulSoup(req.text, "html.parser")

        page_links = soup.findChild("table", {"class": "Nnavi"})
        page_links = page_links.find_all("a")
        page_links = [link["href"] for link in page_links]
        page_links = list(set(page_links))
        page_links.sort()

        return page_links
Example #28
class BingPage:
    def __init__(self, page):
        self.soup = BeautifulSoup(page, 'html.parser')

    def next_url(self):
        url = self.soup.findChild(class_='sb_pagN')
        return url.get('href') if url else None

    def get_results(self):
        results = self.soup.find_all(class_='b_algo')
        infos = []
        for result in results:
            infos.append(self._parse_result(result))
        return infos

    def _parse_result(self, result):
        info = {'title': '', 'abstract': '', 'link': ''}
        info['title'] = self._text(result.h2)
        caption = result.findChild(class_='b_caption')
        if caption:
            info['abstract'] = self._text(caption.findChild('p'))
            info['link'] = self._text(caption.cite)
        return info

    def _text(self, node):
        return node.getText() if node else ''
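A minimal usage sketch for BingPage, run against hypothetical markup that mimics the class names the parser expects (b_algo, b_caption, sb_pagN):

html = '''
<div class="b_algo">
  <h2>Example result</h2>
  <div class="b_caption"><cite>example.com</cite><p>An abstract.</p></div>
</div>
<a class="sb_pagN" href="/search?q=test&amp;first=11">Next</a>
'''
page = BingPage(html)
print(page.get_results())  # [{'title': 'Example result', 'abstract': 'An abstract.', 'link': 'example.com'}]
print(page.next_url())     # /search?q=test&first=11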
Example #29
def get_seats(ticket_href):
    seats_total = 0
    seats_available = 0

    try:
        req = requests.post(PATHE_URL +
                            ticket_href)  # Get special URL redirection
        req = requests.get(req.url + "/stoelen")
    except Exception:
        return seats_available, seats_total  # Some error occurred during web request.

    if ticket_href in req.url:
        return seats_available, seats_total  # Most likely you can't buy tickets anymore because the movie started.
    if req.status_code == 404:
        return seats_available, seats_total  # Most likely you can't buy tickets anymore because the movie started.
    if req.status_code == 500:
        return seats_available, seats_total  # Most likely there are no seats for this movie. (Known possibility: Drive-In Cinema)

    soup = BeautifulSoup(req.text, "html.parser")
    try:
        seats = soup.findChild("ul", {"id": "seats"}).findChildren("li")
    except AttributeError:  # Unknown error..
        print("\nError at:")
        print(req.status_code)
        print(req.url)
        print("")
        return seats_available, seats_total

    for seat in seats:
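        # Seats rendered without a class attribute appear to be the free ones:
        # seat["class"] raises KeyError for them, so they are counted as available.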
        seats_total += 1
        try:
            seat["class"]
        except KeyError:
            seats_available += 1
    return seats_available, seats_total
Example #30
    def stream(self, records):
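        # For each record, parse the configured text field and apply whichever
        # lookups are configured (find / find_all / findChild / findChildren),
        # then attach either the extracted text or the resulting soup to the record.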
        for record in records:
            soup = BeautifulSoup(record[self.textfield], self.parser)
            if self.find:
                if self.find_attrs is not None:
                    soup = soup.find(
                        self.find, 
                        literal_eval('{'+self.find_attrs+'}')
                    )
                else:
                    soup = soup.find(self.find)
            if self.find_all:
                if self.find_all_attrs is not None:
                    soup = soup.find_all(
                        self.find_all, 
                        literal_eval('{'+self.find_all_attrs+'}')
                    )
                else:
                    soup = soup.find_all(self.find_all)
            if self.find_child:
                if self.find_child_attrs is not None:
                    soup = soup.findChild(
                        self.find_child, 
                        literal_eval('{'+self.find_child_attrs+'}')
                    )
                else:
                    soup = soup.findChild(self.find_child)
            if self.find_children:
                if self.find_children_attrs is not None:
                    soup = soup.findChildren(
                        self.find_children, 
                        literal_eval('{'+self.find_children_attrs+'}')
                    )
                else:
                    soup = soup.findChildren(self.find_children)
            if self.get_text and not (self.find_all or self.find_children):
                record[self.get_text_label] = \
                    soup.get_text().decode('unicode_escape').encode('ascii','ignore')
            elif self.get_text and (self.find_all or self.find_children):
                record[self.get_text_label] = [
                    i.get_text().decode('unicode_escape').encode('ascii','ignore')
                    for i in soup
                ]
            else:
                record['soup'] = soup

            yield record
Example #31
    def get_metadata(self, url_list: List[str]) -> List[Dict]:
        """
        Metadata method
        ================
        function to get meta data of news

        Arguments
        ---------
        url_list: list of str
            url to crawl metadata

        Returns
        -------
        metadata: list of dict
            [{'press': press(str),
            'date': publish date(str),
            'time': publish time(str),
            'title': news title(str),
            'link': news url(str)},...]
        """
        metadata = []
        for url in url_list:
            # bs4
            req = requests.get(self.root_url + url)  # HTTP GET Request
            soup = BeautifulSoup(req.text, "html.parser")
            # Press (news outlet)
            press_list = soup.find_all("span", {"class": "press"})
            press_list = [press.text.strip() for press in press_list]

            # Publication date
            p_datetime_list = soup.find_all("span", {"class": "wdate"})
            p_datetime_list = [
                p_datetime.text for p_datetime in p_datetime_list
            ]

            # Title and link
            news_links = soup.findChild("div", {"class": "mainNewsList"})
            news_links = news_links.find_all("a")
            news_titles = [link.text for link in news_links if link.text]
            news_links = [link["href"] for link in news_links]
            news_urls = []
            for news_link in news_links:
                if news_link not in news_urls:
                    news_urls.append(news_link)

            for press, p_datetime, title, link in zip(press_list,
                                                      p_datetime_list,
                                                      news_titles, news_urls):
                p_date, p_time = p_datetime.split()
                meta_dict = {
                    "press": press,
                    "date": p_date,
                    "time": p_time,
                    "title": title,
                    "link": f"{self.root_url}{link}",
                }
                metadata.append(meta_dict)

        return metadata
Example #32
def gz_thread(url, expected_price):
    req = gen_req(url)
    r = urllib2.urlopen(req).read()
    soup = BeautifulSoup(r, "html.parser")
    # print soup.prettify()
    div = soup.findChild('div', class_='t_fsz')
    content = div.findChild('font').string
    return content
Example #33
def add_to_type_set(xml_obj: BeautifulSoup):
    """expects bf object"""
    tag_name = xml_obj.findChild().name
    global xml_types
    if tag_name not in xml_types:
        xml_types[tag_name] = xml_obj
    else:
        merge_xml(xml_obj, xml_types[tag_name])
Example #34
 def load_bundles(self):
     responseText = self.session.get(
         'https://itch.io/my-purchases/bundles').text
     soup = BeautifulSoup(responseText, 'html.parser')
     self.bundles = {}
     for bundle in soup.findChild('section', attrs={
             'class': 'bundle_keys'
     }).findChildren('a'):
         self.bundles[
             bundle.getText()] = 'https://itch.io' + bundle.get('href')
Example #35
def get_paragraph(filename):
    b = BeautifulSoup(open(filename))
    contents = b.findChild(attrs={"name": "navercast_div"})

    def x(item):
        return isinstance(item, Tag)
        # filter string

    contents = filter(x, contents)
    return contents
Example #36
  def parse_recipe(self, response):      
      #Retrieves all relevant data from ruled.me
      got_data = requests.get(response.url)
      html=got_data.content
      soup = BeautifulSoup(html, 'lxml')

      recipe = RecipeItem()
      #Hard-coded for now. consider parsing the url to get lunch       

      recipe['time'] = "snacks"
      
      #get dataframe from html table.
      table = pd.read_html(html, header=0, index_col=0, flavor="bs4", encoding="utf-8")
      dataframe = []
      for tab in table:
        dataframe.append(tab)
 
      #get metadata on recipe
      
      recipe['rawTable'] = dataframe

      meta = soup.find('div', attrs={'class': 'entry-content'})
      meta_titleParent = soup.find('div', attrs={'class': 'articleTitle'})
      meta_title = meta_titleParent.find('h1').text
      recipe['title'] = meta_title

      imageParent = soup.findChild('div', attrs={'class': 'postImage_f'})
      image = imageParent.find('img')['src']
      recipe['image_urls'] = [image]

      date_parent = soup.find('div', attrs={'class': 'articleData'})
      date_dirty = date_parent.text
      date_final = date_dirty.split('on ')[1]
      recipe['date'] = date_final
      recipe['ingredients'] = []

      yield recipe
Example #37
def parsePage(c, page, idx):
    """
    Parses a single page of the operone dictionary:
    Gets the page source:
    fixes problematic html tags(not closed or wrongly placed mostly)
    for every entry on the page:
        runs parseExceptions on the line
        takes the different parts of the line.
        simplifies and copys the transformed versions.
        writes the entry to the database
    """
    page = urllib.request.urlopen(urllib.request.Request(operoneBaseUrl + page.get('href'))).read()
    page = page.decode('ISO-8859-1') # encoding of the operone pages
    page = html.unescape(page)
    #page = str(page) # cast to string (from stream)
    #page = html.unescape(page) # unescape greek letters
    page = fixBadHtml(page)

    # this automatically adds closing span tags at the end of a line if none are present
    pageSoup = BeautifulSoup(page, 'html.parser')
    lis = pageSoup.find_all('li')

    for element in lis:
        #print(element)
        # ok, we are getting additional translations or variations that belong to the word
        # for now we will be ignoring them by using only the first child.

        correctedLine = parseExceptions(str(element))
        element = BeautifulSoup(correctedLine, 'html.parser')

        # get_text can strip whitespaces but since we need the comma stripped as well
        # it makes more sense to put both into one context.
        # vocab is the raw string of the entry
        vocab = element.findChild().findChild().get_text().strip(', ')

        # apply fix dict for vocab part
        for entry in VOCAB_FIX_DICT:
            vocab = vocab.replace(entry, VOCAB_FIX_DICT[entry])

        # versions is a list of the different lookup words of the entry
        versions = [version.strip() for version in vocab.split(',')]
        # main is the first version - we will just assume that this is what we want ...
        main = versions[0]
        # alternate are all other versions concatenated by commas.
        alternate = ",".join(versions[1:])
        # start index behind the first span. used to separate
        # the lookup word from the translation since there can be 
        # greek letters and tags in the translation (otherwise we could just take the text with recursive=False to eliminate text in tags)
        tlStartIndex = str(element).find('</span>')
        tlStartIndex += len('</span>')

        # ok, so here we take off the first part of the entry which
        # contains the greek word. Then we feed the remaining string into 
        # a new BeautifulSoup instance and strip remaining tags in the translation
        # with the get_text() method.
        subText = BeautifulSoup((str(element))[tlStartIndex:], 'html.parser')
        translation = str(subText.get_text()).strip()
        #pageNum = idx
        rough = greek_to_ascii(greek_simplify(main), False)
        precise = greek_to_ascii(greek_simplify(main), True)
        c.execute('INSERT INTO operonedict VALUES(?, ?, ?, ?, ?)',(rough, precise, main, alternate, translation,))
        #print(translation)

    return len(lis)
Example #38
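# Wraps an EndNote XML export: parses the <records> tree with BeautifulSoup,
# fills in missing sub-tags (titles/authors/urls/dates) and exposes getters and
# setters, plus a DOI lookup (finddoi) built on the external CRrecord helper.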
class EndnoteXML(object):
	def __init__(self,fname):
		if (fname):
			f=open(fname)
			self.content=re.sub(r'</?style.*?>','',f.read())
			f.close()
		else:
			self.content=""
		self.soup=BeautifulSoup(self.content,'html.parser')
		self.records=self.soup.records.contents
		self.length=len(self.records)
		
		for i in range(self.length):
			self.checktag(i,'titles')
			self.checktag(i,'authors')
			self.checktag(i,'urls')
			if (self.records[i].find('related-urls') is None):
				self.addtag(i,'related-urls','',parent='urls')
			if (self.records[i].find('pdf-urls') is None):
				self.addtag(i,'pdf-urls','',parent='urls')			
			self.checktag(i,'dates')
			self.setdoi(i,self.getdoi(i))

	#def __repr__(self):
	#	return self.soup.encode()

	def __str__(self):
		return self.soup.encode()

	def reset(self,fname):
		self.__init__(fname)

	def read(self,fname):
		self.__init__(fname)

	def reads(self,s):
		self.content=s
		self.soup=BeautifulSoup(self.content,'html.parser')
		self.records=self.soup.records.contents
		self.length=len(self.records)
		for i in range(self.length):
			self.checktag(i,'titles')
			self.checktag(i,'authors')
			self.checktag(i,'urls')
			if (self.records[i].find('related-urls') is None):
				self.addtag(i,'related-urls','',parent='urls')
			if (self.records[i].find('pdf-urls') is None):
				self.addtag(i,'pdf-urls','',parent='urls')
			self.checktag(i,'dates')
			self.setdoi(i,self.getdoi(i))

	def writes(self,encoding='utf-8'):
		return self.soup.encode(encoding=encoding)

	def write(self,fname,encoding='utf-8'):
		f=open(fname,'w')
		f.write(self.writes(encoding=encoding))
		f.close()

	def getrecord(self,num):
		if (num>=self.length):
			return None
		return self.records[num]

	def checktag(self,num,tag):
		if self.records[num].find(tag) is None:
			self.addtag(num,tag,value='')

	def addtag(self,num,tag,value=None,parent=None):
		'''value can be string, tag'''
		a=self.soup.new_tag(tag)
		if value: a.string=value
		if parent:
			self.records[num].find(parent).append(a)
		else:
			self.records[num].append(a)

	def gettag(self,num,tag,parent=None,obj=False):
		if parent:
			if self.records[num].find(parent):
				if self.records[num].find(parent).find(tag):
					if (obj):
						return self.records[num].find(parent).find(tag)
					else:
						return self.records[num].find(parent).find(tag).string
				else:
					return ''
			else:
				return ''
		else:
			if self.records[num].find(tag):
				if (obj):
					return self.records[num].find(tag)
				else:
					return self.records[num].find(tag).string
			else:
				return ''

	def settag(self,num,tag,value,parent=None):
		if parent:
			if self.records[num].find(parent):
				if self.records[num].find(parent).find(tag):
					self.records[num].find(parent).find(tag).string=value
				else:
					self.addtag(num,tag,parent=parent,value=value)
			else:
				a=self.soup.new_tag(tag)
				a.string=value
				self.addtag(num,parent,parent=None,value=a)
		else:
			if self.records[num].find(tag):
				self.records[num].find(tag).string=value
			else:
				self.addtag(num,tag,parent=None,value=value)	

	def getpath(self):
		db=self.soup.findChild("database")
		if (db):
			return os.path.splitext(db['path'])[0]+'.Data'
		else:
			return ""

	def getdoi(self,num):
		doistr=self.gettag(num,"electronic-resource-num")
		if (doistr):
			doiindex=doistr.find('10.')
		else:
			doiindex=-1
		if (doiindex >=0):
			return doistr[doiindex:].lower().strip()
		else:
			return ""

	def setdoi(self,num,value):
		self.settag(num,"electronic-resource-num",value)

	def gettitle(self,num):
		return self.gettag(num,"title")

	def settitle(self,num,value):
		self.settag(num,"title",value)

	def getjournalfull(self,num):
		return self.gettag(num,'secondary-title')

	def getyear(self,num):
		return self.gettag(num,'year','dates')

	def setyear(self,num,value):
		self.settag(num,'year',value,'dates')

	def getvolume(self,num):
		return self.gettag(num,'volume')

	def setvolume(self,num,value):
		self.settag(num,'volume',value)

	def getissue(self,num):
		return self.gettag(num,'number')

	def setissue(self,num,value):
		self.settag(num,'number',value)

	def getpages(self,num):
		return self.gettag(num,'pages')

	def setpages(self,num,value):
		self.settag(num,'pages',value)

	def getnotes(self,num):
		return self.gettag(num,'notes')

	def setnotes(self,num,value):
		self.settag(num,'notes',value)

	def geturl(self,num):
		urls=self.gettag(num,'related-urls',obj=True)
		if (urls):
			return [ i.string for i in urls.find_all('url') ]
		else:
			return []

	def seturl(self,num,value):
		'''Note that it will clean all the url!'''
		if (self.soup.find('related-urls') is not None):
			urls=self.gettag(num,'related-urls',obj=True)
			if (urls):
				urls.clear()
		else:
			self.addtag(num,'related-urls',parent='urls')
		self.addtag(num,'url',value,'related-urls')

	def addurl(self,num,value,first=False):
		urls=self.gettag(num,'related-urls',obj=True)
		a=self.soup.new_tag('url')
		a.string=value
		if (urls):
			if (not first):
				urls.append(a)
			else:
				urls.insert(0,a)
		else:
			self.settag(num,'related-urls',a,'urls')

	def getpdf(self,num):
		urls=self.gettag(num,'pdf-urls',obj=True)
		if (urls):
			return [ i.string for i in urls.find_all('url') ]
		else:
			return []

	def setpdf(self,num,value):
		'''Note that it will clean all the url!'''
		if (self.soup.find('pdf-urls') is not None):
			urls=self.gettag(num,'pdf-urls',obj=True)
			if (urls):
				urls.clear()
		else:
			self.addtag(num,'pdf-urls',parent='urls')
		self.addtag(num,'url',value,'pdf-urls')

	def setpdfs(self,num,value):
		'''Note that it will clean all the url!'''
		if (self.soup.find('pdf-urls') is not None):
			urls=self.gettag(num,'pdf-urls',obj=True)
			if (urls):
				urls.clear()
		else:
			self.addtag(num,'pdf-urls',parent='urls')
		for url in value:
			self.addtag(num,'url',url,'pdf-urls')

	def addpdf(self,num,value,first=False):
		urls=self.gettag(num,'pdf-urls',obj=True)
		a=self.soup.new_tag('url')
		a.string=value
		if (urls):
			if (not first):
				urls.append(a)
			else:
				urls.insert(0,a)
		else:
			self.addtag(num,'pdf-urls',a,'urls')

	def finddoi(self,num,prefix='',issn=''):
		title=self.gettitle(num)
		doi=DOI(self.getdoi(num))
		if (not prefix):
			prefix = doi.split('/',1)[0] if doi else ""
		volume= self.getvolume(num)
		journal=self.getjournalfull(num)
		year=self.getyear(num) 
		pages=self.getpages(num)
		self.cr=CRrecord()
		try:
			# The origin doi maybe true. Find in crossref
			if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi):
				# Further check title
				if (strdiff(doi,self.cr.doi)>=0.85 and \
				strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
					return doi
				if( volume and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume):
						return doi
				if( year and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year):
						return doi
				print "Origin DOI:",doi,"may be true but record strange..Try title"

			keyword=title+" "+journal+" "+year+" "+pages+" "+volume
			if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)):
				if (doi):
					if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85):
						return self.cr.doi
					else:
						print "Error for origin doi: "+doi+"; found: "+self.cr.doi
						return ""
				return self.cr.doi
			if (doi):
				if( strdiff(doi,self.cr.doi)>=0.85):
					return self.cr.doi
				else:
					print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi
					return ""
			else:
				return ""
		except Exception as e:
			print "Error when find doi..",e,"\nRetry..."
			return self.finddoi(num,prefix=prefix,issn=issn)

	def preprocess(self):
		pass

	def cleannote(self,num):
		note=self.getnotes(num)
		notel=note.lower()
		if ("time" in notel):
			self.setnotes(num,notel[notel.find('time'):])

	def cleanallpdf(self,exceptOAPDF=True):
		'''Clean PDF record or except OAPDF record'''
		for i in range(self.length):
			if (not exceptOAPDF):
				self.setpdf(i,'')
			else:
				for pdf in self.getpdf(i):
					if "internal-pdf://OAPDF/" in pdf:
						self.setpdf(i,pdf)
						break

	def process(self,fname="",cleannote=False,prefix='',issn='',start=0):
		epath=self.getpath()
		print "Output",self.length,"to",epath+os.sep+fname
		for i in range(start,self.length):
			try:
				#if (i%100 is 0):
				#	print
				#	print "Doing:",i+1,
				#else:
				#	print i+1,

				pdfs=self.getpdf(i)
				urls=self.geturl(i)
				# Fast consider as record process before
				hasfound=False
				for pdf in pdfs:
					if "internal-pdf://OAPDF/" in pdf:
						hasfound=True
						doistr=self.gettag(i,"electronic-resource-num")
						if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
							doi=DOI(self.getdoi(i))
							if doi:
								self.setdoi(i,"chk: "+doi)
						break
						
				if not hasfound:
					for url in urls:
						if "http://oapdf.sourceforge.net/cgi-bin/" in url:
							hasfound=True
							doistr=self.gettag(i,"electronic-resource-num")
							if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
								doi=DOI(self.getdoi(i))
								if doi:
									self.setdoi(i,"chk: "+doi)
							break
				if hasfound:
					continue

				if (cleannote):
					self.cleannote(i)

				doistr=self.gettag(i,"electronic-resource-num")
				if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
					doi=DOI(self.getdoi(i))
				else:
					doi=DOI(self.finddoi(i,prefix=prefix,issn=issn))
					if doi:
						self.setdoi(i,"chk: "+doi)
				oapdflink=""
				if (doi and doi.is_oapdf()):
					oapdflink="http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi="+doi

				newpdfs=[]
				for pdf in pdfs:
					pdfpath=pdf.replace("internal-pdf://",epath+os.sep+"PDF"+os.sep)
					relpath=pdf.replace("internal-pdf://","")
					# should never happen
					if (relpath == doi.quote()+".pdf"):
						newpdfs.append(pdf)
						continue
					if (doi):
						if (os.path.exists(pdfpath)):
							try:
								os.renames(pdfpath,epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf")
								newpdfs.append("internal-pdf://"+doi.quote()+".pdf")
							except:
								print "Can't rename:",pdf,'to',doi.quote()+".pdf"
								newpdfs.append(pdf)
								continue
						else:
							print "Maybe error for the record",doi,"with pdf path:",pdf,'; Try finding..',
							pdfdir=os.path.split(pdfpath)[0]
							if (os.path.exists(pdfdir)):
								fs=glob.glob(pdfdir+os.sep+'*.pdf')
								if (len(fs)==1):
									try:
										os.renames(fs[0],epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf")
										newpdfs.append("internal-pdf://"+doi.quote()+".pdf")
										print "Find",fs[0],'and rename!'
									except:
										print "Can't rename:",fs[0],'to',doi.quote()+".pdf"
										newpdfs.append(pdf)
										continue
								else:
									print "Can't find.."
									newpdfs.append(pdf)
									continue
							else:
								newpdfs.append(pdf)
								continue
					else:
						print "Blank doi for file:",pdf
						newpdfs.append(pdf)
						continue
				if (oapdflink):
					newpdfs.append("internal-pdf://OAPDF/"+doi.quote()+".pdf")
				self.setpdfs(i,newpdfs)
				# Set the urls
				if (oapdflink and oapdflink not in urls):
					self.addurl(i,oapdflink,first=True)
			except Exception as e:
				print "Error at ", i, 'since: ',e
				#return 1
		if fname:
			self.write(fname)
		return 0
Example #39
def touchpage(origin='.', doilink='../doilink',pdf=True,force=False):
	# Use to save local page record
	if not os.path.exists(doilink):
		os.makedirs(doilink+os.sep+'pages')
	doilink=doilink.rstrip('/').rstrip('\\')
	sfurl="http://oapdf.sourceforge.net/cgi-bin/touchdoi.cgi?owner=oapdf"

	workdir=os.path.abspath(origin).rstrip('\\').rstrip('/')
	count=0
	touchcount=1 # avoid submit when start
	forcesf=force # force to overwrite the exist doilink page

	if (pdf):
		result = (chain.from_iterable(glob.iglob(os.path.join(x[0], '10.*.pdf')) for x in os.walk(workdir)))
	else:
		result = (chain.from_iterable(glob.iglob(os.path.join(x[0], '10.*.html')) for x in os.walk(workdir)))

	toappend=[]
	newtouch=0
	for f in result:
		if (touchcount%50==0):
			r=requests.post(sfurl,params={'dois':json.dumps(toappend)},timeout=120)
			if (r.status_code == 200):
				bs=BeautifulSoup(r.text,"html.parser")
				totaldid=bs.findChild('span',attrs={'id':'total'})
				if totaldid and totaldid.text :
					newtouch+=int(totaldid.text)
				del toappend[:]
			else:
				print "Maybe Error when submit to SF-OAPDF.."
				sys.exit(1)
		count+=1
		fname=filebasename(f)
		if (' ' in fname):
			print "File name has blank!",f
			os.renames(f,os.path.split(f)[0]+os.sep+fname.strip()+os.path.splitext(f)[1])
			fname=fname.strip()
		doi=DOI(fname)
		if (doi):
			dirname=doilink+"/pages/"+doi.decompose(url=False, outdir=True)
			if (forcesf or not os.path.exists(dirname+fname+'.html')):
				touchcount+=1
				toappend.append(doi)
				try:
					if (not os.path.exists(dirname)): os.makedirs(dirname)
					f=open(dirname+fname+'.html',"w")
					f.close()
				except WindowsError as e:
					print e
				except:
					print "Something error for file:",f
		else:
			print "File name may be error (Not DOI name):",fname

	r=requests.post(sfurl,params={'dois':json.dumps(toappend)},timeout=120)
	if (r.status_code == 200):
		bs=BeautifulSoup(r.text,"html.parser")
		totaldid=bs.findChild('span',attrs={'id':'total'})
		if totaldid and totaldid.text :
			newtouch+=int(totaldid.text)
		del toappend[:]
	else:
		print "Maybe Error when submit to SF-OAPDF.."
		sys.exit(1)
	print "Process total file:",count,"; local touch new:",touchcount-1, "; remote touch:",newtouch
Example #40
# This script is to download pics from the funnie.st site

import requests
from bs4 import BeautifulSoup
from progressbar import *

link=raw_input('Paste the link here: ')
res=requests.get(link)
data=BeautifulSoup(res.content)
Total_image=int(data.findChild('h3').string[-2:])

for pic in range(Total_image):
	res=requests.get(link)
	data=BeautifulSoup(res.content)
	next_image=data.findChildren('link')
	image_name=link[24:-3]+link[-2:-1]
	for img in next_image:
		if 'next' in img['rel']:
			link=img['href']
	image=data.findChildren('img')

	for i in image:
		if 'jpg' in i['src']:
			r=requests.get(i['src'])
			size=float(r.headers['content-length'])
			mbSize = 1024*1024	#used for conversion to Mb
			TotalSize = (size)/mbSize
			widgets = ['Test: ', Percentage(), ' ', Bar(">"), ' ', ETA(), ' ', FileTransferSpeed()]
			progress = ProgressBar(widgets=widgets,maxval=TotalSize)
			progress.start()
Example #41
import urllib2
from bs4 import BeautifulSoup, NavigableString, Tag
import csv
import json
import time
import sys

pages = 5
page = 0
athletes = []

while page < pages:
    result = urllib2.urlopen("http://www.teamusa.org/athletes?pg=" + str(page))
    soup = BeautifulSoup(result)

    # The list of athletes on a given page is in a <ul> with the class 'thumb-row athletes'
    content = soup.findChild(class_="thumb-row athletes")

    # Each <li> contains the info about an athlete
    for each in content.find_all("li"):
        athlete = {}

        # Get name
        try:
            name = "".join(each.findChild("h4").string)
            name = name.replace("\n", "").replace("\r", "").strip().encode("utf8")
            athlete["name"] = name
        except AttributeError:
            continue

        # Get link to profile
        try: