Example #1
    def get_movies_near_you(self):
        """
        Returns the list of movies playing near you, including the theaters
        and showtimes for each movie
        """

        soup = get_soup('/showtimes/')
        list_movies = soup.find_all(class_='list_item')
        movies = []

        for item in list_movies:
            movie = {}

            page = item.find('a')['href']
            soup = get_soup(page)
            movie_info = soup.select('.article > .article .overview-top')[0]
            
            # fill in the movie info
            movie['title'] = movie_info.h4.a.text
            movie['url'] = movie_info.h4.a['href']
            movie['runtime'] = movie_info.p.time.text
            movie['ratingValue'] = movie_info.select(
                '.rating_txt meta[itemprop="ratingValue"]')[0]['content']
            movie['description'] = movie_info.select('.outline')[0]\
                .text.strip()

            # add the list of theaters where the movie is playing
            movie['theaters'] = []
            list_theaters = soup.find_all(class_='list_item')

            for item in list_theaters:
                theater = {}
                theater['name'] = item.select('h3 > a > span')[0].text

                # parse address and contact info
                properties = [
                    ('address', 'streetAddress'), ('city', 'addressLocality'), 
                    ('postalCode', 'postalCode'), ('phone', 'telephone'),
                ]
                for obj_prop, item_prop in properties:
                    theater[obj_prop] = item.select(
                        '.address span[itemprop="%s"]' % item_prop
                    )[0].text

                # add today's showtimes
                showtimes = item.find(class_='showtimes').select(
                    'meta[itemprop="startDate"]')
                theater['showtimes'] = [x['content'] for x in showtimes]

                movie['theaters'].append(theater)

            movies.append(movie)
        return movies
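Note: every example on this page relies on a project-specific get_soup helper whose implementation is not shown, and its signature varies between projects (some take a path or a query dict, others return a (response, soup) tuple). A minimal sketch of the common requests + BeautifulSoup pattern, for reference only:

import requests
from bs4 import BeautifulSoup


def get_soup(url, params=None):
    """Fetch a page and parse it into a BeautifulSoup tree (illustrative sketch).

    Returns None when the request fails, matching the `if not soup:` checks
    used in several of the examples below.
    """
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return None
    return BeautifulSoup(response.text, 'html.parser')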
Example #2
    def get_articles(self, issue=''):
        """
        Returns a list of articles from the given issue.
        """
        soup = get_soup()  # get soup of all articles
        issues = soup.find_all('ul')

        # validate issue and fall back to the default
        if not isinstance(issue, int) or issue < 1:
            issue = 1
        if issue > len(issues):
            issue = len(issues)

        # the latest issue is the last <ul> in the list
        articles = issues[len(issues) - issue].find_all('a')
        mArticles = []
        for article in articles:
            mArticle = {}
            mArticle['link'] = article.get('href')[1:]
            mArticle['title'] = article.find('li').contents[0].strip()
            mArticle['author'] = article.find('span').contents[0].encode('utf8')
            mArticles.append(mArticle)
        return mArticles
Example #3
 def get_weekly_summary(self, alias):
     url = self.BOMURL + "/?page=weekly&id=" + alias + ".htm"
     soup = utils.get_soup(url)
     if soup is not None:
         return movie.Weekly(soup)
     else:
         print("Not able to parse url: " + url)
Example #4
 def get_weekly_summary(self, url_or_id):
     if 'http' in url_or_id.lower():
         soup = utils.get_soup(url_or_id)
         if soup is not None:
             return movie.Weekly(soup)
         else:
             print("Not able to parse url: " + url_or_id)
     elif url_or_id in self.movie_urls.keys():
         url = self.BOMURL + "/?page=weekly&id=" + url_or_id + ".htm"
         soup = utils.get_soup(url)
         if soup is not None:
             return movie.Weekly(soup)
         else:
             print("Not able to parse url: " + url)
     else:
         print("Invalid movie name or URL: " + url_or_id)
Example #5
    def get_posts(self, trait='', limit=15):
        if limit is None or limit < 1 or limit > 30:  # validate limit
            limit = 15

        if trait == 'trending' or trait not in ['latest', 'must-read', 'discussions', 'jobs', 'companies']:
            trait = ''

        posts = 0
        # fetch `limit` posts from the trait page
        soup = get_soup('')
Example #6
 def get_movie_summary(self, alias):
     self.alias = alias
     url = self.BOMURL + "/?page=main&id=" + self.alias + ".htm"
     soup = utils.get_soup(url)
     if soup is not None:
         return movie.Movie(soup)
     else:
         print("Not able to parse url: " + url)
Example #7
	def from_post_id(self,post_id):
		soup = get_soup(page = post_id)
		title = soup.find('h1',class_='title post-item-title').find('a').contents[0]
		url = soup.find('h1',class_='title post-item-title').find('a').get('href')
		date = soup.find('span',class_='post-item-info').contents[0].split('in')[0].strip()
		category = Category.from_soup(soup.find('span',class_='post-item-info'))
		# todo = do this with regexp
		author_id = soup.find('a',{'rel':'author'}).get('href').split('/')[-2]
		votes = soup.find('div',class_='score2').find('p').contents[0]
		# todo = comments
		comments = ''
		return Post(post_id,title,url,date,category,author_id,votes,comments)
Example #8
 def get(self, cik, year):
     filings = get_annual_sd_filings_from_cik(cik)
     url = filings[year]['url']
     if not url:
         return {}
     soup = get_soup(url)
     if not soup:
         return {}
     meta_dict = get_meta_info_from_soup(soup)
     meta_dict.update({
         'req_cik': cik,
         'req_year': year
     })
     docs = self._get_docs_from_soup(soup, meta_dict)
     return docs
Example #9
    def search_movie(self, query):
        """
        Returns the list of results of a movie search, each one
        containing the image, title and link to its imdb page
        """
        soup = get_soup(SEARCH_MOVIE_TITLE, {'q': query})
        results = []

        for item in soup.find_all(class_="findResult"):
            result = {}
            result['text'] = item.find('td', class_="result_text").text.strip()
            result['url'] = item.find('td', class_="result_text").a['href']
            result['image'] = item.find('td', class_="primary_photo").a.img['src']
            results.append(result)

        return results
Example #10
    def _category_search(self, category_url, query, lucky):
        """
        Returns a list of results of a category search on imdb.
        Each result consists of a dictionary containing the
        primary image, title, link to the result's imdb page 
        and its id.
        """
        results = []
        soup = get_soup(category_url, {'q': query})

        if lucky: # returns only the first search result
            first = soup.find(class_="findResult")
            results.append(self._search_title_results_parser(first))
        else:
            for item in soup.find_all(class_="findResult"):
                results.append(self._search_title_results_parser(item))

        return results
Example #11
def _parse_urls_from_page(base_url, page):

    url_patterns = ('a[href^="https://news.naver.com/main/read.nhn?"]',
            'a[href^="https://entertain.naver.com/main/read.nhn?"]',
            'a[href^="https://sports.news.naver.com/sports/index.nhn?"]',
            'a[href^="https://news.naver.com/sports/index.nhn?"]')

    urls_in_page = set()
    page_url = '{}&start={}&refresh_start=0'.format(base_url, 1 + 10*(page-1))
    soup = get_soup(page_url)
    if not soup:
        return urls_in_page
    try:
        article_blocks = soup.select('ul[class=type01]')[0]
        for pattern in url_patterns:
            article_urls = [link['href'] for link in article_blocks.select(pattern)]
            urls_in_page.update(article_urls)
    except Exception as e:
        raise ValueError('Failed to extract urls from page %s' % str(e))

    return urls_in_page
Example #12
def save_and_upload_language(language_link, num_workers=1):
    """
    Iterates through the RawWikiLinks data for the given language and uploads
    each file to a GCS bucket.
    
    Parameters
    ----------
    language_link : str
        Link to the files storing the RawWikiLinks data for a single language.
    num_workers : int, optional
        Number of workers to use in the ThreadPool for parallelization, defaults
        to 1.
    """
    soup = utils.get_soup(language_link)
    file_names = soup.find_all("a", href=lambda href: href and href.endswith(".csv.gz"))
    if verbose:
        print("{} files to download...".format(len(file_names)))
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        for file_name in file_names:
            download_link = language_link + file_name.get("href")
            executor.submit(save_and_upload_file, (download_link))
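The function above delegates to save_and_upload_file and reads a module-level verbose flag, neither of which appears in the excerpt. A hypothetical sketch of what that helper might look like, assuming the standard google-cloud-storage client and a placeholder bucket name:

import os

import requests
from google.cloud import storage  # assumed dependency, not shown in the excerpt

BUCKET_NAME = 'rawwikilinks-bucket'  # placeholder name


def save_and_upload_file(download_link):
    """Download one .csv.gz file and upload it to a GCS bucket (hypothetical helper)."""
    file_name = download_link.rsplit('/', 1)[-1]
    # stream the archive to a local file
    with requests.get(download_link, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(file_name, 'wb') as fh:
            for chunk in response.iter_content(chunk_size=1 << 20):
                fh.write(chunk)
    # upload the local file to the bucket, then clean up
    blob = storage.Client().bucket(BUCKET_NAME).blob(file_name)
    blob.upload_from_filename(file_name)
    os.remove(file_name)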
Example #13
def get_station_data(station):
    station_path = settings.SRC[0] + station['station_href']
    station_soup = get_soup(station_path)
    if station_soup:
        description = station_soup.find('div', {
            'class': 's-property-content'
        }).find('p').text
        lat = description.split('широта ')[1].replace('\n', '').split('°')[0]
        lon = description.split('долгота ')[1].replace('\n', '').split('°')[0]
        address = description.split('адресу ')[1].split(', вы')[0]
        fuel_headers = station_soup.findAll(
            'span', {'class': 'col-xs-6 col-sm-4 col-md-4 add-d-title'})
        fuel_details = station_soup.findAll(
            'span', {'class': 'col-xs-6 col-sm-8 col-md-8 add-d-entry'})
        station_fuel = {}
        for index in range(len(fuel_headers)):
            key = fuel_headers[index].text
            try:
                date_last_updated = date_translate(
                    fuel_details[index].text.strip().split(
                        'обновлено ')[1].split(' г.)')[0])
                date_last_updated = int(
                    time.mktime(
                        datetime.datetime.strptime(date_last_updated,
                                                   "%d %m %Y").timetuple()))
            except (IndexError):
                date_last_updated = None
            station_fuel[key] = dict(
                cost=fuel_details[index].text.strip().split(' ')[0],
                updated=date_last_updated)
        return dict(fuel=station_fuel,
                    href=station['station_href'],
                    name=station['station'],
                    region=station['region'],
                    city=station['city'],
                    network=station['network'],
                    address=address,
                    lat=lat,
                    lon=lon)
    return False
Example #14
def get_direct_video_url(gogo_url):
    soup = utils.get_soup(gogo_url)
    if not soup:
        outputs.error_info("The video doesn't exist.")
        raise SystemExit
    iframe = soup.find('iframe')
    if not iframe:
        outputs.error_info("The video doesn't exist.")
        raise SystemExit
    php_l = iframe['src']
    ajx_l = ajax_t.substitute(q=php_l.split('?')[1])
    r = requests.get(ajx_l)
    try:
        link = json.loads(r.text)['source_bk'][0]['file']
    except (IndexError, KeyError, TypeError) as e:
        outputs.error_info('Unexpected error while obtaining stream url.')
        outputs.error_info(f'ERR: {e}')
        raise SystemExit
    _, ext = os.path.splitext(link)
    if ext == '.m3u8':
        link = utils.get_m3u8_stream(link)
    return link, ext
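get_direct_video_url returns a (link, ext) pair. A hypothetical helper showing one way the returned link might be written to disk when it is a plain file rather than an HLS playlist, using requests streaming (an .m3u8 link would instead need an HLS-aware downloader such as ffmpeg):

import requests


def save_direct_video(link, ext, filename='episode'):
    """Stream a direct video URL to disk (illustrative sketch only)."""
    if ext == '.m3u8':
        raise ValueError('HLS playlists are not handled by this sketch')
    with requests.get(link, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(filename + ext, 'wb') as fh:
            for chunk in response.iter_content(chunk_size=1 << 20):
                fh.write(chunk)
    return filename + ext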
Example #15
    def _get_urls_from_breaking_news(self):
        import time

        base_url = 'http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1={}&date={}&page={}'
        yymmdd = self.year + self.month + self.date
        links_in_all_sections = set()

        for sid1 in self.sid1_list:            
            links_in_a_section = set()
            last_links = set()
            page = 1
            
            while page < 1000:
                url = base_url.format(sid1, yymmdd, page)
                soup = get_soup(url)
                links = soup.select('div[class^=list] a[href^=http]')
                links = [link.attrs.get('href', '') for link in links]
                links = {link for link in links if 'naver.com' in link and 'read.nhn?' in link}

                if last_links == links:
                    break

                links_in_a_section.update(links)
                last_links = {link for link in links}

                if self.verbose:
                    print('\rpage = {}, links = {}'.format(page, len(links_in_a_section)), flush=True, end='')

                page += 1
                if self.debug and page >= 3:
                    break
                time.sleep(SLEEP)
            
            links_in_all_sections.update(links_in_a_section)
            if self.verbose:
                print('\rsection = {}, links = {}'.format(sid1, len(links_in_a_section)))

        print('date={} has {} news'.format(yymmdd, len(links_in_all_sections)))
        return links_in_all_sections
Example #16
def scrap_one_article(article_data):
    url_poynter_article = article_data['uri']
    soup_poynter_article = get_soup(url_poynter_article)
    for p in soup_poynter_article.main.article.find_all('p'):
        t = p.get_text()
        if 'Explanation: ' in t:
            explanation = t.replace('Explanation: ', '')
        elif 'originated from:' in t:
            origin = t.split('originated from: ')[-1]
        elif 'Fact-checked by' in t:
            checker = t.replace('Fact-checked by: ', '')
        elif '--topinfo' in p.get('class')[-1]:
            date, country = t.split(' | ')
    for a in soup_poynter_article.main.article.find_all('a'):
        if isinstance(a, NavigableString):
            continue
        elif 'Read the Full Article' in a.get_text():
            url_source_article = a.get('href')

    article_data['extraMeta']['explanation'] = clean_text(explanation)
    article_data['extraMeta']['origin'] = clean_text(origin)
    article_data['extraMeta']['checker'] = clean_text(checker)
    article_data['extraMeta']['date'] = date
    article_data['extraMeta']['country'] = clean_text(country)
    article_data['extraMeta']['url_source'] = url_source_article
    """
    language = get_language(url_source_article)
    article_data['language'] = language
    if language == 'english':
        soup = get_soup(url_source_article)
        content = soup.main.find(id='content')
        for p in content:
            if isinstance(p, NavigableString):
                pass
            else:
                body_text = p.get_text()
    else:
        body_text = 'NONE-{}'.format(language)"""
    return article_data
Example #17
def get_building_info(url):
    """Get building info.

    Args:
        url (str): url.

    Returns:
        list: results.
    """
    results = list()
    soup = get_soup(url)
    if not soup:
        logging.warning('Cannot parse %s', url)
        return list()
    items = soup.find_all('span', {'class': 'f12a6'})
    if items:
        for item in items:
            item = item.next_sibling.next_sibling
            url = 'http://bjjs.zjw.beijing.gov.cn' + item.get('href')
            house_info = get_house_info(url)
            house_info['url'] = url
            results.append(house_info)
    return results
Example #18
def _get_article(link):
    try:
        link = '{}{}'.format(conf.MAIN_LINK, link)
        main_soup = get_soup(link)

        news_text = main_soup.find("div", attrs={
            'class': 'tn-news-text'
        }).text.replace('\n', ' ').replace('\r', '')
        news_title = main_soup.find("h1", attrs={
            'class': 'tn-content-title'
        }).text.replace('\n', ' ').replace('\r', '')
        datetime = main_soup.find("li", attrs={
            "class": "tn-hidden@t"
        }).text.replace('\n', ' ').replace('\r', '')
        view_count = _get_view_count(link)
    except Exception:
        news_text = 'unknown'
        news_title = 'unknown'
        view_count = 0

        datetime = dt.datetime.now()

    return news_text, news_title, datetime, view_count
Example #19
def get_supplier_spg_df(supplier_path=SUPPLIER_PATH, spg_path=SPG_PATH):
    """
    supplier_df and spg_df have an N:M relationship; create a join table between them.
    :return: supplier_spg_df, has columns=['supplier_spg_id', 'supplier_id', 'spg_id']
    """
    supplier_df = pd.read_excel(supplier_path)[[
        'supplier_id', 'supplier_url', 'supplier_code'
    ]]
    supplier_df = supplier_df[pd.notnull(supplier_df['supplier_code'])]
    supplier_spg_df = pd.DataFrame(columns=['spg_url_key', 'supplier_id'])

    spg_df = pd.read_excel(spg_path)[['spg_url_key', 'spg_id']]

    for supplier_id_url in supplier_df.itertuples(index=True, name='Pandas'):
        supplier_id = getattr(supplier_id_url, 'supplier_id')
        supplier_url = getattr(supplier_id_url, 'supplier_url')

        _, supplier_soup = get_soup(supplier_url)
        all_li = supplier_soup.find('table', attrs={
            'id': 'table_arw_wrapper'
        }).find_all('li')
        temp_spg_list = [li.find('a')['href'].split('/')[-2] for li in all_li]
        temp_spg_df = pd.DataFrame(temp_spg_list, columns=['spg_url_key'])
        temp_spg_df['supplier_id'] = supplier_id
        supplier_spg_df = supplier_spg_df.append(temp_spg_df)

        time.sleep(0.1)

    supplier_spg_df = supplier_spg_df.merge(spg_df,
                                            on='spg_url_key',
                                            how='left')
    supplier_spg_df = supplier_spg_df.drop(columns=['spg_url_key'])

    supplier_spg_df.index += 1
    supplier_spg_df['supplier_spg_id'] = supplier_spg_df.index

    return supplier_spg_df
Example #20
def get_author_papers_cluster_id(author_google_id):
    papers_cluster_ids = set()
    MAX_PAGES = 5
    for page in range(1, MAX_PAGES + 1):
        try:
            logger.debug(
                "Get author page #{} with papers from indexes [{}:{}].".format(
                    page, (page - 1) * 100 + 1, page * 100 + 1))
            url = _FULLURL.format(
                _HOST,
                _AUTHOR_PAPERS_PAGE.format(author_google_id,
                                           (page - 1) * 100 + 1))
            soup = utils.get_soup(url)
            if soup is None:
                logger.debug(
                    "Soup for author page URL='{0}' is None.".format(url))
                return None
            page_papers_counter = soup.find('span', id='gsc_a_nn')
            logger.debug("Papers counter on page: {}".format(
                "not found. It's last page."
                if not page_papers_counter else page_papers_counter.text))
            for paper_info in soup.find_all("a", "gsc_a_ac gs_ibl"):
                href = paper_info["href"]
                if href:
                    id = re.findall(r'\d+', href.strip())
                    if id: papers_cluster_ids.add(id[0])
            if not page_papers_counter or int(
                    page_papers_counter.text.split("–")[1]) < page * 100 + 1:
                break
        except KeyboardInterrupt:
            raise
        except BaseException:
            logger.warn(traceback.format_exc())
        logger.debug("Found paprs with cluster id: {}".format(
            len(papers_cluster_ids)))
    return papers_cluster_ids
Example #21
def get_pg_df(product_index_url=PRODUCT_INDEX_URL):
    """
    Parse PRODUCT_INDEX_URL page and build a DataFrame of product group data. 
    :return: pg_df, product group DataFrame, has columns=['product_group', 'pg_url', 'pg_url_key', 'pg_id']
    """
    _, product_index_soup = get_soup(product_index_url)
    pg_regex = re.compile('catfiltertopitem.*')
    pg_h2 = product_index_soup.find_all('h2', attrs={'class': pg_regex})
    pg_list = []

    for h2 in pg_h2:
        anchor = h2.find('a')
        product_group = anchor.text.replace('/', '_')
        pg_url = DIGIKEY_HOME_PAGE + anchor['href']
        pg_url_key = anchor['href'].split('/')[-2]

        pg_list.append([product_group, pg_url, pg_url_key])

    pg_df = pd.DataFrame(pg_list,
                         columns=['product_group', 'pg_url', 'pg_url_key'])
    pg_df.index += 1
    pg_df['pg_id'] = pg_df.index

    return pg_df
Example #22
def _get_link(PAGE_NUM):
    PAGE_LINK = "https://tengrinews.kz/news/page/{}/".format(PAGE_NUM)
    soup = get_soup(PAGE_LINK)
    link_raw = soup.findAll("a", attrs={"class": "tn-link"})
    links = [i["href"] for i in link_raw]
    return links
Example #23
height, width = 256, 256
N_MAX_IMAGES = 50


for folder_name in os.listdir('_data/ingredients'):

    print('Looking for pictures of ', folder_name)
    query = folder_name.split()
    query = '+'.join(query)
    url = 'https://www.google.co.in/search?q=' + query + '&source=lnms&tbm=isch'
    print(url)

    # add the directory for your image here
    ROOT_DIRECTORY = '_data_pictures'
    soup = get_soup(url)

    actual_images = []  # holds (link, type) pairs for the large original images
    for a in soup.find_all("div", {"class":"rg_meta"}):
        link, Type = json.loads(a.text)["ou"], json.loads(a.text)["ity"]
        actual_images.append((link, Type))

    print('There are a total of', len(actual_images), 'images')

    query_directory = os.path.join(ROOT_DIRECTORY, folder_name)
    if not os.path.exists(query_directory):
        os.makedirs(query_directory)

    for i, (img_url, Type) in enumerate(actual_images):

        if i >= N_MAX_IMAGES:
            break
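The example is cut off before the download step. A hypothetical sketch of how each img_url might be fetched and resized to the height and width declared at the top, assuming requests and Pillow; the destination path and function name are illustrative:

import io

import requests
from PIL import Image  # assumed dependency for resizing


def download_and_resize(img_url, dest_path, size=(256, 256)):
    """Fetch one image, resize it and save it as JPEG (illustrative sketch)."""
    try:
        response = requests.get(img_url, timeout=10)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content)).convert('RGB')
        image.resize(size).save(dest_path, 'JPEG')
    except (requests.RequestException, OSError) as error:
        print('Skipping', img_url, error)

# e.g. inside the loop above:
# download_and_resize(img_url, os.path.join(query_directory, '{}.jpg'.format(i)), (width, height))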
Example #24
import bs4
import json
import requests
import time

from utils import (get_content, get_soup, save_json, load_json)

MANGA_SEARCH_URL = 'https://myanimelist.net/manga.php?type=1&q='

# load series information
all_series = load_json("data.json")

for series in all_series:
    # search on MyAnimeList
    query_soup = get_soup(get_content(MANGA_SEARCH_URL + series['name']))
    time.sleep(15) # rate limiting

    table_row_tag = query_soup.find('div', class_='js-categories-seasonal').tr.next_sibling
    link_tag = table_row_tag.find('a', class_='hoverinfo_trigger fw-b')

    # series name in english
    name_en = link_tag.strong.text
    print(f'{series["name"]} | {name_en}')

    # parse series page
    info_url = link_tag['href']
    info_soup = get_soup(get_content(info_url))
    time.sleep(15) # rate limiting

    container = info_soup.find('div', class_='js-scrollfix-bottom')
Example #25
 def get(self, ticker_id):
     recent_8k = self.get_the_most_recent_8k(ticker_id)
     soup = get_soup(recent_8k['link'])
     info = get_meta_info_from_soup(soup)
     return info
Example #26
def shonenjump():
    # create image directory
    IMG_DIR = 'img'
    create_dir(IMG_DIR)

    rensai_soup = get_soup(get_content(RENSAI_URL))
    archives_soup = get_soup(get_content(ARCHIVES_URL))

    # store series information: name, abbreviated name and whether it is still ongoing
    all_series = []

    # create icon directory
    ICONS_DIR = os.path.join(IMG_DIR, 'icons')
    create_dir(ICONS_DIR)
    
    for soup in [rensai_soup, archives_soup]:
        # ongoing series?
        ongoing = soup is rensai_soup

        section = soup.find('section', class_='serialSeries')

        for li in section.find_all('li'):
            # series name in japanese
            name_jp = li.div.text if li.div else li.p.text
            name_jp = name_jp[1:name_jp.find('』')]
            
            link_tag = li.a

            # abbreviated name
            abbr = link_tag['href'].rsplit('/', 1)[1][:-5]

            # download icon
            img_src = link_tag.img['src']
            img_url = BASE_URL + img_src
            file_path = os.path.join(ICONS_DIR, abbr + '.' + img_src.rsplit('.', 1)[1])
            print(f'Downloading {file_path}...')
            write_image(img_url, file_path)
            
            # add series
            series = { 'name': name_jp, 'abbr': abbr, 'ongoing': ongoing }
            all_series.append(series)

    # save series information
    save_json("data.json", all_series)

    for series in all_series:
        # create directory for this series
        series_dir = os.path.join(IMG_DIR, series['abbr'])
        create_dir(series_dir)
            
        current_list_url = LIST_URL + series['abbr'] + '.html'

        while current_list_url:
            list_soup = get_soup(get_content(current_list_url))
            ul = list_soup.find('ul', class_='comicsList')
            
            # ignore a series that hasn't released any volumes yet
            if ul.li is None:
                break
            
            for dl in ul.select('li dl'):
                # skip current volume if it isn't released yet
                if '発売予定' in str(dl.p):
                    continue

                # download cover
                img_src = dl.img['src']
                img_url = BASE_URL + img_src
                file_path = os.path.join(series_dir, img_src.rsplit('/', 1)[1])
                print(f'Downloading {file_path}...')
                write_image(img_url, file_path)

            # get url for next list of covers
            next_list_url_tag = list_soup.find('span', class_='current_page').next_sibling.next_sibling
            if next_list_url_tag is None:
                break
            else:
                current_list_url = BASE_URL + next_list_url_tag['href']
Example #27
 def __init__(self, cislo):
     self.cislo = cislo
     self.url = ("https://www.nrsr.sk/web/Default.aspx?sid=zakony/zakon"
                 "&ZakZborID=13&CisObdobia=7&CPT={}".format(self.cislo))
     self.soup = utils.get_soup(self.url)
     self.data = {}
Example #28
 def __init__(self):
     """Initialize the base url and get the base soup."""
     self.base_url = "http://www.demagog.sk/politici"
     self.base_soup = utils.get_soup(self.base_url)
Example #29
import datetime
import os

import pandas as pd
from sqlalchemy import create_engine

from utils import get_date_list, get_soup, extract_soup_data, process_dataframes, clean_and_format_dataframe
from playlist_types import playlist_types

basdir = os.path.abspath(os.path.dirname(__file__))
engine = create_engine(os.environ.get('DATABASE_URL') or f'sqlite:///{os.path.join(basdir, "app.sqlite")}')

dates = get_date_list()
query_date = '2019/08/03'

for query_date in dates:
    soup = get_soup(query_date)

    tables_html, tables_dfs, accordions = extract_soup_data(soup)

    tables_dfs = process_dataframes(tables_html, tables_dfs, accordions)

    # make one df from all the shows
    df = pd.concat(tables_dfs, ignore_index=True)

    del df['Buy CD']
    df.rename(columns={'Unnamed: 0': 'time',
                       'Title': 'title',
                       'Composer': 'composer',
                       'Performers': 'performers',
                       'Record Co.Catalog No.': 'record_co_catalog_no',
                       }, inplace=True)
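The snippet creates a SQLAlchemy engine but stops after renaming the columns; a hedged sketch of how the cleaned DataFrame might then be persisted with the standard pandas to_sql call (the table name is a placeholder, not taken from the original project):

import pandas as pd
from sqlalchemy.engine import Engine


def persist_playlist(df: pd.DataFrame, engine: Engine, table_name: str = 'playlists') -> None:
    """Append one day's cleaned playlist rows to the database (hypothetical helper)."""
    df.to_sql(table_name, engine, if_exists='append', index=False)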
Example #30
 def __init__(self, cislo):
     self.cislo = cislo
     self.url = ("https://www.nrsr.sk/web/Default.aspx?"
                 "sid=schodze/nrepdn_detail&id={}".format(self.cislo))
     self.soup = utils.get_soup(self.url)
Example #31
def _cluster_handler(cluster_id, papers_count):
    logger.debug("Handle %i papers from cluster %s." %
                 (papers_count, cluster_id))
    url = _FULLURL.format(_HOST, _SCHOLARCLUSTER.format(cluster_id))
    logger.debug("Get cluster page URL='{0}'.".format(url))
    soup = utils.get_soup(url)
    #utils.soup2file(soup, "D:\A.html")
    # This list contains info about unique papers
    EndNote_list = list()
    file_counter = 0
    merged_counter = 0

    # return True if EndNote_1 is equal to EndNote_2
    def is_EndNote_equal(EndNote_1, EndNote_2):
        return (
            EndNote_1["title"].lower() == EndNote_2["title"].lower()
            and ("year" not in EndNote_1 or "year" not in EndNote_2
                 or EndNote_1["year"] == EndNote_2["year"])
            and len(EndNote_1["author"]) == len(EndNote_2["author"])
            and EndNote_1["type"] == EndNote_2["type"]
            and ("pages" not in EndNote_1 or "pages" not in EndNote_2
                 or EndNote_1["pages"] == EndNote_2["pages"])
        )

    # return a list of similar papers (may be empty)
    def intersect_papers(EndNote_data, EndNote_list):
        return [i for i in EndNote_list if is_EndNote_equal(EndNote_data, i)]

    # Loop on pages
    while True:
        if soup is None:
            logger.debug(
                "Soup for cluster page URL='{0}' is None.".format(url))
            return None
        # This list contains links to EndNote and cited by count for each paper
        # in cluster
        logger.debug("Find EndNote links for each paper in cluster.")
        footer_links = [{
            "EndNote" if "EndNote" in link.text else "citedby":
            link["href"].strip() if "EndNote" in link.text else int(
                re.findall(r'\d+', link.text)[0])
            for link in paper_block.find("div", class_="gs_fl").find_all('a')
            if "EndNote" in link.text or "Cited" in link.text
            or "Цитируется" in link.text
        } for paper_block in soup.find_all('div', class_='gs_ri')]
        logger.debug(
            "Extract unique papers in cluster and load data from EndNote.")
        for links in footer_links:
            if links != {}:
                file_counter += 1
                logger.debug("EndNote file #%i (total %i)" %
                             (file_counter, papers_count))
                if links.get("EndNote"):
                    paper_EndNote_data = get_info_from_EndNote(
                        links["EndNote"], True)
                else:
                    settings.print_message(
                        'Error getting EndNote files. '
                        'Please change the display settings Google Scholar in English '
                        '(https://scholar.google.com/).')
                    logger.debug(
                        'End work programme because did not find link to EndNote file.'
                    )
                    raise Exception('Did not find EndNote.')
                if paper_EndNote_data is None:
                    logger.debug(
                        "Skip EndNote file #%i, could not upload file." %
                        file_counter)
                    continue
                if not "year" in paper_EndNote_data or not "author" in paper_EndNote_data:
                    logger.debug(
                        "Skip EndNote file #%i, empty year or authors fields."
                        % file_counter)
                else:
                    similar_papers = intersect_papers(paper_EndNote_data,
                                                      EndNote_list)
                    if similar_papers == []:
                        merged_counter += 1
                        logger.debug(
                            "EndNote file #%i miss all EndNote files in merged array."
                            % file_counter)
                        logger.debug("Add EndNote file #%i in merged array." %
                                     file_counter)
                        paper_EndNote_data.update({
                            "url_scholarbib":
                            links["EndNote"],
                            "citedby":
                            links["citedby"] if "citedby" in links else None
                        })
                        EndNote_list.append(paper_EndNote_data)
                    else:
                        similar_file = similar_papers[0]
                        similar_file_index = EndNote_list.index(similar_file)
                        if len(similar_file) < len(paper_EndNote_data):
                            logger.debug(
                                "EndNote file #{0} like #{1} EndNote file in merged array and has more fields, replace."
                                .format(file_counter, similar_file_index + 1))
                            EndNote_list[
                                similar_file_index] = paper_EndNote_data
                        else:
                            logger.debug(
                                "EndNote file #{0} like #{1} EndNote file in merged array, skipped."
                                .format(file_counter, similar_file_index + 1))
        # NEXT button on html page
        if soup.find(class_='gs_ico gs_ico_nav_next'):
            url = soup.find(
                class_='gs_ico gs_ico_nav_next').parent['href'].strip()
            logger.debug("Load next page in resulting query selection.")
            soup = utils.get_soup(_FULLURL.format(_HOST, url))
        else:
            break
    if merged_counter == 0:
        logger.debug(
            "All %i EndNote files in the cluster are not informative. No merged files."
            % file_counter)
    else:
        logger.debug(
            "All {0} EndNote files merged in {1} (i.e. distinct versions in cluster: {1}):"
            .format(file_counter, merged_counter))
        for counter, data in enumerate(EndNote_list):
            logger.debug("Merged EndNote file #%i:\n%s" %
                         (counter + 1, data["EndNote"]))
    return tuple(EndNote_list)
Example #32
def get_soup_of_with_h_and_gtyp(h, gtyp):
    url = filter_url_base.format(h, gtyp)
    return get_soup(url)
Example #33
def parse_day_time(url):
    soup = get_soup(url)
    day_time = soup.find('em', {'class': 'date'}).text
    day = day_time.split()[0].replace('.', '-')
    time = day_time.split()[1]
    return day, time
Example #34
    def download_tab(self, url, path=''):
        r1 = requests.get(url, headers=self._headers)
        soup = get_soup(r1)

        text = soup.find('pre').text
        self._save_file(text, url, '.txt', path=path, encode='utf-8')
Example #35
	def get_first_thread_html(self):
		comments_url = self.get_first_thread_link()
		parsed_html = get_soup(comments_url)
		html_string = str(parsed_html)
		return html_string
Example #36
    #call progress bar
    #for item in utils.progressBar(url_dict.keys(), prefix='Category',suffix = 'Complete', length = 50):

    for category in url_dict.keys():
        #read in links data
        link_name_str = f'{category}links.txt'

        if link_name_str in os.listdir(f'./{category}'):
            with open(f'{os.getcwd()}/{link_name_str}', "r+") as file:
                scraped_link_list = file.readlines()
        else:
            scraped_link_list = []

        # get main page soup
        main_page_soup = utils.get_soup(url_dict[category])

        # get number of pages
        page_count = utils.get_page_count(main_page_soup)[1]

        # iterate through pages and scrape links
        links = []

        for page in range(1, page_count):

            # get links from page
            print(url_dict[category] + "?page-" + str(page))

            soup = utils.get_soup(url_dict[category], "?page-" + str(page))
            links = links + utils.get_page_links(soup)
Example #37
	def get_first_thread_link(self):
		parsed_html = get_soup(self.url)
		comments_element = parsed_html.find('a', {"class": "comments"})
		comments_url = comments_element['href']
		return comments_url
Example #38
def _ident_and_fill_paper(soup, params):
    """Return paper info"""
    pagenum = 1
    papers_count = 0
    qtext = requests.utils.quote(
        stopwords.delete_stopwords(params["title"], " and "))
    # DEBUG messages
    #logger.debug("Proceed stop word list for title '%s'" % params["title"])
    #logger.debug("Title without stop words: '%s'" % stopwords.delete_stopwords(params["title"], " "))
    #logger.debug("Title with logical conditions: '%s'" % stopwords.delete_stopwords(params["title"], " and "))
    ##
    while True:
        logger.debug("Find papers on page #%i (max_researchgate_papers=%i)" %
                     (pagenum, params["max_researchgate_papers"]))
        if soup.find('div', class_='search-noresults-headline') is not None:
            logger.debug("This paper not found in researchgate.")
            return None
        logger.debug("Parse html and get info about papers.")
        papers_box = soup.find_all('div', 'publication-item')
        logger.debug("On resulting page #%i found %i papers." %
                     (pagenum, len(papers_box)))
        on_page_paper_count = 0
        for papers_item in papers_box:
            if papers_count > params["max_researchgate_papers"]:
                logger.debug("This paper not found in researchgate.")
                return None
            try:
                on_page_paper_count += 1
                papers_count += 1
                # Get info about paper
                authors = len(papers_item.find_all("span", itemprop="name"))
                year = int(
                    papers_item.find('div',
                                     class_='publication-metadata').find(
                                         'span').text.split()[1])
                title = papers_item.find(
                    "a", class_="publication-title").text.strip().lower()
                type = papers_item.find(
                    'div', class_='publication-type').text.strip().lower()
                logger.debug(
                    "Process paper #%i (title='%s'; year=%i; auth_count=%i; type='%s')"
                    % (papers_count, title, year, authors, type))
                logger.debug("Title and year check.")
                # First compare
                if params["year"] != year:
                    logger.debug(
                        "Year of paper #%i does not coincide with the year of the required paper, skipped."
                        % (on_page_paper_count))
                elif params["title"] != title:
                    logger.debug(
                        "Title of paper #%i does not coincide with the title of the required paper, skipped."
                        % (on_page_paper_count))
                # Second compare
                else:
                    logger.debug(
                        "The title and year of the paper coincided, identification of information from the RIS."
                    )
                    timeout = random.uniform(0, 3)
                    logger.debug("Sleep {0} seconds.".format(timeout))
                    time.sleep(timeout)
                    paper_url = _FULLURL.format(
                        _HOST,
                        papers_item.find("a",
                                         class_="publication-title")["href"])
                    logger.debug("Process RIS for paper #%i." %
                                 on_page_paper_count)
                    rg_paper_id = get_rg_paper_id_from_url(paper_url)
                    info = get_info_from_RIS(rg_paper_id)
                    if params["authors_count"] != len(info['authors']):
                        logger.debug(
                            "Count of author of paper #%i does not coincide with the count of author of the required paper, skipped."
                            % (on_page_paper_count))
                    elif 'start_page' in info and params[
                            "spage"] is not None and str(
                                params["spage"]) != info['start_page']:
                        logger.debug(
                            "Start page of paper #%i does not coincide with the start page of the required paper, skipped."
                            % (on_page_paper_count))
                    elif 'end_page' in info and params[
                            "epage"] is not None and str(
                                params["epage"]) != info['end_page']:
                        logger.debug(
                            "End page of paper #%i does not coincide with the end page of the required paper, skipped."
                            % (on_page_paper_count))
                    else:
                        logger.debug(
                            "Paper #%i was identified with EndNote file #%i." %
                            (on_page_paper_count, params["paper_version"]))
                        logger.debug(
                            "EndNote file #%i:\n%s" %
                            (params["paper_version"], params["EndNote"]))
                        logger.debug("RIS file:\n%s" % info["RIS"])
                        paper_url = _FULLURL.format(
                            _HOST,
                            papers_item.find(
                                "a", class_="publication-title")["href"])
                        type = papers_item.find(
                            'div',
                            class_='publication-type').text.strip().lower()
                        info = get_paper_info_from_dataRIS(info, rg_paper_id)
                        info.update({
                            "rg_type": type,
                            "url": paper_url,
                        })
                        # Get authors
                        #logger.debug("Get authors list")
                        #auth_list = get_authors(info["rg_id"])
                        # Get author info
                        # for author in auth_list:
                        #    if author["accountId"] != None:
                        #        logger.debug("Get more info for author with rg_account_id={0}".format(author["accountId"]))
                        #        author_info = get_auth_info(author["accountId"])
                        #        author.update(author_info)
                        #info.update({"authors" : auth_list})
                        return info
            except Exception as error:
                logger.warn(traceback.format_exc())
        if len(papers_box) >= 10:
            pagenum += 1
            logger.debug("Load next page in resulting query selection.")
            # Delay about Delay seconds for hide 429 error.
            timeout = random.uniform(1, 2)
            logger.debug("Sleep {0} seconds.".format(timeout))
            time.sleep(timeout)
            qtext = requests.utils.quote(
                stopwords.delete_stopwords(params["title"], " and "))
            #   DEBUG messages
            logger.debug("Proceed stop word list for title '%s'." %
                         params["title"])
            logger.debug("Title without stop words: '%s'." %
                         stopwords.delete_stopwords(params["title"], " "))
            logger.debug("Title with logical conditions: '%s'." %
                         stopwords.delete_stopwords(params["title"], " and "))
            #
            url = _PUBSEARCH.format(qtext, pagenum)
            soup = utils.get_soup(_FULLURL.format(_HOST, url),
                                  _PROXY_OBJ.get_cur_proxy())
        else:
            logger.debug("This paper not found in researchgate.")
            return None
Example #39
        LISTING_IDS = get_existing_ids(EXISTING_DATA)
        CURRENT_IDS = LISTING_IDS.copy() if skip_ids else []
        REMOVED_IDS = []

        # array of objects
        MAIL_PRICE_CHANGES = []
        MAIL_NEW_LISTINGS = []
        MAIL_REMOVED_LISTINGS = []

        while True:
            logger.info(f'--------------------------{CURRENT_PAGE}--------------------------')

            if TOTAL_PROFILES_COUNT and (CURRENT_PROFILES_COUNT >= TOTAL_PROFILES_COUNT):
                break

            soup = get_soup(SEARCH_URL.format(CURRENT_PAGE))
            if soup:
                if not TOTAL_PROFILES_COUNT:
                    TOTAL_PROFILES_COUNT = get_total_profiles(soup)

                profiles = get_profiles(soup)
                if profiles:
                    CURRENT_PROFILES_COUNT = CURRENT_PROFILES_COUNT + len(profiles)
                    for url in profiles:
                        _id = get_id(url)
                        if _id in CURRENT_IDS:
                            continue

                        time_gap()

                        p_soup, profile = get_profile_data(url)
Example #40
def get_community_by_region(city='bj',
                            region_name='chaoyang',
                            output_dir=None):
    """Get community by region.

    Args:
        city: str, city.
        region_name: str, region name.
        output_dir: str, directory where per-community JSON files are written.
    """
    url = 'https://{}.ke.com/xiaoqu/{}/'.format(city, region_name)
    soup = get_soup(url)
    if not soup:
        return
    total_pages = get_total_pages(url)
    if not total_pages:
        logging.error('Finish at %s', model.Community.select().count())
        return
    for page in range(total_pages):
        if page > 0:
            url_page = '{}pg{}/'.format(url, page)
            soup = get_soup(url_page)
            if not soup:
                return
        name_list = soup.find_all('li', {'class': 'clear'})
        logging.info('%s %d / %d', region_name, page + 1, total_pages)
        for name in name_list:
            info = dict()
            try:
                item = name.find('div', {'class': 'title'})
                title = item.get_text().strip('\n')
                logging.info('%s', title)
                link = item.a.get('href')
                info['title'] = title
                link = get_mobile_link(link, city)
                info['link'] = link
                info['community_id'] = name.get('data-housecode')
                item = name.find('a', {'class': 'district'})
                if item:
                    info['district'] = item.get_text()
                item = name.find('a', {'class': 'bizcircle'})
                if item:
                    info['bizcircle'] = item.get_text()
                item = name.find('div', {'class': 'tagList'})
                if item:
                    info['tags'] = item.get_text().strip('\n')
                item = name.find('a', {'class': 'totalSellCount'})
                if item:
                    info['onsale'] = item.span.get_text().strip('\n')
                item = name.find('a', {'title': title + u'租房'})
                if item:
                    info['onrent'] = item.get_text().strip('\n').split(u'套')[0]
                item = name.find('div', {'class': 'totalPrice'})
                if item:
                    info['average_unit_price'] = item.span.get_text().strip(
                        '\n')
                output_path = os.path.join(output_dir,
                                           u'{}.json'.format(title))
                community_info = get_community_info_by_url(link, output_path)
                for key, value in community_info.items():
                    info[key] = value
            except Exception as exception:
                logging.error(exception)
                logging.error(traceback.format_exc())
                continue
            model.Community.replace(**info).execute()
        time.sleep(1)
Example #41
def get_community_info_by_url(url, output_path=None):
    """Get community info by url.

    Args:
        url: str, url.

    Returns:
        dict, results.
    """
    results = dict()
    soup = get_soup(url)
    if not soup:
        logging.warn('Cannot parse %s', url)
        return results
    pattern = re.compile('window.__PRELOADED_STATE__ = (.*);',
                         re.MULTILINE | re.DOTALL)
    items = soup.find('script', {'type': 'text/javascript'}, text=pattern)
    if items:
        items = pattern.search(items.text).group(1)
        items = json.loads(items)
        if output_path:
            json.dump(items,
                      codecs.open(output_path, 'w', 'utf-8'),
                      indent=4,
                      sort_keys=True,
                      ensure_ascii=False)
        items = items['xiaoquDetail']['survey']
        if isinstance(items, dict):
            items = items.values()
        for item in items:
            name = item['name']
            name = model.Community.NAME_DICT[name]
            value = unicode(item['value'])
            results[name] = value
    # gonglueV2.html
    url += 'gonglueV2.html'
    soup = get_soup(url)
    if not soup:
        logging.warn('Cannot parse %s', url)
        return results
    items = soup.find_all('span', {'class': 'txt_gray'})
    if items:
        for item in items:
            name = item.get_text().strip(u':')
            name = model.Community.NAME_DICT[name]
            if name in results:
                continue
            value = unicode(item.next_sibling)
            results[name] = value
    # intro
    item = soup.find('div', {'class': 'cpt_content_section'})
    if item:
        results['intro'] = item.get_text().strip('\n')
    # score
    item = soup.find('div', {'class': 'review_score'})
    if item:
        value = float(item.next_element)
        results['score'] = value
    item = soup.find('ul', {'class': 'review_list'})
    if item:
        items = item.find_all('li')
        if items:
            for item in items:
                name = item.get_text().replace('\n', '').split()
                value = float(name[1].strip(u'分'))
                name = model.Community.NAME_DICT[name[0]]
                if name in results:
                    continue
                results[name] = value
    # review
    item = soup.find('div', {'id': 'review_good'})
    if item:
        value = item.get_text().strip('\n').strip(u'小区优点').strip('\n')
        results['good_point'] = value
    item = soup.find('div', {'id': 'review_bad'})
    if item:
        value = item.get_text().strip('\n').strip(u'小区弱点').strip('\n')
        results['bad_point'] = value
    # sheshi_cell
    items = soup.find_all('div', {'class': 'sheshi_cell'})
    if items:
        for item in items:
            name = item.p.get_text()
            name = model.Community.NAME_DICT[name]
            value = item.img['src'].split('/')[-1].split('.')[0].split('_')[-1]
            value = value[0] != 'n'
            results[name] = value
    return results