Example #1
def get_player_stats():
    player_stats = []
    composition_stats = []
    player_hero_stats = []
    session = HTMLSession()
    res = session.get(player_stats_url())
    map_divs = res.html.find('.map-wrapper')
    table_divs = res.html.find('.side-by-side-stats')
    category = 'Allmaps'
    for div in res.html.find('.match-div > div'):
        if 'map-wrapper' in div.attrs.get('class', []):
            map_name = div.find(
                '.label-info', first=True).text.lower().replace(' ', '_')
        elif 'side-by-side-stats' in div.attrs.get('class', []):
            composition_stat, hero_stat = parse_overall_hero_stat_div(
                div, category=category, map_name=map_name)
            composition_stats += composition_stat
            player_hero_stats += hero_stat
            player_stats += parse_overall_stat_div(
                div, category=category, map_name=map_name)
        else:
            category = div.text
    write_json('stats/composition_stats.json', composition_stats)
    write_json('stats/player_hero_stats.json', player_hero_stats)
    write_json('stats/player_stats.json', player_stats)
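Several of these examples (#1, #6, #10 and #12) call a write_json helper that is not shown in this listing. A minimal sketch of what it presumably does, assuming it simply serializes to a path and creates the parent directory:

import json
import os


def write_json(path, data):
    # hypothetical helper: dump `data` as JSON at `path`, creating parent dirs first
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)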
Example #2
def get_media(user):
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    headers = {
        'User-Agent': 'My User Agent 1.0',
    }

    proxies = {
      'http': 'http://1.20.102.177:30106',
      'https': 'https://1.20.102.177:30106',
    }   
    url = 'https://www.instagram.com/' + user
    session = HTMLSession()
    req = session.get(url, headers=headers, proxies=proxies)
    
    media = []
    scripts = req.html.xpath('//script[@type]')    
    for s in scripts:
        content = s.text
        if "csrf_token" in content:
            content = content[:-1].split("window._sharedData = ")[1]      
            data = json.loads(content)     
            recent_media = data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["edge_owner_to_timeline_media"]["edges"]
            for r in recent_media:
                media.append({
                    "username": data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["username"],
                    "image": r["node"]["thumbnail_src"],
                    "timestamp": r["node"]["taken_at_timestamp"],
                    'permalink': r["node"]["display_url"],
                    'caption': r["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"],
                    'shortcode': r["node"]["shortcode"]
                })
    return media
Example #3
def main(dir: str):

    files = [x for x in os.listdir(dir) if x.lower()[-4:] == "json"]
    session = HTMLSession()
    os.makedirs(f"{dir}_solutions", exist_ok=True)
    for crossword_file in files:

        crossword = json.load(open(os.path.join(dir, crossword_file)))

        timestamp = crossword["dateSolutionAvailable"]
        year, month, day = parse_timestamp(timestamp)
        number = crossword["number"]
        crossword_type = crossword["crosswordType"]

        url = f"https://www.theguardian.com/crosswords/{year}/{month}/{day}/annotated-solutions-for-{crossword_type}-{number}"
        print(crossword["solutionAvailable"], url)
        result = session.get(url)
        if result.status_code >= 300:
            continue
        html = result.html
        relevant_divs = html.find("div.content__main-column.content__main-column--article.js-content-main-column")
        if len(relevant_divs) != 1:
            print(relevant_divs)

        solutions = [x.text for x in relevant_divs[0].find("p") if x.text]

        parsed = parse_solutions(solutions)
        save_name = os.path.join("crosswords/prize_solutions", f"{number}_solution.json")

        with open(save_name, "w+") as file:
            json.dump(parsed, file, indent=4)
Example #4
def show_datetime_for(name, url):
    session = HTMLSession()
    r = session.get(url)
    # '03:37:58'
    time = r.html.find('#ct', first=True).text
    # 'PDT'
    timezone = r.html.find('#cta', first=True).text
    # 'Saturday, 16 June 2018'
    date = r.html.find('#ctdat', first=True).text

    print(f'{name:12}: {time} {date} {timezone}')
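A hedged usage note: the element ids above (#ct, #cta, #ctdat) suggest a time.is-style clock page, so a call might look like the line below (the name and URL are assumptions, not part of the original):

show_datetime_for('Amsterdam', 'https://time.is/Amsterdam')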
Example #5
class MensaBase(object):

    def __init__(self, endpoints, location):
        """Constructor."""
        self.location = location
        # dict of language specific endpoints
        # { Language : url-string }
        self.endpoints = endpoints

        adapter = CacheControlAdapter(heuristic=ExpiresAfter(days=1))
        self.session = HTMLSession()
        self.session.mount('https://', adapter)

    def retrieve(self, datum=None, language=None, meals=None, emojize=None) -> Plan:
        # overwrite this
        # TODO how to make design more pythonic?
        # In Java terms: abstract class -> two implementation classes
        pass

    # Helper method to make a language-specific request
    def do_request(self, language=Language.DE):
        resp = self.session.get(self.endpoints[language.name])
        code = resp.status_code
        if code != 200:
            logger.warning(f'Non-200 status: {code}')
        logger.debug(f'Status Code: {code}')
        return resp.html

    @staticmethod
    def _normalize_key(k: str) -> str:
        return None if not k else k.strip().lower().replace(' ', '_')

    @staticmethod
    def _strip_additives(text: str) -> str:
        return re.sub(r'\((\s*(\d+)?[a-z]?[,.]?\s*)+\)', '', text)

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        return re.sub(r'\s{2,}', ' ', text)

    @staticmethod
    def _normalize_orthography(text: str) -> str:
        return re.sub(r'\s,', ',', text)

    @staticmethod
    def _clean_text(text: str) -> str:
        return MensaBase._normalize_orthography(MensaBase._normalize_whitespace(MensaBase._strip_additives(text.strip())))

    @staticmethod
    def _text_replace(text: str) -> str:
        return re.sub('Züricher', "Zürcher", text)
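For reference, a quick illustration of the cleaning helpers above (the sample menu string is invented for demonstration):

# additive markers such as "(1, 2a)" are stripped and spacing before commas is normalized
print(MensaBase._clean_text('Züricher Geschnetzeltes  (1, 2a) , mit Rösti'))
# -> Züricher Geschnetzeltes, mit Rösti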
Example #6
def get_teams_and_matches():
    session = HTMLSession()
    res = session.get(owl_index_url())
    res.html.render(timeout=60)
    match_rows = res.html.find(
        '.tab-pane#past')[0].find('table')[0].find('.past-matches-row')
    updated = True
    # TODO get match data in future
    # for row in match_rows:
    #     if parse_match_row(row):
    #         updated = True
    if updated:
        teams = {td.text: td.absolute_links.pop()
                 for td in res.html.find('td.team')}
        write_json('stats/team_hero_stats.json',
                   [parse_team(team_name, team_url) for team_name, team_url in teams.items()])
Example #7
    def handle(cls, *args, **kwargs):
        session = HTMLSession()

        for operator in Operator.objects.filter(service__current=True, twitter='').exclude(url='').distinct():
            try:
                r = session.get(operator.url, timeout=10)
            except RequestException:
                operator.url = ''
                operator.save()
                continue
            for link in r.html.links:
                twitter = cls.get_from_link(link)
                if twitter:
                    operator.twitter = twitter
                    operator.save()
                    break
Example #8
def fetch_hpps_streamflow(dst_dir, url=None):
    """
    Fetch streamflow data from chmi fetch_hpps_data
    """
    session = HTMLSession()
    n_charts = 0

    datatype_prefix = 'streamflow'
    agency = 'chmi'

    pagesize = 50
    n_pages = 20

    for page in range(0, n_pages):
        subpage_url = "http://hydro.chmi.cz/hpps/hpps_oplist.php?startpos={0}&recnum={1}".format(page*pagesize, pagesize)
        print("----------------------------------------------------")
        print(subpage_url)
        print("----------------------------------------------------")
        session = HTMLSession()
        r = session.get(subpage_url)
 
        for lnk in r.html.absolute_links:
            if 'prfdyn' in lnk:
                print(lnk)
                
                station_seq = lnk.split('=')[-1]
                print(station_seq)

                data_dir = dst_dir / datatype_prefix / agency / station_seq
                if not os.path.exists(data_dir):
                    os.makedirs(data_dir)
                utc_timestamp_text = datetime.utcnow().strftime('%Y-%m-%dT%H0000z.html')

                html_filename = "prfdata_" + station_seq + "_" + utc_timestamp_text
                html_path = data_dir / html_filename

                # save the HTML with seven-day table
                lnk_table = lnk.replace('prfdyn', 'prfdata')
                print(lnk_table)
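                # `get` below is presumably requests.get, imported outside this listing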
                html_response = get(lnk_table)
                if html_response.status_code == 200:
                    print(html_path)
                    with open(html_path, 'wb') as f:
                        f.write(html_response.content)
Example #9
class MM(object):
    def __init__(self):
        self.__page = 1
        self.__url = "http://www.mm131.com/qingchun/list_1_{}.html"
        self.__session = HTMLSession()
        self.__headers = {
            'Referer':'http://www.mm131.com/qingchun/',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        self.__imagePath = r'D:/Photo/MM'
        self.__confirmPath()

    def __confirmPath(self):
        if not os.path.exists(self.__imagePath):
            os.makedirs(self.__imagePath)
            
    def download(self,link,fileName):
        try:
            with open(self.__imagePath+'/'+fileName+'.jpg','wb') as f:
                f.write(self.__session.request('get',link,headers = self.__headers,allow_redirects=False).content)
        except Exception as e:
            print(str(e))

    def parseData(self):
        start = time.time()
        while self.__page < 12:
            if self.__page == 1:
                self.__url = "http://www.mm131.com/qingchun/"
            else:
                self.__url = 'http://www.mm131.com/qingchun/list_1_{}.html'.format(self.__page)
            r = self.__session.get(self.__url)
            main = r.html.find(".main",first=True)
            dl = main.find('dl')[0]
            dds = dl.find('dd')
            for dd in dds[:-1]:
                attr = dd.find('img')[0].attrs
                imageLink = attr['src']
                title = attr['alt']
                self.download(imageLink,title)
            self.__page += 1
        end = time.time() - start
        print("爬取时间:",end)
Example #10
def parse_match_row(row):
    match_path = os.path.join(
        'stats', 'matches', row.attrs['matchid'] + '.json')
    if os.path.exists(match_path):
        return False
    match = {}
    session = HTMLSession()
    match_res = session.get(match_url(row.attrs['matchid']))
    render_result = match_res.html.render(timeout=600)
    print(render_result)
    team_names = [{'name': team_name_div.text,
                   'id': team_name_div.links.pop().split('id=')[-1]} for team_name_div in match_res.html.find('.names-and-score', first=True).find('div')[1::2]]
    maps = []
    for map_div in match_res.html.find('.map-wrapper'):
        map_data = {'name': map_div.find(
            '.mapname', first=True).text, 'teams': []}
        mapping = {'name': 3, 'score': 4,
                   'progress': 5, 'fights': 6, 'kills': 7}
        for i in range(1, 3):
            team_data = {}
            for key, index in mapping.items():
                team_data[key] = map_div.find('div')[index].text.split('\n')[i]
            map_data['teams'].append(team_data)
        maps.append(map_data)
    stat_divs = match_res.html.find('.side-by-side-stats')
    overall_stats = parse_stat_div(stat_divs.pop(0))
    for i, map_stat_div in enumerate(stat_divs):
        maps[i]['stats'] = parse_stat_div(map_stat_div)

    hero_stats = parse_hero_stat_div(match_res.html.find(
        '#allMapsAllRoundsAllTeams', first=True))
    hero_stats_by_team = []
    # TODO FIX the script problem
    # for team in team_names:
    #     hero_stats_by_team.append(parse_hero_stat_div(match_res.html.find(
    #         '#allMapsAllRoundsTeam' + team['id'], first=True)))
    write_json(match_path, {'maps': maps, 'stats': overall_stats, 'hero_stats': hero_stats,
                            'hero_stats_by_team': hero_stats_by_team,
                            'teams': team_names, 'date': row.find('td')[0].text})
    return True
Example #11
def main(crossword_types: List[str]):

    session = HTMLSession()
    for crossword_type in crossword_types:

        if crossword_type not in CROSSWORDS.keys():
            raise ValueError(f"crossword type must be one of {CROSSWORDS.keys()}")
        start, end = CROSSWORDS[crossword_type]

        os.makedirs(f"crosswords/{crossword_type}", exist_ok=True)
        for crossword_no in reversed(range(start, end)):

            try:
                url = "https://www.theguardian.com/crosswords/" + crossword_type + "/" + str(crossword_no)
                result = session.get(url)
                if result.status_code >= 300:
                    continue
                html = result.html
                try:
                    relevant_divs = html.find("div.js-crossword")
                    if len(relevant_divs) != 1:
                        print(relevant_divs)
                    clues = relevant_divs[0].attrs["data-crossword-data"]
                except Exception:
                    relevant_divs = html.find("div.js-crossword.has-grouped-clues")
                    if len(relevant_divs) != 1:
                        print(relevant_divs)
                    clues = relevant_divs[0].attrs["data-crossword-data"]

                clues_json = json.loads(clues)
                save_name = clues_json["id"] + ".json"

                with open(save_name, "w+") as file:
                    json.dump(clues_json, file, indent=4)
            except IndexError:
                print("couldn't find crossword no:{}".format(crossword_no))
                with open("crosswords/" + crossword_type + "/missing_ids.txt", "a+") as file:
                    file.write(str(crossword_no) + "\n")
Example #12
def get_event_player_rank():
    session = HTMLSession()
    res = session.get(player_rank_url())
    table = res.html.find('table.ranking-table', first=True)
    player_ranks = []
    hero_ranks = []
    for tr in table.find('tr')[2:]:

        overall_rank = int(tr.find('td.rank', first=True).text)
        overall_rating = int(tr.find('.rating-number', first=True).text)
        team_name = tr.find('.small-team-logo',
                            first=True).attrs['title'].split(': ')[-1]
        stars = int(tr.find(
            '.star-rating', first=True).attrs['class'][-1].replace('star', '').split('-')[0])
        info_div, heros_div = tr.find('.team-info-td > div')
        name = info_div.find('a', first=True).text
        time, sos_rank, win_percent = [div.text.split(
            ': ')[-1] for div in info_div.find('.secondary-stats')]
        rank_data = {'overall_rank': overall_rank, 'overall_rating': overall_rating, 'team_name': team_name,
                     'stars': stars, 'name': name, 'time': time, 'sos_rank': int(sos_rank),
                     'win_percent': win_percent, 'hero_ranks': []}
        for span in heros_div.find('span.secondary-ranking'):
            hero_name = span.attrs['title'].split(' Rank:')[0].lower()
            hero_rank_by_total, hero_rating, hero_time, hero_win_percent = [
                text.split(': ')[-1] for text in span.attrs['title'].split('\n')]
            hero_rank, total_count = hero_rank_by_total.split('/')
            hero_rank_data = {'hero_name': hero_name, 'hero_rank_by_total': hero_rank_by_total,
                              'hero_rating': int(hero_rating), 'hero_time': hero_time,
                              'hero_win_percent': hero_win_percent, 'hero_rank': int(hero_rank),
                              'total_count': int(total_count), 'name': name, 'overall_rank': overall_rank,
                              'overall_rating': overall_rating, 'team_name': team_name, 'stars': stars, }
            hero_ranks.append(hero_rank_data)
            rank_data['hero_ranks'].append(hero_rank_data)
        player_ranks.append(rank_data)
    write_json('stats/player_ranks.json', player_ranks)
    write_json('stats/hero_ranks.json', hero_ranks)
Example #13
import sys
import json
from requests_html import HTMLSession

GOOGLE = 'https://www.google.com/search?tbm=isch&q='

if len(sys.argv) == 2:

    q = str(sys.argv[1])

    session = HTMLSession()
    r = session.get(GOOGLE + q)
    r.html.render()
    colors = r.html.search_all("rgb({rgb})")

    palette = []
    for c in colors:
        palette.append(c['rgb'])
    wordPalette = {q: palette}

    print(json.dumps(wordPalette))
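A hedged usage note for the snippet above (the script name is hypothetical; the actual colors depend on what Google returns at run time):

# run as:  python palette.py sunset
# prints a JSON object mapping the query to the rgb(...) values found on the rendered page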
Example #14
def fetch_vodagov_charts(dst_dir, agency, base_url, subpages, datatype_prefix):
    """
    Fetch graphs and html tables from voda.gov.cz
    fetch_vodagov_charts(dst_dir='/home/jiri/meteodata',
                         agency='pod',
                         base_url='http://www.pvl.cz/portal/SaP/pc/?',
                         subpages=['oid=1', 'oid=2'],
                         datatype_prefix='streamflow')

    :param dst_dir: destination directory where to save the data (subdirs are created automatically)
    :param base_url: the base url [for example http://www.pvl.cz/portal/SaP/pc/? for streamflow,
                                               http://www.pvl.cz/portal/srazky/pc/? for precipitation]
    :param subpages: the list of sub-pages (for example ['oid=1', 'oid=2', 'oid=3'])
    :param datatype_prefix: the data type. use 'streamflow' or 'precip'
    :param agency: the short name of the operating agency. use pla, poh, pod, pvl or pmo
    :return: number of charts and html pages downloaded
    """

    #if datatype_prefix == 'streamflow':
        #pvl_base = 'http://sap.poh.cz/portal/SaP/pc/?'
    #else:
        #pvl_base = 'http://sap.poh.cz/portal/Srazky/PC/?'

    session = HTMLSession()
    n_charts = 0

    for subpage in subpages:

        url = base_url + subpage
        print('-----------------------------')
        print(url)
        print('-----------------------------')
        r = session.get(url)

        for lnk in r.html.absolute_links:
            if 'Mereni.aspx?id=' in lnk or 'mereni.aspx?id=' in lnk:

                try:

                    r_st = session.get(lnk)

                    images = r_st.html.find('img')
                    for img in images:
                        if 'src' not in img.attrs:
                            continue
                        src = img.attrs['src']
                        if ('graf' in src or 'Graf' in src) and ('miniatury' not in src) and ("&" not in src) and (".ashx" not in src):

                            if 'maska' in src:
                                continue

                            img_src_absolute = urljoin(lnk, src)

                            img_response = get(img_src_absolute)
                            if img_response.status_code == 200:

                                img_dir = os.path.join(dst_dir, datatype_prefix, agency, os.path.splitext(os.path.basename(img_src_absolute))[0])
                                if not os.path.exists(img_dir):
                                    os.makedirs(img_dir)
                                utc_timestamp_text = datetime.utcnow().strftime('_%Y-%m-%dT%H0000z.png')

                                img_filename = os.path.basename(img_src_absolute).replace('.png', utc_timestamp_text)

                                img_path = os.path.join(img_dir, img_filename)
                                print(img_path)
                                with open(img_path, 'wb') as f:
                                    f.write(img_response.content)

                                # also save the HTML
                                html_path = img_path.replace('.png', '.html')
                                html_response = get(lnk)
                                if html_response.status_code == 200:
                                    print(html_path)
                                    with open(html_path, 'wb') as f:
                                        f.write(html_response.content)

                            n_charts += 1

                except ValueError:
                    print('ERROR fetching ' + lnk)
    return n_charts
Example #15
    def start_request(self, url):
        session = HTMLSession()
        header = dict()
        header['user-agent'] = random.choice(self.USER_AGENT_LIST)
        response = session.get(url, headers=header)
        return response
Example #16
def parse_team(team_name, team_url):
    session = HTMLSession()
    comp_divs = session.get(team_url).html.find('.team-comp-wrapper > .team-comp')
    return {'id': team_url.split('id=')[-1], 'name': team_name,
            'heros': [{'win_rate': div.text,
                       'heros': [span.attrs['title'].replace('soldier76', 'soldier-76')
                                 for span in div.find('span')]}
                      for div in comp_divs]}
Example #17
from requests_html import HTMLSession
session = HTMLSession()
r = session.get("http://money.finance.sina.com.cn/corp/go.php/" +
                      "vMS_MarketHistory/stockid/601006.phtml?year=2018&jidu=2")
table = r.html.xpath("//*[@id='FundHoldSharesTable']")[0]
trArray = table.xpath("//tr")
trArray = trArray[2:len(trArray)]
gpList = list()
for tr in trArray:
    gp = list()
    for td in tr.xpath("//td"):
        gp.append(td.text)
    gpList.append(gp)
for gp in gpList:
    print(gp)




Example #18
# -*- coding:utf8 -*-
# @Time: 2021/10/21 10:15 AM
# @Author: Huang Jeff

from requests_html import HTMLSession

name = "猫"
url = f"https://unsplash.com/s/photos/{name}"

session = HTMLSession()

result = session.get(url)

print(result.status_code)
print(
    result.html.xpath('//figure[@itemprop="image"]//a[@rel="nofollow"]/@href'))

# down_list = result.html.xpath('//figure[@itemprop="image"]//a[@rel="nofollow"]/@href')

# def get_picID_from_url():

# def down_one_pic(url):
#     result = session.get(url)
#     filename = get_picID_from_url(url)
#     with open(filename, "wb") as f:
#         f.write(result.content)

# for one_url in down_list:
#     down_one_pic(one_url)
Example #19
def update_econproj(url, baseline, text_args):
    """
    Function that will read new CBO economic projections and update
    CBO_baseline.csv accordingly
    Parameters
    ----------
    url: URL linking to the CBO website with recurring budget and economic projection data
    baseline: CBO baseline we're updating
    text_args: Dictionary containing the arguments that will be passed to
        the documentation template
    Returns
    -------
    baseline: Updated baseline numbers
    text_args: Updated dictionary with text arguments to fill in the template
    """
    print("Updating CBO Economic Projections")
    # pull all of the latest CBO reports and use them for needed updates
    session = HTMLSession()
    r = session.get(url)
    divs = r.html.find("div.view.view-recurring-data")
    revprojections = divs[4]
    # both assertions are there to throw errors if the order of sections changes
    # revenue projections used for capital gains projections
    assert "Revenue Projections" in revprojections.text
    latest_revprojections = revprojections.find("div.views-col.col-1")[0]
    rev_link = latest_revprojections.find("a")[0]
    _rev_report = datetime.strptime(rev_link.text, "%b %Y")
    rev_report = datetime.strftime(_rev_report, "%B %Y")
    rev_url = rev_link.attrs["href"]

    econprojections = divs[8]
    assert "10-Year Economic Projections" in econprojections.text
    latest_econprojections = econprojections.find("div.views-col.col-1")[0]
    econ_link = latest_econprojections.find("a")[0]
    _cbo_report = datetime.strptime(econ_link.text, "%b %Y")
    cbo_report = datetime.strftime(_cbo_report, "%B %Y")
    econ_url = econ_link.attrs["href"]

    if cbo_report == text_args["current_cbo"]:
        print("\tNo new data since last update")
    else:
        # read in economic projections
        econ_proj = pd.read_excel(econ_url,
                                  sheet_name="2. Calendar Year",
                                  skiprows=6,
                                  index_col=[0, 1, 2, 3])
        # extract values for needed rows in the excel file
        # some variables have a missing value in the multi-index. Use iloc
        # to extract needed variables from them.
        gdp = econ_proj.loc["Output"].loc["Gross Domestic Product (GDP)"].iloc[
            0]
        income = econ_proj.loc["Income"]
        tpy = income.loc["Income, Personal"].iloc[0]
        wages = income.loc["Wages and Salaries"].iloc[0]
        billions = "Billions of dollars"
        var = "Proprietors' income, nonfarm, with IVA & CCAdj"
        schc = income.loc["Nonwage Income"].loc[var].loc[billions]
        var = "Proprietors' income, farm, with IVA & CCAdj"
        schf = income.loc["Nonwage Income"].loc[var].loc[billions]
        var = "Interest income, personal"
        ints = income.loc["Nonwage Income"].loc[var].loc[billions]
        var = "Dividend income, personal"
        divs = income.loc["Nonwage Income"].loc[var].loc[billions]
        var = "Income, rental, with CCAdj"
        rents = income.loc["Nonwage Income"].loc[var].loc[billions]
        book = income.loc["Profits, Corporate, With IVA & CCAdj"].iloc[0]
        var = "Consumer Price Index, All Urban Consumers (CPI-U)"
        cpiu = econ_proj.loc["Prices"].loc[var].iloc[0]
        var_list = [gdp, tpy, wages, schc, schf, ints, divs, rents, book, cpiu]
        var_names = [
            "GDP",
            "TPY",
            "Wages",
            "SCHC",
            "SCHF",
            "INTS",
            "DIVS",
            "RENTS",
            "BOOK",
            "CPIU",
        ]
        df = pd.DataFrame(var_list, index=var_names).round(1)
        df.columns = df.columns.astype(str)
        df_cols = set(df.columns)
        baseline_cols = set(baseline.columns)
        for col in df_cols - baseline_cols:
            baseline[col] = None
        baseline.update(df)

        text_args["previous_cbo"] = text_args["current_cbo"]
        text_args["current_cbo"] = cbo_report

    if rev_report == text_args["cgns_prev_report"]:
        print("\tNo new data since last update")
        return baseline, text_args
    elif rev_link.text == "Mar 2020":
        msg = (
            "\nCapital gains realizations are not included in CBO's March 2020"
            " revenue projections publication. It's unclear if this is a "
            "permanent change or due to the pandemic. For now, we will skip "
            "this update and re-evaluate when they release their next "
            "projections.\n")
        print(msg)
        return baseline, text_args
    else:
        # Extract capital gains data
        cg_proj = pd.read_excel(
            rev_url,
            sheet_name="6. Capital Gains Realizations",
            skiprows=7,
            header=[0, 1],
        )
        cg_proj.index = cg_proj[cg_proj.columns[0]]
        var = "Capital Gains Realizationsa"
        cgns = cg_proj[var]["Billions of Dollars"].loc[list(range(2017, 2032))]
        var_list = [cgns]
        var_names = ["CGNS"]
        df = pd.DataFrame(var_list, index=var_names).round(1)
        df.columns = df.columns.astype(str)
        # update baseline file with the new data

        # add a column for any years that are in the update but not yet in the
        # CBO baseline file before updating the values
        df_cols = set(df.columns)
        baseline_cols = set(baseline.columns)
        for col in df_cols - baseline_cols:
            baseline[col] = None
        baseline.update(df)

        text_args["cgns_prev_report"] = text_args["cgns_cur_report"]
        text_args["cgns_prev_url"] = text_args["cgns_cur_url"]
        text_args["cgns_cur_report"] = rev_report
        text_args["cgns_cur_url"] = rev_url

    return baseline, text_args
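For context, a hedged sketch of how update_econproj is presumably driven (the CSV name comes from the docstring; the URL and the text_args values are assumptions, not taken from the original project):

import pandas as pd

baseline = pd.read_csv("CBO_baseline.csv", index_col=0)
text_args = {
    "current_cbo": "February 2021",      # hypothetical report labels
    "previous_cbo": "July 2020",
    "cgns_prev_report": "January 2020",
    "cgns_cur_report": "January 2020",
    "cgns_prev_url": "",
    "cgns_cur_url": "",
}
baseline, text_args = update_econproj(
    "https://www.cbo.gov/data/budget-economic-data",  # assumed CBO recurring-data page
    baseline,
    text_args,
)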
Example #20
from requests_html import HTMLSession
import json
from pprint import pprint
import io

session = HTMLSession()
r = session.get('https://www.qiushibaike.com/text/')
# # full page HTML
# print(r.html.html)
#
# # links on the page
# print(r.html.links)
# print(r.html.absolute_links)

# # front-page menu text
# print(r.html.find('div#menu', first=True).text)
# # front-page menu element
# print(r.html.find('div#menu a', first=True))
# # joke contents
# print(list(map(lambda x: x.text, r.html.find('div.content span'))))

# print(r.html.xpath("//div[@id='menu']", first=True).text)
# print(r.html.xpath("//div[@id='menu']/a"))
# print(r.html.xpath("//div[@class='content']/span/text()"))

# get a single element
# e = r.html.find("div#hd_logo", first=True)
# print(e.text)
# print(e.attrs)
# print(e.absolute_links)
# print(e.links)
Example #21
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'


# pip install requests-html
from requests_html import HTMLSession


session = HTMLSession()
rs = session.get('https://coronavirus-monitor.ru/statistika/')
with open('rs_before_js.html', 'w', encoding='utf-8') as f:
    f.write(rs.html.html)

rs.html.render()  # without this, the JS code will not be executed

with open('rs_after_js.html', 'w', encoding='utf-8') as f:
    f.write(rs.html.html)
Example #22
#!/usr/bin/python
# -*- coding:utf-8 -*-


from requests_html import HTMLSession
import json

dict2json = {}
session = HTMLSession()
r = session.get('https://ckb.jax.org/gene/grid')
print(len(r.html.links))

gene_link_dic = {gene.text:tuple(gene.absolute_links)[0] for gene in r.html.find('div.container-fluid div:nth-child(3) a')}
del gene_link_dic['']
del gene_link_dic['Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License']
for gene, link in gene_link_dic.items():
    r = session.get(link)
    if 'Additional content available in ' in r.html.html:
        pass
    else:
        selector2tr = 'div.container-fluid div:nth-child(4) div.col-lg-12 div.tab-content div#geneVariants tbody tr'
        gene_variants_tr_list = r.html.find(selector2tr)
        # gene_variants_tr_html = HTML(gene_variants_tr_list)
        dict2json[gene] = []
        for variant in gene_variants_tr_list:
            variant_info_list = variant.find('td')
            dict2json[gene].append({'Variant':variant_info_list[0].text, 'Impact':variant_info_list[1].text, 'Protein Effect':variant_info_list[2].text, 'Variant Description':variant_info_list[3].text, 'Associated with drug Resistance':variant_info_list[4].text})

with open('gene_data.json', 'w') as f:
    json.dump(dict2json, f)
Example #23
class Spider(object):
    """
    sina.com.cn Spider
    """
    def __init__(self, db):
        self.url = args.url
        self.depth = args.depth
        self.keyword = args.keyword
        self.db = db
        self.analysed_url = {}
        self.url_visiting = {}
        self.url_keyword = {}
        self.lock = threading.Lock()
        self.rlock = threading.RLock()
        self._session = HTMLSession()
        self._thread_pool = ThreadPool(args.thread, fn=self.analyse)
        self._thread_pool.sumbit((args.url, 0))
        logging.debug('start Spider url={} keyword={} depth={}'.format(
            self.url, self.keyword, self.depth))

    def analyse(self, task):
        """
        get html response
        :param task:   (url, level)
        :return:
        """
        url, level = task
        if not self.is_visited(url, level):
            logging.debug(
                'got task {} which has already been visited; check that the mutex is used correctly'
                .format(task))
            return
        try:
            response = self._session.get(url)
            logging.debug('GET {}'.format(url))

        except Exception as e:
            logging.error("requests error %s" % e)
        else:
            content_type = self.content_type(response)
            if '/html' in content_type:
                has_key = self.has_keyword(response)
                if has_key:
                    try:
                        self.lock.acquire()
                        self.url_keyword[url] = level
                        logging.info('url {} with keyword={}'.format(
                            url, self.keyword))
                    except Exception as e:
                        logging.error(e)
                    finally:
                        self.lock.release()
                links = self.extract_link(response)
                self.submit_links2queue(links, level + 1)
            else:
                logging.warning('url {} Content-Type={} not supported'.format(
                    url, content_type))
        finally:
            self.add_url2analysed(url, level)

    def is_visited(self, url, level):
        # check whether the url has already been crawled
        flag = False
        try:
            self.lock.acquire()
            if url not in self.url_visiting and url not in self.analysed_url:
                self.url_visiting[url] = level
                flag = True
        except Exception as e:
            pass
        finally:
            self.lock.release()
        return flag

    def content_type(self, r: HTMLResponse) -> str:
        # content type of the page
        return r.headers.get('Content-Type', '')

    def has_keyword(self, r):
        # whether the page contains the keyword
        text = ""
        try:
            text = r.html.text
        except Exception as e:
            logging.error(e)

        if self.keyword in text:
            logging.debug('{} in url {}'.format(self.keyword, r.url))
            return True
        else:
            logging.debug('{} not in url {}'.format(self.keyword, r.url))
            return False

    def submit_links2queue(self, links, level):
        """
        向队列中添加新任务
        :param links: 本次爬取页面中的所有链接
        :param level: 本次页面所处深度
        :return:
        """
        if level > self.depth:
            logging.debug('links {} beyond max_depth={}'.format(links, level))
            return
        for link in links:
            if link.startswith(self.url):  # only follow links within the same domain
                if link not in self.url_visiting and link not in self.analysed_url:
                    # submit to the queue
                    self._thread_pool.sumbit((link, level))

    def add_url2analysed(self, url, level):
        """
        将 url 加入到已经分析的 url 字典中
        :param url:
        :return:
        """
        try:
            self.lock.acquire()
            # add to analysed_url before deleting from url_visiting, to avoid re-crawling
            self.analysed_url[url] = level
            del self.url_visiting[url]
        except Exception as e:
            logging.error(e)
        finally:
            self.lock.release()
            logging.info('finish url {} in level {}'.format(url, level))

    def extract_link(self, r: HTMLResponse):
        """
        提取 html 中的链接,需要分析相对链接和绝对链接
        :return:
        """
        relative_links = r.html.links - r.html.absolute_links
        all_links = r.html.absolute_links | self.relative2absolute(
            relative_links, r.html.base_url)
        logging.info('links {} in url {}'.format(all_links, r.url))
        return all_links

    def relative2absolute(self, relative_links, base_url):
        """
        将相对链接拼接成绝对链接
        :param relative_links:
        :param base_url:
        :return:
        """
        return {base_url + x for x in relative_links}

    def has_finished(self):
        """
        查询当前爬虫任务是否已经完成
        :return:
        """
        f = self._thread_pool.is_over()
        if not f:
            self.persist2db()
            return f
        return f

    def persist2db(self):
        """
        将 url 持久化到 sqlite db
        :return:
        """
        with self.rlock:
            self.rlock.acquire()
            t = self.url_keyword
            self.url_keyword = {}

        for url in t:
            content = self._session.get(url).content.decode("utf8", "ignore")
            self.db.insert(self.keyword, url, content)

    def progress(self) -> tuple:
        """
        反馈进度消息
        :return:
        """
        return self._thread_pool.progress()

    def add_url_with_keyword(self, url, level):
        """
        发现页面包含关键字的 url
        :param url:
        :param level:
        :return:
        """
        with self.rlock:
            self.url_keyword[url] = level
        logging.info('url {} with keyword={}'.format(url, self.keyword))
Example #24
# NOTE: the top of this script is truncated in the listing; the imports and the
# session object below are assumed so that the remainder runs as written.
from requests_html import HTMLSession
import json

session = HTMLSession()

print('########################################\n\033[0m')

urllist = [
    'https://www.nicovideo.jp/tag/VOCAL_Character%E3%83%A9%E3%83%B3%E3%82%AD%E3%83%B3%E3%82%B0?sort=f&order=d'
]
# for i in range(1,13):
#     urllist.append('https://www.nicovideo.jp/tag/週刊VOCALOIDとUTAUランキング?sort=f&order=d&page='+str(i))

mylist = {}
with open('../json/episodelist.json') as json_file:
    mylist = json.load(json_file)

for url in urllist:
    print(url)
    r = session.get(url)
    # seltit = 'body > div.BaseLayout > div.container.columns.column700-300 > div > div.column.main > div.contentBody.video.uad.videoList.videoList01 > ul:nth-child(2) > li > div.itemContent > p > a'
    # seldes = 'body > div.BaseLayout > div.container.columns.column700-300 > div > div.column.main > div.contentBody.video.uad.videoList.videoList01 > ul:nth-child(2) > li > div.itemContent > div.wrap > p.itemDescription'
    sel = 'body > div.BaseLayout > div.container.columns.column700-300 > div > div.column.main > div.contentBody.video.uad.videoList.videoList01 > ul:nth-child(2) > li > div.itemContent'

    vresults = r.html.find(sel)

    for vresult in vresults:
        if len(vresult.find('a')) <= 0 or len(
                vresult.find('p.itemDescription')) <= 0:
            continue

        vtit = list(vresult.find('a'))[0]
        vdes = list(vresult.find('p.itemDescription'))[0]
        if len(vtit.absolute_links) <= 0: continue
Example #25
def scrap(given_name: str, given_url, given_model_no=None):
    """
    :param given_model_no:
    :param given_name:
    :param given_url:
    :return: List of Scraped data, Data error count and Keyword
    """
    if given_model_no is not None:
        links = get_links(given_name, given_url, given_model_no)
    else:
        links = get_links(given_name, given_url)

    if len(links) < 1:
        return []

    data_list = []
    number = 1
    for link in links:
        print(f'Getting data from link {number} of {len(links)}...')
        url = link.find('.name.fn.l_mgn-tb-sm.l_dsp-blc')[0].attrs['href']
        session = HTMLSession()
        r = session.get(url)

        number += 1
        try:
            t1 = datetime.now()

            try:
                title = clean_text(r.html.find('.product-name')[0].text)
                sku = r.html.find('.product-id.meta.quiet.p_txt-sm')[-1].text
            except IndexError:
                continue
            except Exception as e:
                n = e
                continue

            try:
                prd_price = clean_price(
                    r.html.find('.price-device>script')[0].text)
            except Exception as e:
                n = e
                # print(f'\n{e} price\n{title}\n\n')
                prd_price = '0'

            try:
                merchant = clean_text(r.html.find('#sellerProfile')[0].text)
            except Exception as e:
                n = e
                merchant = 'NA'

            timestamp = datetime.now()
            main = {
                'name': title,
                'price': prd_price,
                'timestamp': timestamp,
                'merchant': merchant,
                'time': (datetime.now() - t1).total_seconds(),
                'url': url,
                'sku': sku,
            }
            data_list.append(main)
        except AttributeError:
            pass

    return data_list
Example #26
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from requests_html import HTMLSession
from urllib.request import urlopen as uReq
session = HTMLSession()
r = session.get("https://www.wma.net/publications/wma-annual-report/")
html = r.html.html
pageSoup = soup(html, "html.parser")
pdf_soup = pageSoup.find_all("a", {"target": "_blank"})
for i in range(len(pdf_soup)):
    print(
        "-----------------------------------------------------------------------------------------------------------"
    )
    print(pdf_soup[i].get('href'))
Example #27
def runScraper():
    storeCities()

    conn = connect()

    #the cities table contains around 480 cities, all of the craigslist pages in north america
    curs = conn.cursor()
    curs.execute("SELECT * FROM cities")

    citiesList = []
    for city in curs.fetchall():
        citiesList.append(city)

    curs.execute(
        '''CREATE TABLE IF NOT EXISTS vehicles(id BIGINT PRIMARY KEY, url TEXT, region TEXT, region_url TEXT, 
    price BIGINT, year BIGINT, manufacturer TEXT, model TEXT, condition TEXT, cylinders TEXT, fuel TEXT, 
    odometer BIGINT, title_status TEXT, transmission TEXT, VIN TEXT, drive TEXT, size TEXT, type TEXT, paint_color TEXT, image_url TEXT, 
    description TEXT, county TEXT, state TEXT, lat REAL, long REAL)''')

    session = HTMLSession()

    #scraped counts all entries gathered
    scraped = 0

    #carBrands dictate what qualifies as a brand so we can snatch that data from the 'model' tag
    carBrands = [
        "ford", "toyota", "chevrolet", "chev", "chevy", "honda", "jeep",
        "hyundai", "subaru", "kia", "gmc", "ram", "dodge", "mercedes-benz",
        "mercedes", "mercedesbenz", "volkswagen", "vw", "bmw", "saturn",
        "land rover", "landrover", "pontiac", "mitsubishi", "lincoln", "volvo",
        "mercury", "harley-davidson", "harley", "rover", "buick", "cadillac",
        "infiniti", "infinity", "audi", "mazda", "chrysler", "acura", "lexus",
        "nissan", "datsun", "jaguar", "alfa", "alfa-romeo", "aston",
        "aston-martin", "ferrari", "fiat", "hennessey", "porche", "noble",
        "morgan", "mini", "tesla"
    ]

    #if the car year is beyond next year, we toss it out. this variable is used later
    nextYear = datetime.now().year + 1

    #simple txt file mechanism to track scraping progress
    fileName = os.path.dirname(
        os.path.abspath(__file__)) + "/static/trackVehicleScraping.txt"
    exists = os.path.isfile(fileName)
    if not exists:
        tracker = open(fileName, "w")
        tracker.write("0")
        tracker.close()

    with open(fileName, "r") as tracker:
        cities = int(tracker.readlines()[0])
    citiesCount = len(citiesList)
    citiesList = citiesList[cities:]

    for city in citiesList:
        scrapedInCity = 0
        cities += 1
        print(
            f"Scraping vehicles from {city[2]}, {citiesCount - cities} cities remain"
        )
        empty = False

        #scrapedIds is used to store each individual vehicle id from a city, therefore we can delete vehicle records from the database
        #if their id is no longer in scrapedIds under the assumption that the entry has been removed from craigslist
        scrapedIds = set([])

        #track items skipped that are already in the database
        skipped = 0

        #this loop executes until we are out of search results, craigslist sets this limit at 3000 and cities often contain the full 3000 records (but not always)
        while not empty:
            print(
                f"Gathering entries {scrapedInCity} through {scrapedInCity + 120}"
            )

            #now we scrape
            try:
                searchUrl = f"{city[1]}/d/cars-trucks/search/cta?s={scrapedInCity}"
                page = session.get(searchUrl)
            except Exception as e:
                #catch any exception and continue the loop if we cannot access a site for whatever reason
                print(
                    f"Failed to reach {searchUrl}, entries have been dropped: {e}"
                )
                scrapedInCity += 120
                continue

            #each search page contains 120 entries
            scrapedInCity += 120
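            # `html` here is lxml.html (imported outside this listing), not requests_html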
            tree = html.fromstring(page.content)

            #the following line returns a list of urls for different vehicles
            vehicles = tree.xpath('//a[@class="result-image gallery"]')
            if len(vehicles) == 0:
                #if we no longer have entries, continue to the next city
                empty = True
                continue
            vehiclesList = []
            for item in vehicles:
                vehicleDetails = []
                vehicleDetails.append(item.attrib["href"])
                try:
                    #this code attempts to grab the price of the vehicle. some vehicles dont have prices (which throws an exception)
                    #and we dont want those which is why we toss them
                    vehicleDetails.append(item[0].text)
                except:
                    continue
                vehiclesList.append(vehicleDetails)

            #loop through each vehicle
            for item in vehiclesList:
                url = item[0]
                try:
                    idpk = int(url.split("/")[-1].strip(".html"))
                except ValueError as e:
                    print("{} does not have a valid id: {}".format(url, e))
                    continue

                #add the id to scrapedIds for database cleaning purposes
                scrapedIds.add(idpk)

                #vehicle id is a primary key in this database so we cant have repeats. if a record with the same url is found, we continue
                #the loop as the vehicle has already been stored
                curs.execute(f"SELECT 1 FROM vehicles WHERE id = {idpk}")
                if len(curs.fetchall()) != 0:
                    skipped += 1
                    continue

                vehicleDict = {}
                vehicleDict["price"] = int(item[1].strip("$"))

                try:
                    #grab each individual vehicle page
                    page = session.get(url)
                    tree = html.fromstring(page.content)
                except:
                    print(f"Failed to reach {url}, entry has been dropped")
                    continue

                attrs = tree.xpath('//span//b')
                #this fetches a list of attributes about a given vehicle. each vehicle does not have every specific attribute listed on craigslist
                #so this code gets a little messy as we need to handle errors if a car does not have the attribute we're looking for
                for item in attrs:
                    try:
                        #model is the only attribute without a specific tag on craigslist, so if this code fails it means that we've grabbed the model of the vehicle
                        k = item.getparent().text.strip()
                        k = k.strip(":")
                    except:
                        k = "model"
                    try:
                        #this code fails if item=None so we have to handle it appropriately
                        vehicleDict[k] = item.text.strip()
                    except:
                        continue

                #we will assume that each of these variables are None until we hear otherwise
                #that way, try/except clauses can simply pass and leave these values as None
                price = None
                year = None
                manufacturer = None
                model = None
                condition = None
                cylinders = None
                fuel = None
                odometer = None
                title_status = None
                transmission = None
                VIN = None
                drive = None
                size = None
                vehicle_type = None
                paint_color = None
                image_url = None
                lat = None
                long = None
                description = None

                #now this code gets redundant. if we picked up a specific attr in the vehicleDict then we can change the variable from None.
                #integer attributes (price/odometer) are handled in case the int() is unsuccessful, but i have never seen that be the case
                if "price" in vehicleDict:
                    try:
                        price = int(vehicleDict["price"])
                    except Exception as e:
                        print(f"Could not parse price: {e}")
                if "odomoter" in vehicleDict:
                    try:
                        odometer = int(vehicleDict["odometer"])
                    except Exception as e:
                        print(f"Could not parse odometer: {e}")
                if "condition" in vehicleDict:
                    condition = vehicleDict["condition"]
                if "model" in vehicleDict:
                    #model actually contains 3 variables that we'd like: year, manufacturer, and model (which we call model)
                    try:
                        year = int(vehicleDict["model"][:4])
                        if year > nextYear:
                            year = None
                    except:
                        year = None
                    model = vehicleDict["model"][5:]
                    foundManufacturer = False
                    #we parse through each word in the description and search for a match with carBrands (at the top of the program)
                    #if a match is found then we have our manufacturer, otherwise we set model to the entire string and leave manu blank
                    for word in model.split():
                        if word.lower() in carBrands:
                            foundManufacturer = True
                            model = ""
                            #resolve conflicting manufacturer titles
                            manufacturer = word.lower()
                            if manufacturer == "chev" or manufacturer == "chevy":
                                manufacturer = "chevrolet"
                            if manufacturer == "mercedes" or manufacturer == "mercedesbenz":
                                manufacturer = "mercedes-benz"
                            if manufacturer == "vw":
                                manufacturer = "volkswagen"
                            if manufacturer == "landrover":
                                manufacturer = "land rover"
                            if manufacturer == "harley":
                                manufacturer = "harley-davidson"
                            if manufacturer == "infinity":
                                manufacturer = "infiniti"
                            if manufacturer == "alfa":
                                manufacturer = "alfa-romeo"
                            if manufacturer == "aston":
                                manufacturer = "aston-martin"
                            continue
                        if foundManufacturer:
                            model = model + word.lower() + " "
                    model = model.strip()
                if "cylinders" in vehicleDict:
                    cylinders = vehicleDict["cylinders"]
                if "fuel" in vehicleDict:
                    fuel = vehicleDict["fuel"]
                if "odometer" in vehicleDict:
                    odometer = vehicleDict["odometer"]
                if "title status" in vehicleDict:
                    title_status = vehicleDict["title status"]
                if "transmission" in vehicleDict:
                    transmission = vehicleDict["transmission"]
                if "VIN" in vehicleDict:
                    VIN = vehicleDict["VIN"]
                if "drive" in vehicleDict:
                    drive = vehicleDict["drive"]
                if "size" in vehicleDict:
                    size = vehicleDict["size"]
                if "type" in vehicleDict:
                    vehicle_type = vehicleDict["type"]
                if "paint color" in vehicleDict:
                    paint_color = vehicleDict["paint color"]

                #now lets fetch the image url if exists

                try:
                    img = tree.xpath(
                        '//div[@class="slide first visible"]//img')
                    image_url = img[0].attrib["src"]
                except:
                    pass

                #try to fetch lat/long and city/state, remain as None if they do not exist

                try:
                    location = tree.xpath("//div[@id='map']")
                    lat = float(location[0].attrib["data-latitude"])
                    long = float(location[0].attrib["data-longitude"])
                except Exception as e:
                    pass

                #try to fetch a vehicle description, remain as None if it does not exist

                try:
                    location = tree.xpath("//section[@id='postingbody']")
                    description = location[0].text_content().replace(
                        "\n", " ").replace("QR Code Link to This Post",
                                           "").strip()
                except:
                    pass

                #finally we get to insert the entry into the database
                curs.execute(
                    '''INSERT INTO vehicles(id, url, region, region_url, price, year, manufacturer, model, condition,
                cylinders, fuel,odometer, title_status, transmission, VIN, drive, size, type, 
                paint_color, image_url, description, lat, long, state)
                VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''',
                    (idpk, url, city[2], city[1], price, year, manufacturer,
                     model, condition, cylinders, fuel, odometer, title_status,
                     transmission, VIN, drive, size, vehicle_type, paint_color,
                     image_url, description, lat, long, city[3]))

                scraped += 1
            #these lines will execute every time we grab a new page (after 120 entries)
            print("{} vehicles scraped".format(scraped))

        #now to clean the database we grab all urls from the city that are already logged
        curs.execute("SELECT id FROM vehicles WHERE region_url = '{}'".format(
            city[1]))
        deleted = 0

        #if a given id is not in scrapedIds (the ids that we just scraped) then the entry no longer exists and we remove it
        for oldId in curs.fetchall():
            if int(oldId[0]) not in scrapedIds:
                curs.execute("DELETE FROM vehicles WHERE id = '{}'".format(
                    oldId[0]))
                deleted += 1
        print(
            "Deleted {} old records, {} records skipped as they are already stored"
            .format(deleted, skipped))
        conn.commit()

        #update progress file
        with open(fileName, "w") as tracker:
            tracker.write(str(cities))

    #delete tracker file
    os.remove(fileName)
    count = curs.execute("SELECT Count(*) FROM vehicles")
    print("Table vehicles successfully updated, {} entries exist".format(\
        curs.fetchall()[0][0]))
    conn.close()
Example #28
def update_socsec(url, baseline, text_args):
    """
    Function that will read the table with OASI Social Security Projections
    Parameters
    ----------
    url: URL linking to the SSA website with OASI Trust Fund projections
    baseline: CBO baseline we're updating
    text_args: Dictionary containing the arguments that will be passed to
        the documentation template
    Returns
    -------
    baseline: Updated baseline numbers
    text_args: Updated dictionary with text arguments to fill in the template
    """
    print("Updating Social Security Projections")
    session = HTMLSession()
    r = session.get(url)
    # we can determine the latest year by looking at all of the years available
    # in the first drop down and adding one.
    selector = r.html.find("select#yh1")[0]
    latest_yr = max([int(yr) for yr in selector.text.split()]) + 1
    report = f"{latest_yr} Report"
    if report == text_args["socsec_cur_report"]:
        print("\tNo new data since last update")
        return baseline, text_args

    socsec_url = f"https://www.ssa.gov/oact/TR/{latest_yr}/VI_C_SRfyproj.html"
    match_txt = "Operations of the OASI Trust Fund, Fiscal Years"
    html = pd.read_html(socsec_url, match=match_txt)[0]
    # merge the columns with years and data into one
    sub_data = pd.concat(
        [
            html["Fiscal year", "Fiscal year.1"], html["Cost",
                                                       "Sched-uled benefits"]
        ],
        axis=1,
    )
    sub_data.columns = ["year", "cost"]
    # further slim down data so that we have the intermediate costs only
    start = sub_data.index[sub_data["year"] == "Intermediate:"][0]
    end = sub_data.index[sub_data["year"] == "Low-cost:"][0]
    cost_data = sub_data.iloc[start + 1:end].dropna()
    cost_data["cost"] = cost_data["cost"].astype(float)
    # rate we'll use to extrapolate costs to final year we'll need
    pct_change = cost_data["cost"].pct_change() + 1
    cost_data.set_index("year", inplace=True)
    cost_data = cost_data.transpose()
    cost_data.index = ["SOCSEC"]
    # create values for years not included in the report
    factor = pct_change.iloc[-1]
    last_year = int(max(cost_data.columns))
    cbo_last_year = int(max(baseline.columns))
    for year in range(last_year + 1, cbo_last_year + 1):
        cost_data[str(year)] = cost_data[str(year - 1)] * factor
    cost_data = cost_data.round(1)
    # finally update CBO projections
    baseline.update(cost_data)

    text_args["socsec_prev_report"] = text_args["socsec_cur_report"]
    text_args["socsec_prev_url"] = text_args["socsec_cur_url"]
    text_args["socsec_cur_report"] = report
    text_args["socsec_cur_url"] = socsec_url

    return baseline, text_args
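A minimal usage sketch for update_socsec, assuming a CBO baseline DataFrame indexed by series name with string year columns; the SSA index URL and the pre-existing text_args values here are placeholders, not taken from the original project:

import pandas as pd

# Hypothetical inputs -- the URL and report metadata below are assumptions for illustration.
baseline = pd.DataFrame({str(yr): [0.0] for yr in range(2021, 2032)}, index=["SOCSEC"])
text_args = {
    "socsec_cur_report": "2020 Report",
    "socsec_cur_url": "https://www.ssa.gov/oact/TR/2020/VI_C_SRfyproj.html",
}
baseline, text_args = update_socsec("https://www.ssa.gov/oact/TR/", baseline, text_args)
print(text_args["socsec_cur_report"], text_args["socsec_cur_url"])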
Example #29
0
File: 13.py Project: fisher335/ls
#-*- coding: gbk -*-
# @Date    : '2018/3/27 0027'
# @Author  : Terry feng  ([email protected])
from requests_html import HTMLSession

client = HTMLSession()
r = client.get("https://www.qiushibaike.com/text/")
a = r.html.find(".content")
for i in a:
    print(i.text)
    print("--------------------------")
Example #30
0
import json
from re import search
from time import sleep

from requests_html import HTMLSession


class LostFilmParser:
    source_url = 'https://www.lostfilm.tv/'
    tv_shows_list_part_url = 'https://www.lostfilm.tv/ajaxik.php'
    part_step = 10

    def __init__(self):
        self.session = HTMLSession()
        self.news_data = self.session.get(self.source_url)

    def get_links(self):
        return self.news_data.html.links

    def get_title_en(self, href):
        try:
            result = search(r'/series/([^/]+)/', href)
            title_en = result.group(1)
            tv_show_link = self.source_url.rstrip('/') + result.group()
        except AttributeError:
            title_en = None
            tv_show_link = None
        return title_en, tv_show_link

    def get_new_shows_episodes(self):
        clear_data = []
        news_block = self.news_data.html.find('.new-movies-block', first=True)
        movies = news_block.find('a.new-movie')
        for movie in movies:
            title_en, show_link = self.get_title_en(movie.attrs['href'])
            clear_data.append(
                {
                    'title_ru': movie.attrs['title'],
                    'title_en': title_en,
                    'jpg': 'http:' + movie.find('img', first=True).attrs['src'],
                    'season': movie.find('.title', first=True).text,
                    'date': movie.find('.date', first=True).text,
                    'episode_link': self.source_url.rstrip('/') + movie.attrs['href'],
                    'tv_show_link': show_link,
                }
            )
        return clear_data

    def load_part_list(self, step):
        url = self.tv_shows_list_part_url
        request_data = self.session.post(
            url=url,
            data={'act': 'serial', 'o': step, 's': 3, 't': 0, 'type': 'search'}
            )
        return json.loads(request_data.content)['data']

    def get_tv_shows_list(self):
        """10->20->30-> пока не вернет пустой список"""
        step = 0
        shows_list = []
        request_result = self.load_part_list(step)
        while request_result:
            for result in request_result:
                shows_list.append(result)
            step += self.part_step
            sleep(1)
            request_result = self.load_part_list(step)
        return shows_list
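A short usage sketch for the parser above, exercising only the methods defined in the class:

# Sketch: print the latest episode teasers, then fetch the full show list.
parser = LostFilmParser()
for episode in parser.get_new_shows_episodes():
    print(episode['title_ru'], episode['date'], episode['episode_link'])

shows = parser.get_tv_shows_list()  # pages through the ajax endpoint until it returns nothing
print(len(shows), 'shows collected')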
Example #31
0
import re
import logging

from requests_html import HTMLSession

logger = logging.getLogger(__name__)
session = HTMLSession()

# input & read
logger.debug("# Read the CIK and acc_no from console")
cik = input("Please input the CIK:")
acc_no_test = input("Please input the document accession number:")
print('The CIK and Acc_no you entered are:', cik, acc_no_test)

# CIK = '51143'
# acc_no = '000005114313000007/0000051143-13-000007'
logger.debug("# Get the HTML page")
CIK = cik
acc_no = acc_no_test
html_tail = '-index.html'
url_company = "http://www.sec.gov/Archives/edgar/data/" + CIK + "/" + acc_no + html_tail

r1 = session.get(url_company)

url_10q = ""

# match 10q page
logger.debug("# Get the 10Q page")
for url in r1.html.absolute_links:
    if re.match(r'[a-zA-Z]+://[^\s]*.10q.htm', url) is not None:
        url_10q = url
        break

# open 10q page
r2 = session.get(url_10q)

# find html element through css selector
# r2.html.find('table')
Example #32
0
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://prettyprinted.com')

# print (r.html.links)
print(r.html.absolute_links)
print('\n')
print(r.html.find('.headline', first=True))
headline = r.html.find('.headline', first=True)
print('\n')
print(headline.text)
print('\n')

r = session.get('https://prettyprinted.com/p/the-flask-extensions-course')

print(r)

print(r.html.find('.course-section', first=True))
print('\n')
flask_wtf_section = r.html.find('.course-section', first=True)

print(flask_wtf_section)
print('\n')
print(flask_wtf_section.find('.item'))
items = flask_wtf_section.find('.item')
print('\n')

for item in items:

    print(item.text)
Example #33
0
from requests_html import HTML,HTMLSession

session = HTMLSession()

r = session.get('https://coreyms.com')

article = r.html.find('article', first=True) # find the first article element
headline = article.find('.entry-title-link', first=True).text
print(headline)
summary = article.find('.entry-content p', first=True).text # find the first <p> inside the entry-content element
print(summary)

vid_src = article.find('iframe', first=True).attrs['src']
# print(vid_src.attrs['src'])
vid_id = vid_src.split('/')[4]
vid_id = vid_id.split('?')[0]
vid_id = f'https://youtube.com/watch?v={vid_id}'
print(vid_id)

# To process every article element
articles = r.html.find('article') # find all article elements
for article in articles:
    headline = article.find('.entry-title-link', first=True).text
    print(headline)
    summary = article.find('.entry-content p', first=True).text # find the first <p> inside the entry-content element
    print(summary)
    
    try:
        vid_src = article.find('iframe', first=True).attrs['src']
        # print(vid_src.attrs['src'])
        vid_id = vid_src.split('/')[4]
Example #34
0
async def on_message(message):
    print(
        f"{message.channel}: {message.author}: {message.author.name}: {message.content}"
    )
    sentdex_guild = client.get_guild(405403391410438165)
    author_roles = message.author.roles
    #print(author_roles)
    #author_role_ids = [r.id for r in author_roles]

    if random.choice(range(500)) == 30:
        matches = [r for r in author_roles if r.id in vanity_role_ids]
        #print(matches)

        if len(matches) == 0:
            try:
                role_id_choice = random.choice(vanity_role_ids)
                actual_role_choice = sentdex_guild.get_role(role_id_choice)
                #print(type(message.author))
                author_roles.append(actual_role_choice)
                await message.author.edit(roles=author_roles)
            except Exception as e:
                print('EDITING ROLES ISSUE:', str(e))

    with open(f"{path}/msgs.csv", "a") as f:
        if message.author.id not in chatbots:
            f.write(
                f"{int(time.time())},{message.author.id},{message.channel}\n")

    with open(f"{path}/log.csv", "a") as f:
        if message.author.id not in chatbots:
            try:
                f.write(
                    f"{int(time.time())},{message.author.id},{message.channel},{message.content}\n"
                )
            except Exception as e:
                f.write(f"{str(e)}\n")

    if "sentdebot.member_count()" == message.content.lower():
        await message.channel.send(f"```py\n{sentdex_guild.member_count}```")

    elif "sentdebot.community_report()" == message.content.lower(
    ) and message.channel.id in image_chan_ids:
        online, idle, offline = community_report(sentdex_guild)

        file = discord.File(f"{path}/online.png",
                            filename=f"{path}/online.png")
        await message.channel.send("", file=file)

        await message.channel.send(
            f'```py\n{{\n\t"Online": {online},\n\t"Idle/busy/dnd": {idle},\n\t"Offline": {offline}\n}}```'
        )

    elif "sentdebot.p6()" == message.content.lower():
        await message.channel.send(
            f"```\nThe Neural Networks from Scratch videos will resume one day. https://nnfs.io```"
        )

    elif "sentdebot.user_activity()" == message.content.lower(
    ) and message.channel.id in image_chan_ids:  # and len([r for r in author_roles if r.id in admins_mods_ids]) > 0:

        file = discord.File(f"{path}/activity.png",
                            filename=f"{path}/activity.png")
        await message.channel.send("", file=file)

        #await message.channel.send(f'```py\n{{\n\t"Online": {online},\n\t"Idle/busy/dnd": {idle},\n\t"Offline": {offline}\n}}```')

    elif "help(sentdebot)" == message.content.lower(
    ) or "sentdebot.commands()" == message.content.lower():
        await message.channel.send(commands_available)

    # if it doesn't work later.
    #elif "sentdebot.logout()" == message.content.lower() and message.author.id == 324953561416859658:
    elif "sentdebot.logout()" == message.content.lower() and str(
            message.author).lower() == "sentdex#7777":
        await client.close()
    elif "sentdebot.gtfo()" == message.content.lower() and str(
            message.author).lower() == "sentdex#7777":
        await client.close()

    elif "sentdebot.get_history()" == message.content.lower() and str(
            message.author).lower() == "sentdex#7777":

        channel = sentdex_guild.get_channel(channel_ids[0])

        async for message in channel.history(limit=999999999999999):
            if message.author.id == 324953561416859658:
                with open(f"{path}/history_out.csv", "a") as f:
                    f.write(f"{message.created_at},1\n")

    else:
        query = search_term(message.content)
        if query:
            #query = match.group(1)
            print(query)

            qsearch = query.replace(" ", "%20")
            full_link = f"https://pythonprogramming.net/search/?q={qsearch}"
            session = HTMLSession()
            r = session.get(full_link)

            specific_tutorials = [(tut.text, list(tut.links)[0])
                                  for tut in r.html.find("a")
                                  if "collection-item" in tut.html]

            if len(specific_tutorials) > 0:
                return_str = "\n---------------------------------------\n".join(
                    f'{tut[0]}: <https://pythonprogramming.net{tut[1]}>'
                    for tut in specific_tutorials[:3])
                return_str = f"```Searching for '{query}'```\n" + return_str + f"\n----\n...More results: <{full_link}>"

                await message.channel.send(return_str)
            else:
                await message.channel.send(f"""```py
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NotFoundError: {query} not found```""")
Example #35
0
from requests_html import HTMLSession
session = HTMLSession()

r = session.get('https://reddit.com') 

for html in r.html:
    print(html)

pass
Example #36
0
from bs4 import BeautifulSoup
import requests
from requests_html import HTMLSession
session = HTMLSession()
resp = session.get(
    "https://www.amazon.com/Sceptre-E248W-19203R-Monitor-Speakers-Metallic/dp/B0773ZY26F/ref=sr_1_2?crid=1861TM8A5NDPX&dchild=1&keywords=monitors&qid=1597071906&sprefix=monitors%2Caps%2C364&sr=8-2"
)
resp.html.render(sleep=1, keep_page=True, timeout=20)
soup = BeautifulSoup(resp.html.html, "lxml")
title = soup.find(id="productTitle").get_text().strip()
print(title)
Example #37
0
import os
from multiprocessing import Process

from requests_html import HTMLSession


class XHSpider(Process):
    def __init__(self,url):
        # override the parent class's __init__ method
        super(XHSpider, self).__init__()
        self.url = url

        self.session = HTMLSession()
        self.headers = {
            'Host':'news.daxues.cn',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        self.path = "D:/Photo/"
        self.check_file_path(self.path)

    def check_file_path(self, path):
        '''
        check whether the directory exists; create it if not
        '''
        if not os.path.exists(path):
            os.makedirs(path)

    def run(self):
        self.parse_page()

    def send_request(self, url):
        '''
        helper that sends the HTTP request
        '''
        # retry up to 3 times if the request fails
        i = 0
        while i < 3:
            try:
                print('requesting url : ', url)
                # the page is utf-8 encoded
                return self.session.get(url, headers = self.headers).html
            except Exception as e:
                print('send_request error : ', str(e))
                i += 1

    def parse_page(self):
        '''
        parse the page source and extract data with requests-html
        '''
        html = self.send_request(self.url)
        imgs = html.find('dl a.p img')
        for img in imgs:
            href = img.attrs['src']
            alt = img.attrs['alt']
            self.save_image('http://news.daxues.cn'+href, alt)


    def save_image(self, url, name):
        '''
        save image
        '''
        content = self.session.get(url, headers=self.headers).content
        with open(self.path+name+'.jpg', 'wb') as f:
            f.write(content)
    def parse(self, url):
        self.url = url
        self.parse_page()
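A hedged usage sketch for the spider class above; the listing URL is a placeholder for whatever news.daxues.cn gallery page the original targeted:

# Sketch only -- the URL below is hypothetical; start()/join() drive the Process subclass.
if __name__ == '__main__':
    spider = XHSpider('http://news.daxues.cn/xiaohua/')  # placeholder listing page
    spider.start()   # runs run() -> parse_page() in a child process
    spider.join()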
Example #38
0
File: jobs.py Project: thp/urlwatch
 def retrieve(self, job_state):
     from requests_html import HTMLSession
     session = HTMLSession()
     response = session.get(self.navigate)
     return response.html.html
Example #39
0
 def start_request(self, url):
     headers = {'user-agent': random.choice(self.USER_AGENT_LIST)}
     session = HTMLSession()
     response = session.get(url, headers=headers)
     return response
Example #40
0
def fetch_pmo_charts(dst_dir, agency, base_url, subpages, datatype_prefix):
    """
    Fetch graphs and html tables from pmo (Povodi Moravy) water board
    fetch_pmo_charts(dst_dir='/home/jiri/meteodata',
                     agency='pmo',
                     base_url='http://www.pmo.cz/portal/srazky/en/',
                     subpages=['prehled_tab_1_chp.htm', 'prehled_tab_2_chp.htm', 'prehled_tab_3_chp.htm'],
                     datatype_prefix='precip')

    :param dst_dir: destination directory where to save the data (subdirs are created automatically)
    :param base_url: the base url [for example http://www.pvl.cz/portal/SaP/pc/? for streamflow,
                                               http://www.pvl.cz/portal/srazky/pc/? for precipitation]
    :param subpages: the list of sub-pages (for example ['oid=1', 'oid=2', 'oid=3'])
    :param datatype_prefix: the data type. use 'streamflow' or 'precip'
    :param agency: the short name of the operating agency. use pla, poh, pod, pvl or pmo
    :return: number of charts and html pages downloaded
    """

    agency = "pmo"

    session = HTMLSession()
    n_charts = 0

    for subpage in subpages:
        url = base_url + subpage
        print('-----------------------------')
        print(url)
        print('-----------------------------')
        r = session.get(url)

        a_hrefs = [a for a in r.html.find('a') if "DoMereni" in a.attrs["href"]]
        for a in a_hrefs:
            id = a.attrs["href"].split("'")[1]
            url_html = '{:s}/en/mereni_{:s}.htm'.format(base_url, id)
            print(url_html)

            
            if datatype_prefix == 'precip':
                url_img = '{:s}/grafy/sr{:s}_en.gif'.format(base_url, id)
            else:
                url_img = '{:s}/grafy/{:s}.gif'.format(base_url, id)
            print(url_img)
            img_response = get(url_img)
            if img_response.status_code == 200:
                img_dir = os.path.join(dst_dir, datatype_prefix, agency, os.path.splitext(os.path.basename(url_img))[0])
                if not os.path.exists(img_dir):
                    os.makedirs(img_dir)
                utc_timestamp_text = datetime.utcnow().strftime('_%Y-%m-%dT%H0000z.gif')
                img_filename = os.path.basename(url_img).replace('.gif', utc_timestamp_text)

                img_path = os.path.join(img_dir, img_filename)
                print(img_path)
                with open(img_path, 'wb') as f:
                    f.write(img_response.content)
                    n_charts += 1

                # also save the HTML
                html_path = img_path.replace('.gif', '.htm')
                html_response = get(url_html)
                if html_response.status_code == 200:
                    print(html_path)
                    with open(html_path, 'wb') as f:
                        f.write(html_response.content)
    return n_charts
Example #41
0
        author = line2.text
        #print('author: ', author)

    file_data = OrderedDict()
    file_data['author'] = author
    file_data['post_create_datetime'] = date + " 00:00:00" # 2015-01-01 12:10:00
    file_data['title'] = news_title
    file_data['content'] = content
    file_data['url'] = url
    file_data['publisher'] = publisher
    return file_data

if __name__ == '__main__':
    session = HTMLSession()
    ahnlab_url = 'https://www.ahnlab.com/kr/site/securityinfo/secunews/secuNewsList.do?curPage=1&menu_dist=1&seq=&key=&dir_group_dist=&dir_code=&searchDate='
    r = session.get(ahnlab_url)
    r.html.render()   

    for line in r.html.find('input.secuNewsSeq'):
        value = line.attrs['value']
        news_url = 'https://www.ahnlab.com/kr/site/securityinfo/secunews/secuNewsView.do?curPage=1&menu_dist=1&seq='+value+'&key=&dir_group_dist=&dir_code=&searchDate='
        #print('news_url:', news_url)


        # check the DB for a duplicate URL
        sql = "select EXISTS (select * from raw_table WHERE url=%s) as success"
        val = (news_url,)
        is_exists = select_mydb(sql, val)[0][0] # returns 1 if it exists, 0 otherwise

        if is_exists: # skip if this URL is already stored
            continue
Example #42
0
def get(spell_id):
    session = HTMLSession()
    r = session.get("https://cn.wowhead.com/spell={}".format(spell_id))
    en_url = r.html.find('link[hreflang="en"]', first=True).attrs['href']
    zh_url = r.url
    return Spell(spell_id, parse(en_url), parse(zh_url))
Example #43
0
import time
from requests_html import HTMLSession

starttime = time.time()

# Loop to fetch the data
while True:
    print(time.strftime('%H:%M:%S', time.localtime()))

    # Set up the session for the web scraping
    url = 'https://br.investing.com/equities/magaz-luiza-on-nm-historical-data'

    session = HTMLSession()
    r = session.get(url).html
    data = r.find('#results_box', first=True).text.split()

    # Create the lists that will store the data
    dados_diario = []
    dados_tam = len(data)
    linha = []

    # Grab the data only from the first row of the table
    i = 7
    while i < 14:
        Data_dh = data[i]
        linha.append(Data_dh)
        i = i + 1

        Abertura = data[i]
        linha.append(Abertura)
        i = i + 1
    def scrap(self, country_from, country_to, link, file, time):
        """

        :type country_from: object
        """
        session = HTMLSession()
        r = session.get(link)

        tp = r.html.find('.tp')
        skad = r.html.find('.from')
        cena = r.html.find('.legPrice')
        # list_of_variables = [".date", ".to", ".from", ".legPrice", ".time," ,".tp", "p"]
        data = r.html.find('.date')
        data = data[3:]
        to = r.html.find('.to')
        change = r.html.find('.durcha')
        przewoznik = r.html.find('.airline')

        scrap_date = time.strftime("%Y-%m-%d")
        scrap_time = time.strftime("%H:%M")

        # with open(file, "a") as f:

        # myfile.write("appended text")
        # self.write_data(myfile)
        licznik = 0
        ii = 0
        licz_ceny = 0
        i = 0
        id_podrozy = self.flight_id
        ThereBack = ['There', "Back"]

        # print(len(change))

        print('starting to save for.. ' + country_from + " - " + country_to)

        while i + ii < len(change):
            przes = 0

            print(change[i + ii].full_text[9:11])

            print(i + ii)
            print(len(change))

            try:
                while change[i + ii].full_text[9:11] != 'no':
                    # print('if 1 for.. ' + country)

                    przes = 1

                    file.write(scrap_date + ";" + scrap_time + ";" + country_from + ";" + country_to + ";" + str(
                        id_podrozy) + ";" + \
                               str(ThereBack[(i + ii) % 2]) + ";" + str(data[i + ii].text[4:]) + ";" + str(
                        przewoznik[licz_ceny].text) + ";" + str(
                        przes) + ";" + str(cena[licz_ceny].text[1:]) + ";" + str(
                        skad[licznik + 1].text.replace('\xa0', ' ')[3:8]) + \
                               ";" + str(skad[licznik + 1].text.replace('\xa0', ' ')[9:]) + ";" + str(
                        to[licznik + 1].text[:5]) + ";" + str(to[licznik + 1].text[6:]) + "\n")
                    licz_ceny += 1
                    licznik += 1
                    file.write(scrap_date + ";" + scrap_time + ";" + country_from + ";" + country_to + ";" + str(
                        id_podrozy) + ";" + \
                               str(ThereBack[(i + ii) % 2]) + ";" + str(data[i + ii].text[4:]) + ";" + str(
                        przewoznik[licz_ceny].text) + ";" + str(
                        przes) + ";" + str(cena[licz_ceny].text[1:]) + ";" + str(
                        skad[licznik + 1].text.replace('\xa0', ' ')[3:8]) + \
                               ";" + str(skad[licznik + 1].text.replace('\xa0', ' ')[9:]) + ";" + str(
                        to[licznik - 1].text[:5]) + ";" + str(to[licznik - 1].text[6:]) + "\n")
                    licz_ceny += 1
                    licznik += 2
                    ii += 1

                    if (i + ii) % 2 == 0:
                        id_podrozy += 1

            except IndexError:
                print("Something went wrong")

                # if ii%3==0:
                # id_podrozy += 1

            # ?? added these ifs so it doesn't crash, but now some rows don't get written at all

            if len(to) > licznik:
                # if len(to[licznik].text)<7 and len(skad[licznik + 1].text.replace('\xa0', ' '))<10:

                file.write(
                    scrap_date + ";" + scrap_time + ";" + country_from + ";" + country_to + ";" + str(
                        id_podrozy) + ";" + \
                    str(ThereBack[(i + ii) % 2]) + ";" + str(data[i + ii].text[4:]) + ";" + str(
                        przewoznik[licz_ceny].text) + ";" + str(przes) + ";" + str(
                        cena[licz_ceny].text[1:]) + ";" + str(skad[licznik + 1].text.replace('\xa0', ' ')[3:8]) + \
                    ";" + str(skad[licznik + 1].text.replace('\xa0', ' ')[9:]) + ";" + str(
                        to[licznik].text[:5]) + ";" + str(to[licznik].text[6:]) + "\n")

                # id_podrozy += 1

            licz_ceny += 1
            licznik += 2
            i += 1

            if (i + ii) % 2 == 0:
                id_podrozy += 1

            print('-----')

            # print("----2------")
            # print(licz_ceny)
            # print(licznik)
            # print(i)

        self.flight_id = id_podrozy
        file.flush()
        print(country_from + " - " + country_to + " finished!")
Example #45
0
from requests_html import HTMLSession

root = 'https://www.wiki-wiki.top/baike-%E8%A5%BF%E6%B8%B8%E8%AE%B0%E8%A7%92%E8%89%B2%E5%88%97%E8%A1%A8'

all_entities = []

print('visiting...')
url = root
session = HTMLSession()
response = session.get(url)
output = response.html.find('div.mw-parser-output', first=True)
print(output.text)
a_list = response.html.find('dt')

# cur_ents = []
# for a in a_list:
#     if a.attrs.get('class', '') == ('category-page__member-link', ):
#         cur_ents.append(a.attrs['title'])
# for t in cur_ents:
#     if 'Template' in t: continue
#     if 'Category' in t:
#         if t not in have_seen_categories:
#             current_category.append(t)
#             have_seen_categories.add(t)
#     else:
#         all_entities.append(t)

all_entities = sorted(list(set(all_entities)))
with open('entities_wiki1.txt', 'w', encoding='utf-8') as f:
    for t in all_entities:
        f.write(t.strip() + '\n')
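As written, nothing is ever appended to all_entities, so entities_wiki1.txt ends up empty; a minimal sketch of one plausible fix, assuming each dt element found above holds one character name, is:

# Assumption: every <dt> returned by response.html.find('dt') is an entity name worth keeping.
all_entities = sorted({dt.text.strip() for dt in a_list if dt.text.strip()})
with open('entities_wiki1.txt', 'w', encoding='utf-8') as f:
    for t in all_entities:
        f.write(t + '\n')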
Example #46
0
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 13 13:00:05 2020

@author: srira
"""

from requests_html import HTMLSession

session = HTMLSession()
r = session.get("https://en.wikipedia.org/wiki/Association_football")
r.status_code

#print(r.html)

urls = r.html.links
#print(urls)

absoluteurls = r.html.absolute_links
#print(absoluteurls)

type(absoluteurls)

links = r.html.find('a')
#print( links)

onlywikipedialinks = r.html.find('a', containing="wikipedia")
#print(onlywikipedialinks)

textinwikipedialink = [
    wikipedialink.text for wikipedialink in onlywikipedialinks
Example #47
0
import pandas as pd
from requests_html import HTMLSession

session = HTMLSession()
major_need_crawl = [
    'B1', 'B2', 'B3', 'B5', 'C1', 'C2', 'C3', 'C4', 'F8', 'E1', 'E3', 'E4',
    'E5', 'E6', 'E8', 'E9', 'F0', 'F1', 'F2', 'F4', 'F5', 'F6', 'F9', 'H1',
    'H2', 'H3', 'H4', 'H5', 'I2', 'I3', 'I5', 'I6', 'I7', 'I8', 'D2', 'D4',
    'D5', 'D8', 'E2', 'F7', 'E7', 'F2', 'F3', 'C5', 'C6'
]
all_major = {}
major = []
course = []

for k in major_need_crawl:
    r = session.get(
        'http://course-query.acad.ncku.edu.tw/qry/qry001.php?dept_no={}'.
        format(k))
    r.encoding = 'utf-8'
    res = r.html.find('thead tr th')

    for i in range(1, 5):
        resp = r.html.find('.course_y{} td'.format(i))
        for j in range(len(resp)):
            if (j + 1) % len(res) == 0 and j != 0:

                course.append(str(resp[j].text))
                major.append(course.copy())
                course.clear()
            else:
                course.append(str(resp[j].text))
Example #48
0
import os
import sys

from bs4 import BeautifulSoup as soup
from requests_html import HTMLSession

# get the target url from the command line arguments
url = str(sys.argv[1])

# for any given fanfiction.net link only the first 5 parts matter for locating a story
# https://www.fanfiction.net/s/8897431/1/Child-of-the-Storm				--> Original
# https: | | www.fanfiction.net | s | 8897431 | 1 | Child-of-the-Storm	--> Post split (| marks the places where the split occurred)
# https://www.fanfiction.net/s/8897431 									--> rebuilt URL
split = url.split('/')
main = split[0] + "/" + split[1] + '/' + split[2] + '/' + split[
    3] + '/' + split[4]
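# Hedged aside, not part of the original approach: the same base URL can be rebuilt with
# urllib.parse instead of indexing into split('/') by hand; the result is identical.
from urllib.parse import urlsplit
_parts = urlsplit(url)
main = _parts.scheme + '://' + _parts.netloc + '/' + '/'.join(_parts.path.split('/')[1:3])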

# Create the session object
session = HTMLSession()

# send a GET HTTP request
pageHTML = session.get(main)

# render() loads the html into a headless puppeteer browser and executes any javascript on the page (this is important because parts of the page are dynamically loaded using JQuery)
pageHTML.html.render()

# build the BeautifulSoup object based on the now rendered HTML
pageSoup = soup(pageHTML.html.html, "html.parser")

# create the directory where the story will live based on the name of the story
dirName = pageSoup.find('b', {'class': 'xcontrast_txt'}).text
os.mkdir(dirName)

# create a link to a local copy of the stylesheet that will be retrieved later
styleSheets = '<link rel="stylesheet" href="./xss26.css">'

# find the length of the story based on the existence of a known select tag. If the tag does not exist the story has 1 chapter only
def UploadImageAsset(client, url, image_ref_on_file, image_name, width, height, mode):
  """Uploads the image from the specified url.
  Args:
    client: An AdWordsClient instance.
    url: The image URL.
  Returns:
    The ID of the uploaded image.
  """
  # Initialize appropriate service.
  asset_service = client.GetService('AssetService', version='v201809')

  # Download the image.
  headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
  session__ = HTMLSession()
  """ image_request = session__.get(url, headers=headers, verify=True)
  #print("URL: "+ url)
  print(image_request.content)
  print(image_request.html) """
  print(url)
  tab = url.split('&')
  #print(type(url))
  image_request = session__.get(tab[0], headers=headers, verify=True)
  #print(tab[0])
  #image_asset = BytesIO(urlopen(tab[0]).read())
  image_asset = image_request.content
  #print(image_asset)

  # Create the image asset.
  try:
    source = tinify.tinify.tinify.from_url(url)
    #print(source)
    resized_image = source.resize(method=mode, width=int(width), height=int(height))
    data = resized_image.to_file(image_ref_on_file)
    #print(sys.getsizeof(data))
    #print(data)
  except:
    try:
      source = tinify.tinify.tinify.from_url(url)
      print(source)
      resized_image = source.resize(method=mode, width=int(width), height=int(height))
      data = resized_image.to_file(image_ref_on_file)
      print(sys.getsizeof(data))
      #print(data)
    except Exception as e:
      print(e)
  print(image_name)
  file_url = url_for('uploaded_file', filename=image_name, _external=True)
  image_asset = {
      'xsi_type': 'ImageAsset',
      'imageData': urlopen(file_url).read(),
      # This field is optional, and if provided should be unique.
      # 'assetName': 'Image asset ' + str(uuid.uuid4()),
  }

  # Create the operation.
  operation = {
      'operator': 'ADD',
      'operand': image_asset
  }

  # Create the asset and return the ID.
  result = asset_service.mutate([operation])

  
  return result['value'][0]['assetId']
#!/usr/bin/env python
# coding=utf-8
# author: zengyuetian
# get district information from Lianjia


from requests_html import HTMLSession

if __name__ == '__main__':
    session = HTMLSession()
    # fetch the Shanghai Lianjia residential community listing page
    r = session.get('https://sh.lianjia.com/xiaoqu/')
    # get the list of Shanghai districts
    elements = r.html.xpath('/html/body/div[3]/div[1]/dl[2]/dd/div/div/a')
    # lists of district names in English (pinyin) and Chinese
    en_names = list()
    ch_names = list()

    # each element's html looks like <a href="/xiaoqu/pudong/" title="上海浦东小区二手房 ">浦东</a>
    for element in elements:
        for link in element.absolute_links:  # iterate over the link set
            en_names.append(link.split('/')[-2])
            ch_names.append(element.text)

    # print the English and Chinese district name lists
    for index, name in enumerate(en_names):
        print(name, ch_names[index])

    """
        pudong 浦东
        minhang 闵行
Example #51
0
from requests_html import HTMLSession

session = HTMLSession()

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}

r = session.get(
    'https://tembiapo.mopc.gov.py/obras/42-pavimentacion-de-tramos-alimentadores-de-la-red-vial-nacional-paquete-n-1-lote-2',
    verify=False,
    headers=headers)

table = r.html.find('#info', first=True)

columns = []
for e in table.find('label'):
    columns.append(e.text)

cells = []
# the original looped over an undefined name `urls` and appended to `columns`;
# the selector below is an assumption -- adjust it to wherever the values actually live
for e in table.find('span'):
    cells.append(e.text)

print(columns)
print(cells)