Code Example #1
File: scrap.py Project: gituajames/covid19_vis
import requests
import pandas as pd
from bs4 import BeautifulSoup as b


def table():
    r = requests.get('https://www.worldometers.info/coronavirus/#countries')
    print(r.status_code)

    soup = b(r.text, 'lxml')

    # find our table
    table = soup.find('table', id='main_table_countries_today')
    tbody = table.find('tbody')
    #     print(tbody)

    table_row = tbody.find_all('tr')
    all_rows = []
    for tr in table_row:
        td = tr.find_all('td')
        row = [i.text.replace('\n', ' ').strip() for i in td]
        all_rows.append(row)

    df = pd.DataFrame(all_rows,
                      columns=[
                          'country', 'total_cases', 'new_cases',
                          'total_deaths', 'new_deaths', 'total_recovered',
                          'active', 'serirous', '1', '2', '3', '4', '5'
                      ])
    #     print(df.head())

    df.drop(index=[0, 1, 2, 3, 4, 5, 6, 7], inplace=True)
    df.drop(columns=['1', '2', '3', '4'], inplace=True)

    copy_df = df.copy()
    #     print(copy_df.head())

    copy_df['total_recovered'] = copy_df['total_recovered'].str.replace(
        'N/A', '0')
    copy_df['new_cases'] = copy_df['new_cases'].str.replace('+', '')
    copy_df['new_deaths'] = copy_df['new_deaths'].str.replace('+', '')
    #     print(copy_df.head())

    copy_df['total_cases'] = copy_df['total_cases'].str.replace(',', '')
    copy_df['new_cases'] = copy_df['new_cases'].str.replace(',', '')
    copy_df['total_deaths'] = copy_df['total_deaths'].str.replace(',', '')
    copy_df['total_recovered'] = copy_df['total_recovered'].str.replace(
        ',', '')
    copy_df['active'] = copy_df['active'].str.replace(',', '')
    copy_df['serirous'] = copy_df['serirous'].str.replace(',', '')
    #     print(copy_df.head())

    copy_df['total_cases'] = pd.to_numeric(copy_df['total_cases'])
    copy_df['new_cases'] = pd.to_numeric(copy_df['new_cases'])
    copy_df['total_deaths'] = pd.to_numeric(copy_df['total_deaths'])
    copy_df['new_deaths'] = pd.to_numeric(copy_df['new_deaths'])
    copy_df['total_recovered'] = pd.to_numeric(copy_df['total_recovered'])
    copy_df['active'] = pd.to_numeric(copy_df['active'])
    copy_df['serirous'] = pd.to_numeric(copy_df['serirous'])
    #     print(copy_df.head())

    copy_df.fillna(0, inplace=True)
    #     print(copy_df.head())

    return copy_df
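A minimal usage sketch for the scraper above; the call below and the column selection are illustrative assumptions, not part of the original project:

if __name__ == '__main__':
    df = table()
    # show the ten countries with the most confirmed cases
    print(df.nlargest(10, 'total_cases')[['country', 'total_cases', 'total_deaths']])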
Code Example #2
 def get_num_flw(self, flag, user):
     if flag == "followers":
         # Followers
         flw = WebDriverWait(self.driver, 10).until(
             EC.presence_of_element_located(
                 (By.XPATH,
                  f"//a[contains(@href, \'/{user}/followers/\')]")))
         # flw = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#react-root > section > main')))
         sflw = b(flw.get_attribute('innerHTML'), 'html.parser')
         followers = sflw.findAll('span', {'class': 'g47SY'})
         f = followers[0].getText().replace(',', '')
     elif flag == "following":
         # Following
         flw = WebDriverWait(self.driver, 10).until(
             EC.presence_of_element_located(
                 (By.XPATH,
                  f"//a[contains(@href, \'/{user}/following/\')]")))
         sflw = b(flw.get_attribute('innerHTML'), 'html.parser')
         followers = sflw.findAll('span', {'class': 'g47SY'})
         f = followers[0].getText().replace(',', '')
     if 'k' in f:
         f = float(f[:-1]) * 10**3
         return f
     elif 'm' in f:
         f = float(f[:-1]) * 10**6
         return f
     else:
         return float(f)
Code Example #3
import requests as r  # alias assumed from the r.get(...) calls below
from bs4 import BeautifulSoup as b


def MCQScraper(link):
    page_title = link.split("/")[3]
    heading = ("<!DOCTYPE html>\n<html>\n<head>\n<title>" + page_title +
               "</title>\n</head>\n"
               "<link rel='stylesheet' type='text/css' href='./style.css'>\n<body>")

    # the output file lands in ../cse/, which must already exist
    file = open("../cse/" + page_title + ".html", "w")
    file.write(heading)

    p = r.get(link).text
    soup = b(p, features="html.parser")

    links = []
    table_data = soup.find_all("td")

    for data in table_data:
        # not every cell holds an anchor, so guard before reading the href
        if data.a and data.a.get("href"):
            links.append(data.a["href"])

    for i in links:
        p = r.get(i).text
        soup = b(p, features="html.parser")
        questions = soup.find("div", {"class": "entry-content", "itemprop": "text"})
        file.write(str(questions))

    ending = "</body>\n<script src='./script.js'></script>\n</html>"
    file.write(ending)
    file.close()
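A hypothetical invocation of the function above; the URL is invented, and the ../cse output directory must already exist given how the file path is built:

if __name__ == '__main__':
    MCQScraper("https://example.com/cse/operating-system-mcqs/")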
Code Example #4
def queued_data_loop(session):
    """
    Main data loop once logged in, gets list of ids and initializes a queue to iterate through and gathers data.
    :param session: User Login info
    """
    while not id_queue.empty():
        try:  # a broad exception so that anything going wrong during the request process is caught

            print('Ticket Number: ', id_queue.qsize())
            quoteID = id_queue.get()
            print('getting id: ', str(quoteID))
            r = session.get(ENGINEERING_DETAILS + str(quoteID), verify=False)
            soup = b(r.text, 'html.parser')
            table = soup.find_all('a')
            dateTable = soup.find_all('caption')
            dateTableList = []
            soupList = []
            for link in table:
                soupList.append(link)
            quotes = findQuotes(soupList)

            orderMatch = r'(^[a-zA-Z])(\w+)'

            #Determine the quarter of ticket
            for item in dateTable:
                dateTableList.append(item)
            date = findDate(dateTableList)

            if not quotes:
                print("No quotes...Next ticket!")
                id_queue.task_done()
            else:
                for quote in quotes:  # get rack from each quote
                    #TODO test this area
                    if re.search(orderMatch, str(quote)):
                        r = session.get(
                            ENGINEERING_ORDERS +
                            str(re.search(orderMatch, str(quote)).group(2)),
                            verify=False)
                        # print(ENGINEERING_ORDERS + str(re.search(orderMatch, str(quote)).group(2)))
                    else:
                        r = session.get(ENGINEERING_QUOTES + str(quote),
                                        verify=False)
                    quotehtml = b(r.text, 'html.parser')
                    quotelinks = quotehtml.find_all('tr')
                    quoteDetails = []
                    for link in quotelinks:
                        quoteDetails.append(link)
                    findRack(quoteDetails, date, session,
                             quoteID)  # list of racks from each quote
                    # print("Racks in dict: ", rackDict)
                    # print(racks)
                id_queue.task_done()

        #TODO create more informative error handling
        except Exception as e:
            import logging
            logging.exception('Something Happened...')
            id_queue.task_done()
    print('Ended at: ', id_queue.qsize())
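The loop above leans on module-level names (id_queue, ENGINEERING_DETAILS, ENGINEERING_QUOTES, ENGINEERING_ORDERS, findQuotes, findDate, findRack) defined elsewhere in the project. A rough sketch, under those assumptions, of how the queue and session could be wired up and the loop run on several threads:

import queue
import threading

import requests

id_queue = queue.Queue()
for ticket_id in (1001, 1002, 1003):  # placeholder ticket ids
    id_queue.put(ticket_id)

session = requests.Session()
# ...log the session in to the engineering site here...

workers = [threading.Thread(target=queued_data_loop, args=(session,)) for _ in range(4)]
for w in workers:
    w.start()
id_queue.join()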
Code Example #5
def shopSearch(request, page):
    b_url = "https://www.n11.com/arama?q="
    url_sep = "&pg="
    url = b_url + request + url_sep + page
    html = urllib2.urlopen(url).read()
    soup = b(html, "html.parser")
    for post in soup.findAll("li", {"class": "column"}):
        try:
            item = post.findAll("a", {"class": "plink"})[0]
            title = item['title']
            price = post.findAll("ins")[0].text.replace(" ",
                                                        "").replace("\n", "")
            link = item['href']
            sellerName = post.findAll("span",
                                      {"class": "sallerName"})[0].text.replace(
                                          " ", "").replace("\n", "")
            sellerPoint = post.findAll("span",
                                       {"class": "point"})[0].text.replace(
                                           " ", "").replace("\n", "")
            print(title)
            print(price + "\t Seller Name: " + sellerName + "\t Rating: " +
                  sellerPoint)
            print(link + "\n")
        except:
            pass
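Both arguments are concatenated straight into the URL, so they are passed as strings. A hypothetical call (the search term is invented):

if __name__ == '__main__':
    shopSearch("laptop", "1")  # search n11.com for "laptop", results page 1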
Code Example #6
def cheapest(request, page):
    x = []
    b_url = "https://www.cimri.com/arama?"
    url_sep = "page="
    seps2 = "&q="
    url = b_url + url_sep + page + seps2 + request
    html = urllib2.urlopen(url).read()
    soup = b(html, "html.parser")
    for post in soup.findAll("div", {"id": "cimri-product"}):
        try:

            item = post.findAll("h2", {"class": "product-title"})[0].text
            link = post.findAll("a")[0]['href']
            for markets in post.findAll("div", {"class": "tag"}):
                market = markets.text
            for prices in post.findAll("a", {"class": "s14oa9nh-0 gwkxYt"}):
                x.append(
                    prices.text.replace("com", "com : ").replace(".tr", ""))
            print(item)
            print(x[0])
            print(x[1])
            print("https://www.cimri.com/" + link + "\n")
            x = []
        except:
            pass
Code Example #7
def news(grabyear, grabcountry):

    import requests
    from bs4 import BeautifulSoup as b
    import pandas as pd
    import webbrowser
    from flask import jsonify  # needed for the return below unless already imported at module level

    Country = grabcountry
    Before = grabyear
    url = f"https://www.google.co.in/search?q=+{Country}+co2+emissions+scholarly+articles+before:+{Before}"
    print(url)
    response = requests.get(url)

    soup = b(response.text, "lxml")
    articles = []
    r = soup.find_all('div', attrs={'class': 'BNeawe vvjwJb AP7Wnd'})
    for i in range(len(r)):
        articles.append(r[i].text)

    urls = soup.find_all('div', attrs={'class': 'kCrYT'})
    Links = []
    for link in urls:
        href = link.find('a')
        try:
            raw_website = href.get('href')
            clean_web = raw_website[7:]
            Links.append(clean_web)
        except:
            continue
    newsdata = [{"articles": articles, "links": Links}]
    return jsonify(newsdata)
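Because jsonify needs a Flask application context, this function is presumably called from a route. A hedged sketch of how it might be exposed; the route path and parameter names are assumptions:

from flask import Flask

app = Flask(__name__)


@app.route('/news/<country>/<year>')
def news_route(country, year):
    # news() already returns a jsonify()-ed response
    return news(year, country)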
Code Example #8
def extract_relationships(cache, url,person_url):
    """
    Extract all the relationships of the mentioned celebrity.
    """
    relationships = []
    new_url = "https://www.whosdatedwho.com/dating/"+url
    filename = get_url_content(new_url,cache)
    soup = b(open(filename, 'r'), 'html.parser')

    ##grab the h4 class
    status_h4 = soup.find('h4', 'ff-auto-status') #always the tag type
    
    # grab the next sibling
    key_div = status_h4.next_sibling

    candidate = key_div.find_all('a')
    
    #we need all that start with dating
    relationships.extend(find_candidate(candidate, person_url))

    ##get all prior relationships
    prev_h4 = soup.find('h4', 'ff-auto-relationships')

    div_past_relationships = prev_h4.next_sibling
    while div_past_relationships is not None and div_past_relationships.name=='p':
        candidate = div_past_relationships.find_all('a')
        relationships.extend(find_candidate(candidate, person_url))
        div_past_relationships= div_past_relationships.next_sibling

    return relationships
Code Example #9
def getDeckCards(backUrl):
    frontUrl = "https://www.mtggoldfish.com"
    url = frontUrl + backUrl
    html = urllib.request.urlopen(url).read()
    soup = b(html, 'html.parser')

    cards = soup.find_all('td', {'class': 'deck-col-card'})
    #print(cards)
    cardList = []
    for card in cards:
        cardList.append(card.text[1:-1])
        #print('the card:',card.text)
    #print(cardList)

    cards = soup.find_all('td', {'class': 'deck-col-qty'})
    #print(cards)
    cardQty = []
    for card in cards:
        cardQty.append(card.text[1:-1])
        #print('the card:',card.text)
    #print(cardQty)
    cardDic = {}
    for x in range(len(cardList)):
        cardDic[cardList[x]] = int(cardQty[x])
    #print(cardDic)
    for key, val in cardDic.items():
        if key not in addCards:
            addCards[key] = val
        else:
            addCards[key] = addCards[key] + val
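getDeckCards accumulates card counts into a module-level addCards dictionary instead of returning them. A minimal sketch of the surrounding setup; the imports and the deck path are assumptions:

import urllib.request
from bs4 import BeautifulSoup as b

addCards = {}

if __name__ == '__main__':
    getDeckCards("/deck/1234567")  # path portion of an mtggoldfish deck page (placeholder)
    for name, qty in addCards.items():
        print(qty, name)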
Code Example #10
def get_content(url):
    '''Parse the page content.'''

    # list for storing the post information
    comments = []

    html = get_url_html(url)
    soup = b(html, 'lxml')

    li_tags = soup.find_all('li', class_=" j_thread_list clearfix")
    for li in li_tags:
        # initialize a dict to store this post's information
        comment = {}
        # use try so a single bad post does not stop the crawler
        try:
            # extract the post details from the page
            comment['title'] = li.find('a', attrs={
                'class': 'j_th_tit '
            }).text.strip()
            comment['link'] = URL + li.find('a', class_="j_th_tit ")['href']
            comment['author'] = li.find('span',
                                        attrs={
                                            'class': 'frs-author-name-wrap'
                                        }).text.strip()
            comment['reply'] = li.find('div',
                                       attrs={
                                           'class':
                                           'col2_left j_threadlist_li_left'
                                       }).text.strip()
            comments.append(comment)
        except:
            print("get_content,error")
    return comments
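This helper depends on a get_url_html function and a module-level URL constant that the example does not show. One plausible minimal version, assuming a Baidu Tieba listing page:

import requests
from bs4 import BeautifulSoup as b

URL = 'https://tieba.baidu.com'  # assumed base used to complete the relative links


def get_url_html(url):
    # fetch the page and return its HTML text
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    response.encoding = response.apparent_encoding
    return response.text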
Code Example #11
def getConceptDerivedTerms(word):
    searchTerm = word
    link = 'http://conceptnet.io/c/en/'
    link = link + searchTerm
    http_pool = url.connection_from_url(link)
    r = http_pool.urlopen('GET', link)
    http_pool.close()
    html = r.data.decode('utf-8')
    soup = b(html, features="html5lib")

    divs = soup.findAll("a")
    div = []
    candies = []

    for d in divs:
        if d.contents[0] == 'Derived terms':
            div = d.find_parent().find_parent()

    if len(div) > 0:
        links = div.findAll("a")
        for k in links:
            candies.append(n.word_tokenize(k.contents[0]))

        del (candies[0])

        c = []

        for k in candies:
            if len(k) > 1:
                counter = 0
                s = ''
                for j in k:
                    if len(j) > 2:
                        counter += 1
                        s = s + ' ' + j
                if counter == len(k):
                    c.append(s)

            elif len(k[0]) > 2:
                c.append(k[0])

        candies = c
        c = []

        for k in candies:
            if not k == searchTerm:
                c.append(k)
        candies = c

        for k in range(len(candies)):
            temp = n.word_tokenize(candies[k])
            if len(temp) > 1:
                s = ''
                for j in temp:
                    s = s + j + ' '
                candies[k] = s
            else:
                candies[k] = temp[0]

    return candies
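The url and n aliases are never defined in this snippet; judging from the calls used (connection_from_url and word_tokenize), they most plausibly refer to urllib3 1.x and NLTK. A hedged sketch of the assumed setup and a call:

import urllib3 as url  # url.connection_from_url(...) is the urllib3 1.x helper
import nltk as n       # n.word_tokenize(...) needs the 'punkt' tokenizer data downloaded

print(getConceptDerivedTerms('coffee'))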
Code Example #12
def grabProxiesHttp():
    site = 'https://free-proxy-list.net/'
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib.Request(site, headers=hdr)  #sending requests with headers
    url = urllib.urlopen(req).read()  #opening and reading the source code
    html = b(url, "lxml")  #structuring the source code in proper format
    rows = html.findAll("tr")  #finding all rows in the table if any.
    proxies = []
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text for ele in cols]
        try:
            ipaddr = cols[
                0]  #ipAddress which presents in the first element of cols list
            portNum = cols[
                1]  #portNum which presents in the second element of cols list
            proxy = ipaddr + ":" + portNum  #concatenating both ip and port
            portName = cols[6]  #portName variable result will be yes / No
            if portName == "no":
                proxies.append(str(proxy))
                #proxies.append(str(proxy)+":http") #if yes then it appends the proxy with https
            #else:
            #proxies.append(str(proxy)+":https") #if no then it appends the proxy with http
        except:
            pass

    #for j in proxies:
    #   print(j)
    return proxies
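The bare urllib.Request/urllib.urlopen calls only resolve if the module was imported under that name; on Python 3 the snippet appears to assume something like the first import below. A short usage sketch:

import urllib.request as urllib  # provides urllib.Request and urllib.urlopen as used above
from bs4 import BeautifulSoup as b

proxies = grabProxiesHttp()
print(len(proxies), 'proxies collected')
print(proxies[:5])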
Code Example #13
File: porn-filter.py Project: SRMSE/keyword-crawler
def main(line):
	# Fragment of a larger script: url, domain, file, i, put, soup and the
	# imported json/requests modules are defined elsewhere and not shown here.
	line = line.split(",")[1].strip()
	category_json = soup(url % line)  # originally bound to the name json, which shadowed the json module
	# print domain
	domain_soup = b(requests.get(domain).content, "lxml").find("body").text

	put(line + " at line " + str([i + 1]), "INFO")
	json_data = json.loads(domain_soup.split("process(")[1].split(")")[0])
	try:
		json_data = json_data[json_data.keys()[0]]['categories'].keys()

		if ('401' in json_data) or ('402' in json_data):
			put("ADULT!!!!!!!", "WARNING")
			print "---------------------------------------------------------------------"

		else:
			put("SAFE FOR WORK", "SUCCESS")
			file.write(line + "\n")
			print "------------------------------------------------------------------------"

	except Exception as e:
		put("DOES NOT CONTAIN INFO", "FAIL")
		print "------------------------------------------------------------------------------------"
Code Example #14
    def __init__(self,
                 day_url,
                 season_id,
                 current_day,
                 scraping_parameters_path='scraping/scraping_parameters.yml'):
        self.season_id = season_id
        self.current_day = current_day
        self.scraping_parameters_path = scraping_parameters_path
        self.scraping_parameters = ScrapingParameters(
            self.scraping_parameters_path)
        self.day_results_list = []

        # Access the current day data, using current day url
        day_http_request = requests.get(day_url)
        day_page = day_http_request.content
        day_soup = b(day_page, 'html.parser')
        day_container = day_soup.select('div.day-results-table')

        for html_match_info in day_container[0].select('tr.info-line.after'):
            current_match = Match(html_match_info, self.season_id,
                                  self.current_day)
            self.day_results_list.append(current_match.match_results)

        # for match_info in day_container[0].select('tr.info-line.after.table-hr'):
        #     current_match = Match(match_info, season_id, current_day)
        #     self.day_results.append(current_match.match_results)

        self.day_results = pd.DataFrame(
            self.day_results_list,
            columns=self.scraping_parameters.saison_cols)
Code Example #15
    def __init__(self,
                 url,
                 scraping_parameters_path='scraping/scraping_parameters.yml'):
        self.scraping_parameters_path = scraping_parameters_path
        self.scraping_parameters = ScrapingParameters(
            self.scraping_parameters_path)
        self.season_results = pd.DataFrame(
            columns=self.scraping_parameters.saison_cols)
        self.days = {}
        self.season_id = None

        # Access the current season data, using current season url
        season_http_request = requests.get(url)
        season_page = season_http_request.content
        self.season_soup = b(season_page, 'html.parser')

        # Get the urls to the different Days of the current season (stored in a dictionary) \
        # and the identification of the current season.
        self.season_id, self.days = self.get_days_url()

        for day in self.days:

            current_day = Day(
                self.scraping_parameters.fixe + self.days[day],
                self.season_id,
                day,
                self.scraping_parameters_path,
            )

            self.season_results = self.season_results.append(
                current_day.day_results)
Code Example #16
File: TechScrap.py Project: programmerraja/TechScrap
 def houseofbots(s):
     s.text.delete("1.0", tkinter.END)
     s.enter_webno.delete(0, tkinter.END)
     try:
         s.p = requests.get("https://www.houseofbots.com").text
         s.soup = b(s.p, features="html.parser")
     except:
         messagebox.showinfo(
             "ERROR", "Please make sure you have an internet connection")
         return  # nothing to parse if the page could not be fetched
     length = 1
     s.heading1 = []
     s.link2 = []
     s.text.insert(tkinter.INSERT,
                   "\t\tThe contents  in house of bots are \n \n".upper())
     for i in s.soup.find_all("li"):
         if i.find("h4") != None:
             s.heading1.append(i.find("h4").text)
             s.text.insert(tkinter.INSERT,
                           str(length) + "." + i.find("h4").text + "\n\n")
             s.link2.append(i.find("a").get("href"))
             length += 1
     s.sumbit_button = tkinter.Button(s.root,
                                      text="Open",
                                      command=lambda: s.dispaly(s.link2),
                                      fg="red",
                                      relief="sunken")
     s.sumbit_button.place(x=300, y=620, height=20, width=70)
Code Example #17
def get_content_from_images(html):
    content = ''
    soup = b(html)
    text_class = soup.find_all('div', {'class': 'rc'})
    for text in text_class:
        content = content + text.text
    return content
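A small usage sketch; the inline HTML is a placeholder just to show the call shape, and the bs4 alias is the one the function relies on:

from bs4 import BeautifulSoup as b  # alias used inside get_content_from_images

sample_html = '<div class="rc">first snippet</div><div class="rc">second snippet</div>'
print(get_content_from_images(sample_html))  # -> first snippetsecond snippet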
Code Example #18
def get_content(u,response):

    if 'https://twitter.com' in u:
        # url other than image
        # getting content
        #print("-----------------------twitter status---------------------------")
        html = response.text
        soup = b(html)
        if ('i/moments' in u):
            text = ''
            moments_text = soup.find('div', {'class': 'MomentCapsuleCover-details'})
            if moments_text:
                text = moments_text.text
        else:
            tweet_status_text = soup.find_all('p', {
                'class': 'TweetTextSize TweetTextSize--jumbo js-tweet-text tweet-text'})
            tweet_text = ''
            for txt in tweet_status_text:
                if txt:
                    tweet_text = tweet_text + txt.text
            text = tweet_text
    else:
        #print('------------------------other content--------------------------')

        html = response.text
        text = get_content_from_urls(html)
    return text
Code Example #19
File: getpages.py Project: chandan61308/InstagramBot
 def get_followers(self):
     time.sleep(2)
     flw_btn = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR,
                                                                                    "#react-root > section > main > div > header > section > ul > li:nth-child(2) > a")))
     flw_btn.click()
     time.sleep(3)
     self.popup = WebDriverWait(self.driver, 10).until(
         EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div/div/div[2]")))
     for h in range(11):
         time.sleep(1)
         print('scrolling')
         print(h)
         print('arguments[0].scrollTop = arguments[0].scrollHeight/{}'.format(str(11 - h)))
         self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight/{}'.format(str(11 - h)),
                                    self.popup)
         if h == 5:
             break
     for i in range(40):
         time.sleep(2)
         self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', self.popup)
     self.popup = WebDriverWait(self.driver, 10).until(
         EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div/div/div[2]")))
     b_popup = b(self.popup.get_attribute('innerHTML'), 'html.parser')
     for p in b_popup.findAll('li', {'class': 'wo9IH'}):
         try:
             hlink = p.find_all('a')[0]['href']
             print(hlink)
             if 'div' in hlink:
                 print('div found not adding to list')
             else:
                 self.hrefs.append(hlink)
         except:
             pass
     return self.hrefs
Code Example #20
File: a1.py Project: SHAILESHSHIVA/internship
    def cnt(self):
        with open(self.path) as f2:
            sp = b(f2, 'html.parser')
            text = sp.get_text().lower()

        print 'Enter "no" to exit'
        while True:
            w = raw_input('ENTER THE WORD WANT TO SEARCH IN THE FILE\n')
            if w != 'no':
                cnt = text.count(w)
                print '"%s occured %d times in the file"' % (w, cnt)
                self.lst.append(w)
                continue

            else:
                print 'want to see your searches'
                res = raw_input()
                if res != 'no':
                    c = Counter(self.lst)
                    print 'Top 5 words searched by you are '
                    for i, j in c.most_common(5):
                        print '"%s"::"%d"times' % (i, j)

                with open('data.csv', 'ab') as d:
                    self.header = ['words,date,time']
                    dt = csv.writer(d, delimiter=' ')
                    if self.empty == 0:
                        dt.writerow(self.header)
                    for i in self.lst:
                        dt.writerow([i + ',', self.d, self.t])
                break
Code Example #21
    def __init__(self, fixed_url, season_id, current_day, season_url, day_url):
        self.season_url = season_url
        self.day_url = day_url
        self.complete_day_url = fixed_url + day_url
        self.season_id = season_id
        self.current_day = current_day
        self.day_results = pd.DataFrame(columns=MATCHES_RESULTS_COLS)

        # Access the current day raw_data, using current day url
        day_http_request = requests.get(self.complete_day_url)
        day_page = day_http_request.content
        day_soup = b(day_page, 'html.parser')
        day_container = day_soup.select('div.day-results-table')

        for html_match_info in day_container[0].select('tr.info-line.after'):
            current_match = Match(html_match_info, self.season_id,
                                  self.current_day, self.season_url,
                                  self.day_url)

            self.day_results = self.day_results.append(
                current_match.match_results)

            del current_match

        self.day_results.drop_duplicates(
            inplace=True,
            subset=['season_id', 'season_day', 'team_dom'],
            keep='last')
Code Example #22
 def liked_post(self):
     # If this returns True then it is most likely an account with 0 posts
     try:
         p_text = WebDriverWait(self.driver, 10).until(
             EC.presence_of_element_located((
                 By.CSS_SELECTOR,
                 '#react-root > section > main > div > header > section > ul > li:nth-child(1) > span'
             )))
         if p_text.text == '0':  # compare the element text, not the WebElement itself
             return False
         else:
             return True
     except:
         return True
     post = self.driver.find_element_by_css_selector(
         '#react-root > section > main > div > div._2z6nI > article > div > div > div:nth-child(1) > div:nth-child(1)'
     )
     html = post.get_attribute('innerHTML')
     h = b(html, 'html.parser')
     href = h.a['href']
     self.driver.get('https://www.instagram.com' + href)
     like_btn = WebDriverWait(self.driver, 10).until(
         EC.presence_of_element_located((
             By.CSS_SELECTOR,
             '#react-root > section > main > div > div.ltEKP > article > div.eo2As > section.ltpMr.Slqrh > span.fr66n > button > div'
         )))
     like_btn.click()
Code Example #23
def waterdebt(request):
    c = {}
    l = []

    # Shown when no subscriber code has been submitted yet
    form_hint = "Zəhmət olmasa ilk öncə formu doldurub sonra borcu öyrən butonuna basın."

    abkodu = request.POST.get('abkodu')

    if abkodu:
        api = "https://opendata.e-gov.az/api/v1/json/azersu/DebtInfo/{}".format(
            abkodu)

        data = requests.get(api)

        d_d = json.loads(data.text)
        b_d = d_d['Response']['HtmlField']
        soup = b(b_d, 'html.parser')

        if len(soup.find_all('b')) == 0:
            c['error'] = """Abonent kodu ya yanlışdır ya da boş buraxılıb.
		Zəhmət olmasa kodunuzu yoxlayın və yenidən yazın"""

        else:
            for a in soup.find_all('b'):
                l.append(re.sub(r"[<b>,</b>]", "", str(a)))

            c['result'] = True
            c['code'] = "Abonent kodu: " + l[1]
            c['name'] = "Ad: " + l[3]
            c['debt'] = "Borc: " + l[5] + " AZN"
    else:
        # without a code there is nothing to query, so only show the hint
        c['error'] = form_hint

    return render(request, test, c)
Code Example #24
def get_content(url):
    comments = []
    html = get_html(url)
    soup = b(html, 'lxml')
    liTags = soup.find_all('li', attrs={'class': 'j_thread_list clearfix'})

    for li in liTags:
        comment = {}
        try:
            comment['title'] = li.find('a', attrs={
                'class': 'j_th_tit'
            }).text.strip()
            comment['link'] = li.find('a', attrs={'class': 'j_th_tit'})['href']
            comment['name'] = li.find('span',
                                      {'class': 'tb_icon_author'})['title']
            comment['time'] = li.find('span', {
                'class': "pull-right is_show_create_time"
            }).text.strip()
            comment['replyNum'] = li.find(
                'span', {
                    'class': "threadlist_rep_num center_text"
                }).text.strip()
            comments.append(comment)
            print('complete ' + comment["link"])
        except:
            print("Error")
    return comments
Code Example #25
def get_top_machts():
    html = ur('http://football.kulichki.net/')
    bs = b(html.read())
    div = bs.find('div', {"class": 'col2 inl vtop'}).center.table
    tr_list = div.find_all('tr')
    result = ''
    for item in tr_list[1:]:
        if item.find('span') is not None:

            flag = plus_flag(item.find('span').text)
            plus_flag(flag)
            result = result + flag + '\n'
        else:
            a = item.find('p', {"align": "left"}).text
            a = a.replace('\n', '')
            a = a.replace('  ', ' ')
            matchtime = a[1:a.index('.')]
            timeplus = (int(matchtime[:2]) + 2) % 24
            timeplus = str(timeplus)
            if len(timeplus) == 1:
                timeplus = '0' + timeplus

            matchname = a[a.index('.') + 2:a.rindex('-')]
            result = result + '*' + timeplus + matchtime[
                2:] + '* _' + matchname + '_\n'
    return result
Code Example #26
    def __init__(self, fixed_url, season_id, season_url, day_url, day):
        #         self.season_url = season_url
        self.day_url = day_url
        self.complete_day_url = fixed_url + day_url
        self.season_id = season_id
        self.season_url = season_url
        self.current_day = day
        self.day_ranking = pd.DataFrame(columns=RANKING_COLS)

        # Access the current day raw_data, using current day url
        day_http_request = requests.get(self.complete_day_url)
        day_page = day_http_request.content
        self.day_soup = b(day_page, features="lxml")
        #         there are 14 teams
        for team in range(14):
            current_team = Team(self.season_id,
                                self.current_day,
                                self.season_url,
                                self.day_url,
                                self.day_soup,
                                team=team)

            self.day_ranking = self.day_ranking.append(
                current_team.team_attributes_list)

            del current_team
        self.day_ranking.drop_duplicates(inplace=True,
                                         subset=['season', 'day', 'equipe'],
                                         keep='last')
Code Example #27
def news(grabyear, grabcountry):

    Country = grabcountry
    Before = grabyear
    url = f"https://www.worldbank.org/en/search?q=global+warming+{Country}+{grabyear}&currentTab=1"
    print(url)
    response = requests.get(url)

    soup = b(response.text, "lxml")
    titles = []
    links = []
    descriptions = []

    titles_html = soup.find_all(
        'h4', attrs={'class': 'list-group-item-heading result-header'})
    links_html = soup.find_all(
        'p', attrs={'class': 'list-group-item-text result-link'})
    descriptions_html = soup.find_all(
        'p', attrs={'class': 'list-group-item-text result-description'})
    # print(titles[0].text)
    # print(links[0].text)
    # print(descriptions[0].text)

    for i in range(len(titles_html)):
        titles.append(titles_html[i].text)
        links.append(links_html[i].text)
        descriptions.append(descriptions_html[i].text)
    newsdata = [{
        "articles": titles,
        "links": links,
        "descriptions": descriptions
    }]
    return jsonify(newsdata)
Code Example #28
def clg_details():
    if request.method == 'POST':
        text = request.form['mail']
        text = text.split("@")
        query = text[1].split(".")[0] + " college"
        print(query)
        url = []
        #query = "cit college"
        if (query == "sona college"):
            print("SCT|SONA COLLEGE OF TECHNOLOGY IN SALEM,TAMILNADU")
        else:
            for j in search(query, tld="co.in", num=2, stop=2, pause=2):
                url.append(j)
            for i in url:
                html = requests.get(i)
                soup = b(html.content, "html.parser")
                r = soup.find("title")
                print(r.text)
                for q in [
                        'Technology', 'University', 'Institutions', 'College',
                        'Engineering'
                ]:
                    if q in r.text:
                        return render_template("index.html", data=r.text)
                return render_template("index.html",
                                       data="No such college was found")
Code Example #29
File: cnt.py Project: SHAILESHSHIVA/internship
def cnt(path,p):
    with open (path) as f2:
        sp = b(f2,'html.parser')
        text = sp.get_text().lower()
    print 'Enter "no" to exit'
    while True:
        w = raw_input('ENTER THE WORD WANT TO SEARCH IN THE FILE\n')
        if w!='no':
            cnt = text.count(w)
            print '"%s occured %d times in the file"'%(w,cnt)
            lst.append(w)
            arr.append([w,d,t])
            continue
        else:
            print 'want to see your searches'
            res = raw_input()
            if res !='no':
                c =  Counter(lst)
                print 'Top 5 words searched by you are '
                for i,j in  c.most_common(5):
                    print '"%s"::"%d"times'%(i,j)
            hdr = ['words','date','time']
            sve(p,hdr,arr)

            break        
Code Example #30
File: TechScrap.py Project: programmerraja/TechScrap
 def hackernews(s):
     #cleaning the dispaly
     s.text.delete("1.0", tkinter.END)
     s.enter_webno.delete(0, tkinter.END)
     try:
         s.p = requests.get("https://thehackernews.com").text
         s.soup = b(s.p, features="html.parser")
     except:
         messagebox.showinfo(
             "ERROR", "Please make sure you have an internet connection")
         return  # nothing to parse if the page could not be fetched
     length = 1
     s.heading1 = []
     s.link4 = []
     s.text.insert(tkinter.INSERT,
                   "\t\tThe contents  in hacker news are\n \n".upper())
     for i in s.soup.find_all("h2"):
         s.text.insert(tkinter.INSERT, str(length) + "." + i.text + "\n\n")
         s.heading1.append(i.text)
         length += 1
     length = 0
     for i in s.soup.find_all("a", class_='story-link'):
         s.link4.append(i.get("href"))
         length += 1
     s.sumbit_button = tkinter.Button(s.root,
                                      text="Open",
                                      command=lambda: s.dispaly(s.link4),
                                      fg="red",
                                      relief="sunken")
     s.sumbit_button.place(x=300, y=620, height=20, width=70)
Code Example #31
def main(line):
	try:
		tags = soup(b(requests.get((url+line).strip()).content,"lxml"),line)
		dic ={}
		dic[line.strip()] = tags
		put(dic,"SUCCESS")
		keyword.insert(dic, check_keys=False)
		put(line.strip()+" added to MongoClient","ENDC")
	except Exception as e:
		put(e,"FAIL")
Code Example #32
import requests
import re
from bs4 import BeautifulSoup as b
file=open("shutterstock.txt",'a')
url="http://www.shutterstock.com/cat.mhtml?autocomplete_id=&language=en&lang=en&search_source=&safesearch=1&version=llv1&searchterm=&media_type=images"
soup=b(requests.get(url).content)
soup=soup.find("div",{"class":"secondary_links clearfix"})
ul=soup.findAll("ul")
for per_ul in ul:
	li=per_ul.findAll('li')
	for per_li in li:
		category=per_li.find('a').text
		category_link="http://www.shutterstock.com"+per_li.find('a').get('href')
		file.write(str(category)+'\n')
		#print category_link
		file.write(str(category_link)+'\n')
		category_soup=b(requests.get(category_link).content)
		category_soup=str(category_soup.find("div",{"class":"grid_pager"}).findAll("span")[1])
		page=1
		total=int(re.sub(r'[^\d+]',"",category_soup))
		while (page!=total):
			print category+(" page ")+str(page)+" of "+str(total)
			category_page=category_link+"?page=%d&thumb_size=mosaic"%page
			category_page=b(requests.get(category_page).content)
			image=category_page.findAll("span",{"class":"gc_clip"})
			#file.write(str(page)+'\n')
			if (type(image) is list):
				break
			for per_image in image:
				link= per_image.find('img').get('src')
				#print link
Code Example #33
import requests
from bs4 import BeautifulSoup as b
i=1
dic={}
while True:
	
	soup = b(requests.get("http://www.national-football-teams.com/club/"+str(i)+"/2015_1/Real_Madrid.html").content,"lxml")
	
	club = soup.findAll("div",{"class":"span6"})[2].find('h1')
	
	if club:
		if club.find('small'):
			club.find('small').replaceWith('')
		
#		print club.text.encode('ascii','ignore')
		dic.update({club.text.encode('ascii','ignore'):i})
		print dic
		i+=1
Code Example #34
import MySQLdb
db = MySQLdb.connect("localhost","root","root","TESTDB")
cursor = db.cursor()
import re
import requests
from bs4 import BeautifulSoup as b
url="http://www.shutterstock.com/cat.mhtml?autocomplete_id=&language=en&lang=en&search_source=&safesearch=1&version=llv1&searchterm=&media_type=images"
page_categories=[]
soup=b(requests.get(url).content)
soup=soup.find('div',{"class":"secondary_links clearfix"}).findAll("ul")
for per_ul in soup:
	li=per_ul.findAll("li")
	for per_li in li:
		page_categories.append(str(per_li.text))

file = open('shutterstock.txt', 'r')
i=0


x=file.readlines()
while(i<len(x)):
										#category
	if x[i].replace("\n","") in  page_categories:
		a= str(x[i].replace("\n","").replace("\"",""))
		i+=1
								#category link
	elif re.search(r"http.*.html",x[i]):
		b= str(x[i].replace("\n","").replace("\"",""))
		i+=1
							#link
	elif re.search(r"http.*.jpg",x[i]):
Code Example #35
File: nation.py Project: varundey/lets-football
from bs4 import BeautifulSoup as b
import requests
import json
file = open('nation_data.txt','a')
i = 1

dic = {}
DIC={}
l = ["country",'id']

while True:
	country = b(requests.get("http://www.national-football-teams.com/country/"+str(i)+"/2015/Italy.html").content,"lxml").findAll("div",{"class":"span6"})[2].find('h1')

	if country:
		if country.find('small'):
			country.find('small').replaceWith ('')

		dic.update({country.text.encode('utf-8'):i})

		print dic
		i+=1
	else:
		print 546666666666666666666666666666666666666665
		break

json.dump(dic,file)
Code Example #36
File: dances.py Project: varundey/general_crawlers
# file=open("dances_test.txt","a")	#todo
import requests
from bs4 import BeautifulSoup as b
import re

dic = {}
url = "https://en.wikipedia.org/wiki/List_of_dances"
uls = b(requests.get(url).content).find("div", {"id": "mw-content-text"}).findAll("ul")
for i in range(2, 47):
    # print uls[46]	#(2-47)

    lis = uls[i].findAll("li")
    for per_li in lis:

        try:
            display_order = ["Dance_name", "Dance_desc", "Dance_link"]
            dance_name = per_li.find("a").get("title").encode("utf-8")
            if "(page does not exist)" in str(dance_name):
                continue
            else:
                dic["Dance_name"] = " " + dance_name + " "

            # print dance_name
            # 		else:

            # 			file.write(dance_name.encode('utf-8'))
            # try:
            dance_link = "https://en.wikipedia.org" + per_li.find("a").get("href")
            dic["Dance_link"] = " " + dance_link + " "
            # 				file.write(dance_link.encode('utf-8'))
            # print dance_link
Code Example #37
File: archive.py Project: varundey/Image_Crawlers
import requests
from bs4 import BeautifulSoup as b
main_page=1
file=open("archive.txt","a")
while True:
	web="https://archive.org/details/image?&sort=-downloads&page=%d"%main_page
	soup=b(requests.get(web).content)
	soup=soup.findAll("div",{"class":"collection-title C C2"})
	main_page+=1
	for category in soup:
		category_link=category.find('a').get('href')
		category_name=(category.find('a').find('div').text).encode('utf-8')
		file.write(str(category_name)+'\n')
		file.write(str( "https://archive.org"+category_link)+'\n')
		print category_name
		category_page=1
		while True:
			cat="https://archive.org"+category_link+"?&sort=-downloads&page=%d"%category_page
			cat_soup=b(requests.get(cat).content)
			cat_soup=cat_soup.findAll('div',{"class":"ttl C C2"})
			category_page+=1
			cat_soup=cat_soup[1:]
			if len(cat_soup)==0:
				break
			for img in cat_soup:
				link=img.find('a').get('href')
				desc=(img.find('a').text).encode('utf-8')
				file.write(str(desc)+'\n')
				file.write(str( "https://archive.org"+link)+'\n')
			print category_page-1
		
Code Example #38
File: bigstock.py Project: varundey/Image_Crawlers
import requests
from bs4 import BeautifulSoup as b
import MySQLdb
db = MySQLdb.connect("localhost","root","root","TESTDB")
cursor = db.cursor()
url="http://www.bigstockphoto.com/"


#### getting inside website ####
soup=b(requests.get(url).content,"lxml")

###finding all category links
categories=soup.findAll("div",{"class":"row-fluid"})[4].findAll("a")

####picking one category at a time
for per_category in categories:
	 
	cat="http://www.bigstockphoto.com"+str(per_category.get('href'))
	
	category_name = per_category.text
	print category_name
	
	####just for the lulz
	cat = cat.split("category/")[0]+"?category="+cat.split("category/")[1][:-1]
	cat = cat.split("?")[0]+"?start=0&"+cat.split("?")[1]
	print cat								##category link
	
	####initialising page from 0
	page = 0

	#####iterating over page of category
Code Example #39
File: porn-filter.py Project: SRMSE/keyword-crawler
def soup(website):
	soup = b(requests.get(website).content,"lxml").find("pre")
	return soup
Code Example #40
File: diff.py Project: varundey/general_crawlers
import requests
import re
#from pymongo import MongoClient
#cl = MongoClient()
#coll = cl["local"]["test2"]
from bs4 import BeautifulSoup as b
url="http://www.differencebetween.net/"
display_order = ["Title1","Title2","TitleLink","Description"]
dic={}
###getting all categories####
cats=b(requests.get(url).content).find("div",{"id":"featured-cats"}).findAll("h5")

for per_cat in cats:

###getting links category wise
	cat_link = per_cat.find('a').get('href')
#	print cat_link
	
###getting page numbers if exists	
	try:
		pages=b(requests.get(cat_link).content).find("span",{"class":"pages"}).text
		pages = re.findall(r'.[\d]',pages)
		pages= int(pages.pop().strip())
		
	except Exception as e:
		print e
	
	for curr_page in range(1,pages+1):
		cat_soup = cat_link+"page/%d/"%curr_page
		print "*************************************************************************************************************"
		print cat_soup
Code Example #41
import json
file = '/home/varun/Desktop/lets-football/crawler/nation_data.txt'
dic=json.load(open(file))

print "Enter country"
print id
url = "http://www.national-football-teams.com/country/"+str(dic[raw_input()])+"/2015/Italy.html"
import requests
from bs4 import BeautifulSoup as b
#url = "http://www.national-football-teams.com/country/"+str(174)+"/2015/Italy.html"
soup  = b(requests.get(url).content,"lxml")

club= []

table= soup.find("table",{"class":"sortable"}).find('tbody').findAll('tr')
for per_tr in table:
	td = per_tr.findAll('td')
	for i in range(5):
		if i==3:
			continue
		print td[i].text.strip()	
		club.append(td[4].text.strip().encode('ascii', 'ignore'))
	print '---------------------------'
	
total_clubs = len(club)

dic={x:club.count(x) for x in club}
#print dic
print "Below are the team club stats in percent"
for i in dic:
	print i+" \t\t\t| %.3f"%((100.000*dic[i])/total_clubs)+"\t\t|"+str(dic[i])