Example no. 1
def get_comment(movieid):
    pagenum = 25
    comment_list = []
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/60.0.3112.101 Safari/537.36'
    }
    for i in range(pagenum):
        start = i * 20
        url = ("https://movie.douban.com/subject/" + movieid + "/comments" +
               "?" + "start=" + str(start) + "&limit=20" + "status=P")
        r = requests.get(url, headers)
        websoup = B(r.text, "html.parser")
        div_list = websoup.find("div", id="wrapper").find_all('div',
                                                              class_="comment")
        for link in div_list:
            comment_name = link.find("a", class_="").get_text()
            comment = link.find("span", class_="short").get_text().replace(
                ' ', '').replace('\n', '')
            vote = link.find("span", class_="votes").get_text()
            comment_list.append(comment)
            with open('D:\\python_study\\Spider_Douban\\comment.txt',
                      'a+',
                      encoding="utf-8") as f:
                f.write(comment_name + ": " + comment + " " + vote + "\r\n")
    return comment_list
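
A note on Example no. 1: requests can assemble the query string itself via its params argument, which avoids hand-concatenation mistakes such as a missing "&". A minimal sketch of that variant (the helper name is made up for illustration; same Douban endpoint):

import requests
from bs4 import BeautifulSoup as B

def fetch_comment_page(movieid, start, headers):
    # requests turns the params dict into "?start=...&limit=20&status=P"
    url = "https://movie.douban.com/subject/" + movieid + "/comments"
    r = requests.get(url, headers=headers,
                     params={"start": start, "limit": 20, "status": "P"})
    return B(r.text, "html.parser")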
Example no. 2
def english(art_album, sing):
    ext = ".html"

    url = "https://www.azlyrics.com/lyrics"

    links = "".join([url, "/", artistic, "/", sing_song, ext])

    page = requests.get(links)

    # Parsing the HTML request
    soup = B(page.content, "html.parser")

    # Finding Artist/Album name
    heading = soup.find("div", class_="lyricsh")
    # Extracting Artist/Album name
    heading_extr = heading.get_text()
    print(heading_extr)

    # Finding the lyrics name
    lyrics_name = soup.find_all("b")[1]
    # Extracting the lyrics name
    name_extr = lyrics_name.get_text()
    print(name_extr)

    # Finding the lyrics
    lyrics = soup.find_all("div")[20]
    # Extracting the lyrics
    lyrics_extr = lyrics.get_text()
    print(lyrics_extr)
Example no. 3
def clean(review):
    text = B(review)
    text = text.get_text()
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower().split()
    text = [w for w in text if w not in stop]
    return (" ".join(text))
Example no. 4
    def parse(self, html):

        # Extract the first-level (h2) headings
        first_level_tags = [str(_) for _ in html.find_all('h2')]
        all_tags = []
        # HTML between each major section (company background, shareholder info, outbound investment, etc.), extracted with a regex between two h2 tags
        for i in range(len(first_level_tags)):
            if i == len(first_level_tags) - 1:
                first_pattern_str = '({}.*)'.format(first_level_tags[i])
            else:
                first_pattern_str = '({}.*?){}'.format(first_level_tags[i], first_level_tags[i + 1])
            first_level_html = re.search(first_pattern_str, str(html)).group(1)
            # print(first_level_html)

            # Find the second-level (h3) heading tags
            second_level_tags = [str(_) for _ in B(first_level_html, 'html.parser').find_all('h3')]
            for j in range(len(second_level_tags)):
                # print(j)
                # print(len(second_level_tags))
                if j == len(second_level_tags) - 1:
                    second_pattern_str = '({}.*)'.format(second_level_tags[j])
                else:
                    second_pattern_str = '({}.*?){}'.format(second_level_tags[j], second_level_tags[j + 1])
                second_level_html = re.search(second_pattern_str, first_level_html).group(1)
                all_tags.append(second_level_html)
        # print(all_tags)
        dic = {
        }
        second_level_dic = {
            '工商信息': 'baseInfo',    # business registration info
            '分支机构': 'branch',      # branches
            '变更记录': 'changeInfo',  # change history
            '主要人员': 'staffCount',  # key staff
            '股东信息': 'holderInfo',  # shareholder info
        }
        for _ in all_tags:
            html = B(_, 'html.parser')
            title = html.find('h3').find('span').get_text(strip=True)
            if '工商信息' in title:
                data = self.base_parse(html)
                if data:
                    dic[second_level_dic[title]] = data
            elif title in ['分支机构','变更记录']:
                data = self.table_parse(html)
                if data:
                    dic[second_level_dic[title]] = data
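
The comments in Example no. 4 describe slicing the page into sections by regex-matching the raw HTML between consecutive h2/h3 tags. Purely as an illustration (this helper is not part of the original class, and it assumes the headings are siblings in the markup), the same first-level split can be sketched with tag traversal instead of regexes:

from bs4 import BeautifulSoup

def split_by_h2(html_text):
    # Map each <h2> heading to the HTML sitting between it and the next <h2>.
    soup = BeautifulSoup(html_text, 'html.parser')
    sections = {}
    for h2 in soup.find_all('h2'):
        parts = []
        for sib in h2.find_next_siblings():
            if sib.name == 'h2':  # stop at the next first-level heading
                break
            parts.append(str(sib))
        sections[h2.get_text(strip=True)] = ''.join(parts)
    return sections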
Example no. 5
def parsing_page(data):
  soup = B(data,'html.parser')
  img_list = []
  for i in soup.find_all('div',class_='single-post')[1:]:
      img_title = i.find('h2').text.strip()
      print(img_title)
      img = i.find('img')['style'].split()[1]
      img_list.append(img)
      saving_img(img_title,img)
  print('Total image link got ={0}'.format(len(img_list)))
Example no. 6
def cityinfo(bot, update):
    message = update.message.text
    city = message[5:]
    html = urlopen('http://nesiditsa.ru/city/' + city)
    soup = B(html, "html.parser")
    content = soup.find('div', 'city-info-block row')
    sections = content.find_all('td')
    #joke_list = [i.find('div', 'anekdot_text').text for i in content]
    bot.sendMessage(chat_id=update.message.chat_id,
                    text=city[0].upper() + city[1:] + ' city:')
    for i in sections:
        bot.sendMessage(chat_id=update.message.chat_id, text=i.text)
Example no. 7
def parsing_page(url):
  header = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'
  }
  page = requests.get(url,headers=header)
  print('Received status code {0}'.format(page.status_code))
  soup = B(page.content,'html.parser')
  for i in soup.find_all('tr')[1:]:
    print('working on it')
    try:
      data = parser(i)
      save_to_db(data)
    except Exception as e:
      print('Something went wrong: {0}'.format(e))
Example no. 8
def hindi(sing):
    ext = ".html"

    url = "https://gaana.com/lyrics/amp/"

    links = "".join([url, sing_song, ext])

    page = requests.get(links)

    # Parsing the HTML request
    soup = B(page.content, "html.parser")

    # Finding the lyrics name
    lyrics_name = soup.find_all("li", class_="current")[0]
    # Extracting the lyrics name
    name_extr = lyrics_name.get_text()
    print(name_extr)
    print()

    # Finding the lyrics
    lyrics = soup.find_all("pre")[0]
    # Extracting the lyrics
    lyrics_extr = lyrics.get_text()
    print(lyrics_extr)
Example no. 9
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup as B
try:
    html = urlopen('https://coreyms.com/')
except HTTPError as e:
    print(e)
except URLError as e:
    print('The server is not found!')

bsobj = B(html.read(), 'lxml')
for article in bsobj.find_all('article'):
    title = article.h2.a.text
    content = article.div.p.text
    link = article.find('iframe', class_='youtube-player')
    print(title)
    print(content)
    print(link)
    print()
# print(get_into.h2.a.text)
# for i in bsobj.
# print(len(img))
# print(img[0].get_text())

# print(list(bsobj.body))
Example no. 10
import requests as rq
from bs4 import BeautifulSoup as B

base_url='https://edward.kmu.ac.kr/nx/'
page_path='/page%d'
page=2

res=rq.get(base_url)
soup=B(res.content, 'lxml')

posts=soup.select('body div.mainframe_VFrameSet_HFrameSet_VFrameSet1_WorkFrame_Child_M503056_form_div_Work_Tab01_tabpage9_grd_scho101_body_gridrow_7_cell_7_4GridCellTextContainerElement')
# alternative grid cell: mainframe_VFrameSet_HFrameSet_VFrameSet1_WorkFrame_Child_M503056_form_div_Work_Tab01_tabpage9_grd_scho101_body_gridrow_4_cell_4_4GridCellTextContainerElement
for post in posts:
    title=post.find('h3').text.strip()
    descript=post.find('h4').text.strip()
    author=post.find('span').text.strip()
    print(title, descript, author +"\n")
    
while True:
    sub_path=page_path%(page)
    page+=1
    res=rq.get(base_url + sub_path)

    if(res.status_code !=200):
        break

    soup=B(res.content, 'lxml')
    
    posts=soup.select('body main.page-content div.wrapper div.home div.p')

    for post in posts:
Example no. 11
def get_random_joke():
    html = urlopen('http://anekdotme.ru/lenta/page_' + str(randint(1, 464)))
    soup = B(html, 'html.parser')
    content = soup.find_all('div', 'anekdot')
    joke_list = [i.find('div', 'anekdot_text').text for i in content]
    return str(joke_list[randint(0, len(joke_list) - 1)])
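
random.choice does the same job without the index arithmetic; a minimal sketch of the equivalent function:

from random import choice, randint
from urllib.request import urlopen
from bs4 import BeautifulSoup as B

def get_random_joke():
    html = urlopen('http://anekdotme.ru/lenta/page_' + str(randint(1, 464)))
    soup = B(html, 'html.parser')
    content = soup.find_all('div', 'anekdot')
    # choice() picks one element directly, no manual index bounds needed
    return choice([i.find('div', 'anekdot_text').text for i in content])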
Example no. 12
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 23 19:04:11 2018

@author: arvin
"""

from bs4 import BeautifulSoup as B
from urllib.request import Request, urlopen
req = Request(
    'https://www.quora.com/search?q=US+Mortgage',
    headers={
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'
    })
page = urlopen(req).read()
soup = B(page, 'html.parser')
#print(soup)
res = soup.find_all('span', class_="question_text")
for i in res:
    #print(i.string)
    print(i.get_text())  #To get only the text

#questions = soup.find(class_='Question')
Example no. 13
#-*- coding:UTF-8 -*-
import requests,re
from common import redis_conn
from bs4 import BeautifulSoup as B


url = "https://www.tianyancha.com/search?key=%E5%BA%B7%E4%B8%96%E4%BF%AD"
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Cookie': 'ssuid=7703879792; TYCID=8971320038af11e99421a35550a0a6c7; undefined=8971320038af11e99421a35550a0a6c7; _ga=GA1.2.210168479.1551066219; tyc-user-info=%257B%2522claimEditPoint%2522%253A%25220%2522%252C%2522myAnswerCount%2522%253A%25220%2522%252C%2522myQuestionCount%2522%253A%25220%2522%252C%2522explainPoint%2522%253A%25220%2522%252C%2522privateMessagePointWeb%2522%253A%25220%2522%252C%2522nickname%2522%253A%2522%25E9%2583%25AD%25E8%2594%25B7%25E8%2596%2587%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522privateMessagePoint%2522%253A%25220%2522%252C%2522state%2522%253A%25220%2522%252C%2522announcementPoint%2522%253A%25220%2522%252C%2522isClaim%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522discussCommendCount%2522%253A%25221%2522%252C%2522monitorUnreadCount%2522%253A%2522129%2522%252C%2522onum%2522%253A%252240%2522%252C%2522claimPoint%2522%253A%25220%2522%252C%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTg0NDUwMTE0MiIsImlhdCI6MTU1MTA2NjI0MiwiZXhwIjoxNTY2NjE4MjQyfQ.6BJfIf_rAdYIwkneCRXeic9ZtL7xY4mGErRIZo_vCWhqC6k8-POwOQn95M24lAnY6CrFZE2NIwmNtOglyR5_zA%2522%252C%2522pleaseAnswerCount%2522%253A%25221%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522bizCardUnread%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252215844501142%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTg0NDUwMTE0MiIsImlhdCI6MTU1MTA2NjI0MiwiZXhwIjoxNTY2NjE4MjQyfQ.6BJfIf_rAdYIwkneCRXeic9ZtL7xY4mGErRIZo_vCWhqC6k8-POwOQn95M24lAnY6CrFZE2NIwmNtOglyR5_zA; __insp_ss=1551075666199; aliyungf_tc=AQAAAIRlRSq+mAQAQBSWtu0EKZQZqkeP; csrfToken=HKgW16znHhTzPMQsyL4_cSYd; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1551066219,1551403276,1551659277; refresh_page=null; bannerFlag=true; _gid=GA1.2.838233971.1551920883; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1551947451; __insp_wid=677961980; __insp_slim=1551947452884; __insp_nv=true; __insp_targlpu=aHR0cHM6Ly93d3cudGlhbnlhbmNoYS5jb20vc2VhcmNoP2tleT0lRTUlQkElQjclRTQlQjglOTYlRTQlQkYlQUQ%3D; __insp_targlpt=5bq35LiW5L_tX_ebuOWFs_aQnOe0oue7k_aenC3lpKnnnLzmn6U%3D; __insp_norec_sess=true; __insp_slim=1551920897962',
    'Host': 'www.tianyancha.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
s = requests.session()
res = s.get(url,headers=headers)
html = B(res.text, 'html.parser')
spans = html.find_all('span',class_='tt hidden')
key_list = [
        'phoneList',
        'emailList',
        'id',
        'name',
        'regStatus',
        'base',
        'regCapital',
        'estiblishTime',
        'creditCode',
        'regLocation',
        'businessScope',
        'categoryStr',
        'city',
Example no. 14
    "Host":"www.smzdm.com",
    "Referer":"https://www.smzdm.com/tag/%E6%AF%8F%E5%A4%A9%E5%88%B7%E4%BB%80%E4%B9%88/youhui/",
    "Upgrade-Insecure-Requests":"1",
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}

todoList=[]
taskList=[]
infos=[]

lnkPrefix="https:"
lnk="https://www.smzdm.com/tag/%E6%AF%8F%E5%A4%A9%E5%88%B7%E4%BB%80%E4%B9%88/youhui/"
visited=[lnk]

rp=ses.get(lnk)
s=B(rp.text,"html.parser")

updateTodo(s)
getMsg(s)

while len(todoList)!=0 or len(taskList)!=0:
    if len(todoList)==0:
        pass
    else:
        visited.append(todoList.pop())
        lnk=lnkPrefix+visited[-1]
        #print("Getting %s"%lnk)
        tr.Thread(target=nowtask,args=(lnk,)).start()
        

with open("DumpResource2.txt","w") as f:
Example no. 15
 def request_jisho(self, key):
     html = requests.get(f'https://jisho.org/search/{key}').content
     b = B(html, 'lxml')
     return b
Example no. 16
    "https://www.smzdm.com/tag/%E6%AF%8F%E5%A4%A9%E5%88%B7%E4%BB%80%E4%B9%88/youhui/",
    "Upgrade-Insecure-Requests":
    "1",
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}

todoList = []
infos = []

lnkPrefix = "https:"
lnk = "https://www.smzdm.com/tag/%E6%AF%8F%E5%A4%A9%E5%88%B7%E4%BB%80%E4%B9%88/youhui/"
visited = [lnk]

rp = ses.get(lnk)
s = B(rp.text, "html.parser")

updateTodo(s)
getMsg(s)

while len(todoList) != 0:
    visited.append(todoList.pop())
    lnk = lnkPrefix + visited[-1]
    #print("Getting %s"%lnk)
    s = B(ses.get(lnk).text, "html.parser")
    updateTodo(s)
    getMsg(s)

with open("DumpResource.txt", "w") as f:
    f.write(json.dumps(infos))
print(time.time() - timeS)
Example no. 17
# http://docs.python-guide.org/en/latest/scenarios/scrape/
import time as t
import requests as r
from lxml import html
from bs4 import BeautifulSoup as B

# Let the magic(finally!) begin?
base_url = 'http://www.dotabuff.com/esports'
pages = [
    '/events/121/series', '/leagues/4716/series',
    '/leagues/4716/series?page=2', '/events/112/series', '/leagues/4700/series'
]

all_match_ids = set()
for page in pages:
    t.sleep(1)
    p = r.get(base_url + page, headers={'User-agent': 'Mozilla/5.0'})
    soup = B(p.content, 'html.parser')
    for link in soup.find_all('a'):
        l = (link.get('href') or '').split('/')
        try:
            if (l[1] == 'matches'):
                match_id = l[2]
                all_match_ids.add(match_id)
        except:
            pass

f = open('matchids.txt', 'w')
for match in all_match_ids:
    f.write(match + '\n')
f.close()
Example no. 18
def crawler(url, neighbour_names, hrff, name, dictf):
    header = {
        "dnt":
        "1",
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    re = requests.get(url, headers=header)
    base = B(re.content, 'html.parser')
    data_dictionary = {
        "Location": name,
        "zillow-value": "Nodata",
        "one-year-change": "Nodata",
        "one-year-forcast": "Nodata",
        "market-temperature": "Nodata",
        "price-sqft": "Nodata",
        "median-listing-price": "Nodata",
        "median-sale-price": "Nodata",
        "avg-days-on-market": "Nodata",
        "negative-equity": "Nodata",
        "delinquincy": "Nodata",
        "rent-list-price": "Nodata",
        "rent-sqft": "Nodata",
    }

    try:
        market_temp = base.find('div', {'class': 'market-temperature'})
        temperature = market_temp.find('div', {'class': 'zsg-h2'}).text
        data_dictionary['market-temperature'] = temperature
    except:
        pass

    try:
        outer = base.find('section', {'class': 'zm-forecast-chart'})
        content_box = outer.find('ul', {'class': 'zsg-g'})
        all_li = content_box.find('li', {'class': 'zsg-lg-1-2'})
        sp = all_li.find('span', {'class': 'zsg-fineprint'})
        sp.decompose()
        all_li = content_box.find('li', {'class': 'zsg-lg-1-2'}).text
        temp = all_li.replace(" ", "")
        temp = temp.replace("\n", "")
        temp = temp.replace("%", "")
        data_dictionary['one-year-change'] = temp
    except:
        pass

    try:
        outer = base.find('section',
                          {'class': 'zsg-content-section market-overview'})
        content_box = outer.find('ul', {'class': 'value-info-list'})
        all_li = content_box.find_all('li')
        ab = []
        for i in all_li:
            temp = i.find('span', {'class': 'value'}).text
            temp = temp.replace(" ", "")
            temp = temp.replace("\n", "")
            temp = temp.replace("%", "")
            temp = temp.replace("$", "")
            ab.append(temp)
        data_dictionary['zillow-value'] = ab[0]
        data_dictionary['one-year-forcast'] = ab[1]
        data_dictionary['median-listing-price'] = ab[2]
        data_dictionary['median-sale-price'] = ab[3]
    except:
        pass

    try:
        outer = base.find('section',
                          {'class': 'zsg-content-section market-health'})
        content_box = outer.find('ul', {'class': 'value-info-list'})
        all_li = content_box.find_all('li')
        ab = []
        for i in all_li:
            temp = i.find('span', {'class': 'value'}).text
            temp = temp.replace(" ", "")
            temp = temp.replace("\n", "")
            temp = temp.replace("%", "")
            ab.append(temp)
        if len(ab) >= 3:
            data_dictionary['avg-days-on-market'] = ab[0]
            data_dictionary['negative-equity'] = ab[1]
            data_dictionary['delinquincy'] = ab[2]
        else:
            data_dictionary['negative-equity'] = ab[0]
            data_dictionary['delinquincy'] = ab[1]
    except:
        pass
    if data_dictionary['negative-equity'] != 'Nodata':
        data_dictionary['negative-equity'] = round(
            float(data_dictionary['negative-equity']) / 100, 3)
    if data_dictionary['delinquincy'] != 'Nodata':
        data_dictionary['delinquincy'] = round(
            float(data_dictionary['delinquincy']) / 100, 3)

    try:
        outer = base.find('section',
                          {'class': 'zsg-content-section listing-to-sales'})
        content_box = outer.find('ul', {'class': 'value-info-list'})
        all_li = content_box.find_all('li')
        ab = []
        for i in all_li:
            temp = i.find('span', {'class': 'value'}).text
            temp = temp.replace(" ", "")
            temp = temp.replace("\n", "")
            temp = temp.replace("$", "")
            ab.append(temp)
        data_dictionary['price-sqft'] = ab[0]
    except:
        pass

    try:
        outer = base.find_all('section',
                              {'class': 'zsg-content-section region-info'})
        content_box = outer[1].find('ul', {'class': 'value-info-list'})
        spans = content_box.find_all('span', {'class': 'value'})
        ab = []
        for i in spans:
            temp = i.text
            temp = temp.replace(" ", "")
            temp = temp.replace("\n", "")
            temp = temp.replace("$", "")
            ab.append(temp)
        data_dictionary['rent-list-price'] = ab[1]
        data_dictionary['rent-sqft'] = ab[2]
    except:
        pass
    print(data_dictionary)

    dictf.append(data_dictionary)
    try:
        nearby = base.find('section',
                           {'class': 'zsg-content-section nearby-regions'})
        neighbourhoods = nearby.find('div', {
            'class': 'zsg-content-section'
        }).text.split()
        if 'Neighborhoods' in neighbourhoods:
            tables = nearby.find_all('table')
            for k in tables:
                at = k.find_all('a')
                for p in at:
                    n_n = p.text
                    n_l = p['href']
                    if n_n not in neighbour_names and n_l not in hrff:
                        neighbour_names.append(n_n)
                        hrff.append(n_l)
    except:
        pass
Example no. 19
def update_moose_tcg():
    moose_inventory = MooseInventory.objects
    start_time = time()
    # Entire Moose Loot Listed inventory
    listed_cards = api.get_category_skus('magic')
    if listed_cards['success'] is True:
        print(f"Updating {listed_cards['totalItems']} for Moose Inventory")
        for index, card in enumerate(listed_cards['results']):
            try:
                condition = card['conditionName']
                printing = card['printingName']
                print(index)
                if condition != 'Unopened':
                    current_price = card['currentPrice']
                    low = card['lowPrice']
                    if current_price != low:
                        sku = card['skuId']
                        product_id = card['productId']
                        name = card['productName']
                        expansion = card['groupName']
                        market = card['marketPrice']
                        language = card['languageName']
                        '''    
                        If the card is not English it will be priced at the low price minus one cent.

                        For each card in the MooseLoot inventory we will make a request to the tcgplayer page containing all seller data for a given 
                        product. 
                        We request and scan pages (10 results per page) until we find 2 listings with sellers that have 10,000 sales or more. We break the while loop 
                        once we have found those two listings and move on to the next card. In the case where only one or zero listings are found, 
                        we break the loop and use one price to match against or default to the market price.      
                        '''
                        if language != 'English' and printing != 'Foil':
                            # catch instances where there is no low price
                            try:
                                updated_price = low - .01
                            except TypeError:
                                updated_price = None

                            if updated_price is not None:
                                api.update_sku_price(sku_id=sku,
                                                     price=updated_price,
                                                     _json=True)
                        else:

                            card_data = {
                                'card_name': '',
                                'card_set': '',
                                'card_condition': '',
                                'seller_1_name': '',
                                'seller_1_total_sales': '',
                                'seller_2_name': '',
                                'seller_2_total_sales': '',
                                'seller_1_total_price': '',
                                'seller_2_total_price': '',
                                'updated_price': '',
                            }

                            next_page = True
                            page = 1
                            seller_data_list = []

                            while next_page is True:
                                request_path = url(product_id=product_id,
                                                   condition=condition,
                                                   foil=printing,
                                                   page=page)
                                r = requests.get(request_path).content
                                soup = B(r, 'html.parser')
                                data = soup.find_all(
                                    'div', {'class': 'product-listing '})

                                # Check if there are products in the request. If not that indicates no more listings and thus we break the loop
                                if not data:
                                    break
                                # loop over each item on the page and get Seller Info

                                for d in data:
                                    check = d.find('span',
                                                   {'class': 'seller__sales'})
                                    if check is not None:
                                        seller_total_sales = integers_from_string(
                                            d.find('span', {
                                                'class': 'seller__sales'
                                            }).text)
                                        seller_name = d.find(
                                            'a', {
                                                'class': 'seller__name'
                                            }).text.strip()
                                        seller_condition = d.find(
                                            'div', {
                                                'class':
                                                'product-listing__condition'
                                            }).text.strip()
                                        if seller_total_sales >= 10000 and seller_name != 'MTGFirst' and seller_name != 'Moose Loot' and condition == seller_condition:

                                            # seller_feedback = d.find('span', {'class': 'seller__feedback-rating'}).text
                                            # function extracts all floating points from string.
                                            price = float_from_string(
                                                d.find(
                                                    'span', {
                                                        'class':
                                                        'product-listing__price'
                                                    }).text)

                                            # Fail Safe in the case where html is changed and no real value is extracted
                                            if price is not None and price != 0:
                                                shipping = float_from_string(
                                                    d.find(
                                                        'span', {
                                                            'class':
                                                            'product-listing__shipping'
                                                        }).text.strip())

                                                # 25 would be extracted from shipping text that states "Free shipping over 25". We make this result 0 and
                                                # handle additional shipping costs using defaults
                                                if shipping == 25.:
                                                    shipping = 0

                                                # Default shipping added to cards under five.
                                                if price >= 5:
                                                    total_price = price + shipping
                                                else:
                                                    total_price = price

                                                # We are appending the two cheapest listings with 10,000 minimum sales and that meets other if requirements.
                                                # Break once we get 2
                                                seller_data_list.append(
                                                    total_price)
                                                if len(seller_data_list) == 1:
                                                    card_data[
                                                        'seller_1_name'] = seller_name
                                                    card_data[
                                                        'seller_1_total_sales'] = seller_total_sales
                                                    card_data[
                                                        'seller_1_total_price'] = total_price
                                                    card_data[
                                                        'card_name'] = name
                                                    card_data[
                                                        'card_set'] = expansion
                                                    card_data[
                                                        'card_condition'] = condition

                                                if len(seller_data_list) == 2:
                                                    card_data[
                                                        'seller_2_name'] = seller_name
                                                    card_data[
                                                        'seller_2_total_sales'] = seller_total_sales
                                                    card_data[
                                                        'seller_2_total_price'] = total_price
                                                    next_page = False
                                                    break
                                page += 1
                            '''
                            We will check the number of other seller listings.
                            If there were zero listings found we simply make the updated price the market price.

                            If just one listing is found, we run the price algorithm, which adds default shipping if needed and prices the card $0.01 less.

                            If there are 2 10,000+ listings, algorithm will compare and take the best/cheapest listings price
                            '''

                            if len(seller_data_list) == 1:
                                seller_data_list.append(0)

                            updated_price = moose_price_algorithm(
                                seller_data_list=seller_data_list,
                                market_price=market,
                                low_price=low,
                                condition=condition)
                            card_data['updated_price'] = updated_price

                            new = moose_inventory.create(
                                name=card_data['card_name'],
                                expansion=card_data['card_set'],
                                condition=card_data['card_condition'],
                                printing=printing,
                                seller_1_name=card_data['seller_1_name'],
                                seller_1_total_sales=card_data[
                                    'seller_1_total_sales'],
                                seller_1_total_price=card_data[
                                    'seller_1_total_price'],
                                seller_2_name=card_data['seller_2_name'],
                                seller_2_total_sales=card_data[
                                    'seller_2_total_sales'],
                                seller_2_total_price=card_data[
                                    'seller_2_total_price'],
                                updated_price=card_data['updated_price'],
                            )

                            new.save()

                            if updated_price is not None:
                                api.update_sku_price(sku_id=sku,
                                                     price=updated_price,
                                                     _json=True)
                                if index < 100:
                                    print(index, name, expansion, condition,
                                          printing)
                                    print(
                                        f"Current: {current_price}, Market: {market}, low: {low}, Updated: {updated_price}"
                                    )

            except Exception as e:
                print(e)
                subject = "Error on function to update MooseLoot tcg"
                message = f"Error on function to update MooseLoot tcg:\n {card}\n\nSeller Info: {seller_name, seller_total_sales}"
                mail_from = 'tcgfirst'
                mail_to = [
                    '*****@*****.**',
                ]
                send_mail(subject, message, mail_from, mail_to)

    end_time = time()
    elapsed = end_time - start_time
    subject = "Time elapsed for Moose Tcg Auto Price - 1 cycle"
    message = f"Time auto price completed: {elapsed} seconds"
    mail_from = 'tcgfirst'
    mail_to = [
        '*****@*****.**',
    ]
    send_mail(subject, message, mail_from, mail_to)
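
moose_price_algorithm itself does not appear in this snippet; going only by the docstrings above (zero qualifying listings -> use the market price, otherwise beat the cheapest qualifying total), a hypothetical sketch of that decision could look like the following. All names and the exact undercut amount are assumptions, not the real implementation:

def pick_updated_price(seller_data_list, market_price):
    # Hypothetical stand-in for moose_price_algorithm, inferred from the docstrings above.
    prices = [p for p in seller_data_list if p]  # drop the 0 padding value
    if not prices:
        return market_price                      # no qualifying listings: match the market
    return round(min(prices) - 0.01, 2)          # otherwise undercut the cheapest listing

# pick_updated_price([12.50, 11.99], 15.00) -> 11.98
# pick_updated_price([0, 11.99], 15.00)     -> 11.98
# pick_updated_price([], 15.00)             -> 15.0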
Example no. 20
driver = webdriver.Chrome(chromedriver, options=headless_options)

driver.get('https://www.bananamall.co.kr/index.php')
id_field = driver.find_element_by_name("id")
id_field.clear()
id_field.send_keys('amen03')
pw_field = driver.find_element_by_name("passwd")
pw_field.send_keys('2435570js!')
pw_field.send_keys(Keys.RETURN)

time.sleep(2)

driver.get('https://www.bananamall.co.kr/etc/womanizer_sp_v1.php?cl=womanizer')
html = driver.page_source
soup = B(html, 'html.parser')
brand_name = driver.find_elements_by_xpath(
    '/html/body/div[2]/div[2]/div[6]/div[2]/ul/li/a/span[2]')
brand_url = soup.select(
    'div.contents > div.brands.brands_cate.brands_cate_whole.clearfix > ul > li > a'
)
brand_list = []

for num, item in enumerate(brand_name):
    brand_list.append([item.text, brand_url[num]['href']])

for item_url in brand_list:
    driver.get('https://www.bananamall.co.kr' + item_url[1])
    loop, count = True, 0

    while loop and count < 10:
Example no. 21
def btc():
    html = urlopen('http://bitkurs.ru/')
    soup = B(html)
    usd = soup.find('span', 'usd_c currencies').text
    rub = soup.find('span', 'rub_c currencies').text
    return '1 BTC = %s или %s' % (usd, rub)
Example no. 22
end_page = int(
    driver.find_element_by_css_selector(
        '#container > div > nav > span > strong').text) + 1
print(end_page)

num1 = 1

for page in range(1, end_page):
    driver.get(
        'https://msdepart.com/adm/shop_admin/orderlist.php?od_status=&od_settle_case=&od_misu=&od_cancel_price=&od_refund_price=&od_receipt_point=&od_coupon=&fr_date='
        + start_data + '&to_date=' + end_date +
        '&sel_field=od_id&search=&save_search=&sort1=od_id&sort2=desc&page=' +
        str(page))

    html = driver.page_source
    soup = B(html, 'html.parser')

    data_raw = soup.select(
        '#sodr_list > tbody > tr > td.td_mng.td_mng_s > a > span.sound_only')

    driver.implicitly_wait(delay)

    num2 = 0

    for index in data_raw:
        site = 'https://msdepart.com/shop/orderinquiryview.php?od_id=' + index.text

        driver.get(site)

        driver.implicitly_wait(delay)
Example no. 23
        reader = csv.DictReader(csvfile)
        # reader = itertools.islice(csv.DictReader(csvfile), 1000)

        for row in reader:
            url = row['url']
            if (url):
                wd = webdriver.Chrome()
                wd.get(url)

                # And grab the page HTML source
                html_page = wd.page_source
                wd.quit()

                # Now you can use html_page as you like
                soup = B(html_page, "lxml")

                image_url = soup.find('link', {
                    'itemprop': 'image'
                }).get('href')

            else:
                image_url = 'https://vignette.wikia.nocookie.net/international-entertainment-project/images/9/94/SpongeBob_SquarePants_%28SpongeBob_SquarePants%29.png'
                #failed_url_list.append(url)
            image_url_list.append(image_url)
except:
    pass

csv_input['image_url'] = image_url_list
csv_input.to_csv('Data/output1.csv', index=False)
Example no. 24
    def detail_parse(self, url):
        # url = company_dic.get('companyUrl')
        self.driver.get(url)
        time.sleep(3)
        # print(self.driver.page_source)
        try:
            html = B(str(self.driver.page_source), 'lxml')
            header_html = html.find('div', class_='detail')
        except Exception as e:
            log.error('Failed to get header info {}'.format(e))
            return
        divs = header_html.find_all('div', class_='in-block')
        header_data = {}
        # Company logo
        try:
            header_data['clUrl'] = html.find('div', class_='logo -w100').attrs['data-src']
        except Exception:
            pass
        # Former name
        try:
            header_data['usedName'] = html.find('div', class_='history-content').get_text(strip=True)
        except Exception:
            pass
        # Listing (IPO) info
        try:
            # Stock board
            header_data['plate'] = html.find('span', class_='line').get_text()
            bond = html.find('span', class_='bond').get_text()
            bond_name = html.find('span', class_='bond_name').get_text()
            # Stock ticker
            header_data['stockNum'] = bond + bond_name
        except Exception:
            pass
        # Basic header info
        header_dic = {
            '电话:': 'companyTel',
            '邮箱': 'companyEmail',
            '网址': 'companyWebeUrl',
            '地址': 'registerAddress',
            '简介': 'companyBrief',
        }
        for name, value in header_dic.items():
            try:
                if '网址' in name:
                    header_data[value] = header_html.find('span', text='网址:').find('a').get_text(strip=True)
                else:
                    header_data[value] = header_html.find('span', text=name).find('script', attrs={'type': 'text/html'}).get_text(strip=True)
            except Exception:
                pass

        """工商信息"""
        base_data = {}
        try:
            tables = html.find('div',id='_container_baseInfo').find_all('table')
        except Exception:
            log.error('Failed to get business registration info')
            return

        # Legal representative and their photo/logo
        try:
            base_data['legalMan'] = tables[0].find('div',class_='humancompany').get_text(strip=True)
            base_data['mlUrl'] = tables[0].find('div',class_='lazy-img -image').img.attrs['data-src']
        except Exception:
            pass

        # Registered capital
        try:
            base_data['registerMoney'] = tables[0].find_all('tr', recursive=False)[0].find_all('td',recursive=False)[1].find_all('div',recursive=False)[1].attrs['title']
        except Exception:
            pass

        # Registration date
        try:
            # base_data['registerTime']
            registerTimes = tables[0].find_all('tr', recursive=False)[1].td.find_all('div',recursive=False)[1].get_text(strip=True).split('-')
            for registerTime in registerTimes:
                for _ in registerTime:
                    pass
        except Exception:
            pass
Example no. 25
    def parse_and_get_list_company(self):
        '''
        Get the company data from the listing page
        :return:
        '''
        """Alternative strategy: don't match on the exact class values (they change); locate the tags and match with a regex instead"""
        try:
            # Get the page HTML
            html = B(self.driver.page_source, 'html.parser')
            # Find all outer company blocks
            div_lists = html.find_all('div',
                                      attrs={'data-id': re.compile('\d+')})
        except Exception as e:
            log.info('[error]: could not find the outer company blocks {}'.format(e))
            self.db.account_results.update_one(
                {'_id': self.accunt_item['_id']}, {'$set': {
                    "flag": 0
                }})
            self.driver.quit()
            quit()

        for div in div_lists:
            dic = {}
            try:
                tmp = div.find(
                    'a',
                    attrs={
                        'href':
                        re.compile('https://www.tianyancha.com/company/\d+')
                    })
                # Company name
                dic['companyName'] = tmp.get_text(strip=True)
                # Company URL
                dic['companyUrl'] = tmp.attrs['href']
                # Business status
                dic['businessState'] = tmp.next_sibling.get_text(strip=True)
                # Province the company belongs to
                dic['companyProvince'] = div.contents[2].get_text(strip=True)
            except Exception as e:
                log.info('[error]: {}'.format(e))
                quit()
                # log.info('[error]: {}'.format(e))
            # Legal representative / registered capital / registration date / phone / email / representative info
            tags = div.contents[1].contents[1:-2]

            data = []
            for tag in tags:
                for _ in tag.contents:
                    data.append(_.get_text(strip=True))
            # Further split the preliminarily parsed text
            tmp_dic = {
                '法定代表人': 'legalMan',
                '代表人': 'representMan',
                '负责人': 'chargeMan',
                '注册资本': 'registerMoney',
                '注册时间': 'registerTime',
                '联系电话': 'companyTel',
                '邮箱': 'companyEmail',
            }
            for _ in data:
                key, value = _.split(":")
                # There may be more than one phone number
                if key in ['法定代表人', '代表人', '负责人']:
                    # Legal representative URL
                    try:
                        dic['manUrl'] = div.find('a', attrs={
                            'title': value
                        }).attrs['href']
                    except Exception as e:
                        log.info('[error]: failed to get the legal-representative link {}'.format(e))
                if key in ['联系电话', '邮箱']:
                    try:
                        tel_lists = re.search('.*\[(.*)\].*',
                                              value.replace(
                                                  '\"',
                                                  '')).group(1).split(',')
                    except Exception:
                        tel_lists = [value]
                    dic[tmp_dic[key]] = tel_lists
                else:
                    dic[tmp_dic[key]] = value
            # Save the data
            self.save_data(dic)
        """---------"""
Example no. 26
import smtplib
import requests as R
from bs4 import BeautifulSoup as B
import datetime

url = 'https://news.ycombinator.com'
req = R.get(url)
data = req.text
soup = B(data, "html.parser")

#lt_text = soup.find_all("a",class_ = 'storylink').string.strip()
#lt_link = soup.find_all("a",class_ = 'storylink').get('href')

lt_text = soup.find_all("a", class_='storylink')

email_text = ''

for i in range(10):
    email_text += str(i +
                      1) + '. ' + lt_text[i].string + ' \n ' + lt_text[i].get(
                          'href') + ' \n '

#print(email_text.encode('utf-8'))

email_text = email_text.encode('utf-8')

subject = 'HackerNews Headlines : ' + str(datetime.date.today())

gmail_user = '******'  #mailid
gmail_password = '******'  #password
Example no. 27
from bs4 import BeautifulSoup as B

html = """<html><head><title>test site</title></head>
        <body> <p>test1</p> <p id='d'>test2</p> <p>test3</p>
        </body></html>"""

soup = B(html, 'lxml')

print(soup.find_all(id='d'))
Example no. 28
 def read_html(self):
     with open('tyc.html', 'r', encoding='utf-8') as f:
         html = B(f.read(), 'html.parser', )
     return html
Example no. 29
def nowtask(nowlnk):
    taskList.append(nowlnk)
    ss=B(ses.get(nowlnk).text,"html.parser")
    updateTodo(ss)
    getMsg(ss)
    taskList.remove(nowlnk)
Example no. 30
    def html_detail(self):
        """
        解析模块,解析列表页面
        :return: 解析后的数据
        :rtype:dict or None
        """
        """另一种策略,不具体定位class里面的值,因为class里面的值会变,所以定位到标签,再用正则做匹配"""
        try:
            # Get the page HTML
            html = B(self.driver.page_source, 'html.parser')
            # Find all outer company blocks
            # div_lists = html.find_all('div', attrs={'data-id': re.compile('\d+')})
            div_lists = html.find(
                'div', class_='result-list sv-search-container').find_all(
                    'div', attrs={'data-id': re.compile('\d+')})
        except Exception as e:
            self.log.error('Could not find the outer company blocks {}'.format(e))
            return

        items = []

        for div in div_lists:
            dic = {}
            # Basic-info JSON
            try:
                base_txt = str(div.find('span', class_='tt hidden').get_text())
            except Exception:
                base_txt = None
            # If the basic-info JSON is missing (e.g. for public institutions), fall back to the logic below
            if base_txt:
                # Basic-info dict
                base_txt = base_txt.replace('\"\"', 'None').replace(
                    '\'', '\"').replace('null',
                                        'None').replace('true', 'True')
                try:
                    base_dic = self.re_sub(base_txt)
                except Exception:
                    base_txt = base_txt.replace('\"\"', 'None').replace(
                        '\'', '\"').replace('null',
                                            'None').replace('true', 'True')
                    base_dic = eval(base_txt)
                # Company name
                dic['companyName'] = base_dic.get('name')
                # Company URL
                dic['companyUrl'] = "https://www.tianyancha.com/company/{}".format(
                    base_dic.get('id'))
                # Business status
                dic['businessState'] = base_dic.get('regStatus')
                # Province
                dic['companyProvince'] = base_dic.get('base')
                # Registered capital
                dic['registerMoney'] = base_dic.get('regCapital')
                # Registration date
                dic['registerTime'] = base_dic.get('estiblishTime').split(
                    ' ')[0]
                # Contact phone
                dic['companyTel'] = base_dic['phoneList'] if base_dic.get(
                    'phoneList') else ''
                # Email
                dic['companyEmail'] = base_dic.get(
                    'emailList') if base_dic.get('emailList') else ''
                # Unified social credit code
                dic['creditCode'] = base_dic.get('creditCode')
                # Registered address
                dic['registerAddress'] = base_dic.get('regLocation')
                # Business scope
                dic['businessScope'] = base_dic.get('businessScope')
                # Industry
                dic['industry'] = base_dic.get('categoryStr')
                # City
                dic['companyCity'] = base_dic.get('city')
                # District
                dic['companyArea'] = base_dic.get('district')
                # Basic-info JSON
                dic['base_txt'] = base_txt
            else:
                try:
                    tmp = div.find(
                        'a',
                        attrs={
                            'href':
                            re.compile(
                                'https://www.tianyancha.com/company/\d+')
                        })
                    # Company name
                    dic['companyName'] = tmp.get_text(strip=True)
                    # Company URL
                    dic['companyUrl'] = tmp.attrs['href']
                except Exception as e:
                    self.log.error(e)
                    continue
                # Business status
                try:
                    dic['businessState'] = tmp.next_sibling.get_text(
                        strip=True)
                except Exception:
                    dic['businessState'] = ''
                # Company's province
                try:
                    dic['companyProvince'] = div.contents[3].get_text(
                        strip=True)
                except Exception:
                    dic['companyProvince'] = ''

                # Legal representative / registered capital / registration date / phone / email / representative info
                tags = div.contents[2].contents[1:-1]
                data = []
                for tag in tags:
                    for _ in tag.contents:
                        data.append(_.get_text(strip=True))
                # Further split the preliminarily parsed text
                tmp_dic = {
                    '法定代表人': 'legalMan',
                    '代表人': 'representMan',
                    '负责人': 'chargeMan',
                    '注册资本': 'registerMoney',
                    '资本总额': 'registerMoney',
                    '注册时间': 'registerTime',
                    '联系电话': 'companyTel',
                    '邮箱': 'companyEmail',
                }
                for _ in data:
                    try:
                        key, value = _.split(":")
                    except Exception:
                        continue
                    # There may be more than one phone number
                    if key in ['法定代表人', '代表人', '负责人']:
                        # Legal representative URL
                        try:
                            dic['manUrl'] = \
                            div.find('a', attrs={'title': value}).attrs['href']
                        except Exception:
                            pass
                            # log.error('Failed to get the legal-representative link')
                    if key in ['联系电话', '邮箱']:
                        try:
                            tel_lists = re.search('.*\[(.*)\].*',
                                                  value.replace(
                                                      '\"',
                                                      '')).group(1).split(',')
                        except Exception:
                            tel_lists = [value]
                        dic[tmp_dic[key]] = tel_lists
                    else:
                        try:
                            dic[tmp_dic[key]] = value
                        except Exception:
                            pass
            # Save the data
            items.append(dic)
        return items if items else None