def html_to_match_objects(html_page):

    soup = BeautifulSoup(html_page, 'html.parser')

    title_div = soup.div(class_="sidebar-body-title ng-binding")

    if title_div != "":
        title = title_div[0].text
        print(title)

    match_data = soup.div(class_="sidebar-body-item ng-scope")

    if not match_data:
        return

    string_list = []

    # get all table content
    for match in match_data[0].find_all('tr'):
        if match != "":
            string_list.append(match.text)

    data_list = string_to_list(string_list)

    return generate_objects(data_list)
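All of these snippets lean on the same BeautifulSoup shorthand: soup.div is attribute access that returns the first <div> in the document, and calling a tag object is equivalent to calling find_all() on it, so soup.div(class_="...") searches inside that first <div> and always returns a list (possibly empty), which is why the examples index it with [0] or test its truthiness. A minimal sketch of that behaviour, using made-up HTML rather than any of the pages scraped above:

from bs4 import BeautifulSoup

# Invented markup purely for illustration.
html = '<div id="wrap"><div class="title">Hello</div><div class="title">World</div></div>'
soup = BeautifulSoup(html, 'html.parser')

first_div = soup.div                  # attribute access: the first <div> in the document
titles = soup.div(class_="title")     # calling a tag == find_all() inside that first <div>
print(first_div["id"])                # wrap
print([t.text for t in titles])       # ['Hello', 'World']
print(soup.div(class_="missing"))     # [] -- an empty list, never None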
Example #2
def function3(link4):
	'''scrape all the phone attributes from a spec page, e.g. http://www.gsmarena.com/apple_ipad_air-5797.php,
	and add them to the database'''
	try:
		dict_y_update()
		data3 = requests.get(link4)
		soup5 = BeautifulSoup(data3.text)
		soup6 = BeautifulSoup(str(soup5.div(id="specs-list")))

		def empty_tbl_dt1():
				p = "_" + tbl_dt1.get_text().replace(u'\xa0','h').replace(u'\xc2','h').replace(u' ','_').replace(u'.','_').replace(u'-','_')
				if p == ("_h"):
					return "_" + table.th.get_text() + "_Extra"
				else: 
					if p == ('_hh'):
						return "_" + table.th.get_text() + "_Extra"
					else:
						return p
		Comments = ''
		for para in soup6.find_all('p'):
			Comments = Comments + para.get_text() + ". "
		Extra_comments = str(Comments.encode('utf-8').replace('[. ',''))
		model_name = soup5.h1.get_text()
		img_url = BeautifulSoup(str(soup5.div(id="specs-cp-pic"))).img["src"]
		print model_name, img_url , Extra_comments
		company = model_name.split()[0]
		try:
			connect_to_db()
			cur.execute("SELECT * from models where model_name = (%s)",[model_name])
			cur.fetchone()[1]
			print "--------------Alredy Present In Database-------------------"
		except:
			for table in soup6.find_all("table"):
				soup7 = BeautifulSoup(str(table))

				for tbl_dt1,tbl_dt2 in zip(soup7.find_all("td",class_="ttl"),soup7.find_all("td",class_="nfo")):
						#print empty_tbl_dt1(), "---------" , tbl_dt2.get_text().encode("utf-8")
						y.update({"_model_name":model_name,"_gsm_link":link4,"img_url":img_url,"Extra_comments":Extra_comments,"company":company})
						y.update({empty_tbl_dt1():tbl_dt2.get_text().encode("utf-8")})

			print "--------------------------------------------------------"

			for key,value in dict.items(y):
					print key ,"--" ,value
			print "--------------------------------------------------------"
			print "--------------------------------------------------------"
			connect_to_db()
			cur.execute("INSERT INTO models (model_name, Company, Gsm_link, Extra_comments, Image_url, _2G_Network, _3G_Network, _4G_Network, Sim, Announced, Status, General_Extra, Dimensions, Weights, Keyboard, Body_Extra, Type, Size, Multitouch, Protection, Display_Extra, Alert_Types, Loudspeaker, _3_5mm_jack, Sound_extra, Card_Slot, Internal, Phonebook, Call_Records, Memory_Extra, GPRS, EDGE, Speed, WLAN, Bluetooth, Infrared_Port, USB, NFC, DATA_Extra, _Primary, Features, Video, Secondary, Camera_Extra, OS, Chipset, CPU, GPU, Sensors, Messaging, Browser, Radio, GPS, Java, Colours, Games, Clock, Alarm, Languages, Features_Extra, Battery_Extra, Stand_By, Talk_Time, Music_Play, Price_Group, SAR_US, SAR_EU, MISC_Extra ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",[y['_model_name'],y['company'],y['_gsm_link'],y['Extra_comments'],y['img_url'],y['_2G_Network'],y['_3G_Network'],y['_4G_Network'],y['_SIM'],y['_Announced'],y['_Status'],y['_General_Extra'],y['_Dimensions'],y['_Weight'],y['_Keyboard'],y['_Body_Extra'],y['_Type'],y['_Size'],y['_Multitouch'],y['_Protection'],y['_Display_Extra'],y['_Alert_types'],y['_Loudspeaker_'],y['_3_5mm_jack_'],y['_Sound_Extra'],y['_Card_slot'],y['_Internal'],y['_Phonebook'],y['_Call_records'],y['_Memory_Extra'],y['_GPRS'],y['_EDGE'],y['_Speed'],y['_WLAN'],y['_Bluetooth'],y['_Infrared_port'],y['_USB'],y['_NFC'],y['_DATA_Extra'],y['_Primary'],y['_Features'],y['_Video'],y['_Secondary'],y['_Camera_Extra'],y['_OS'],y['_Chipset'],y['_CPU'],y['_GPU'],y['_Sensors'],y['_Messaging'],y['_Browser'],y['_Radio'],y['_GPS'],y['_Java'],y['_Colors'],y['_Games'],y['_Clock'],y['_Alarm'],y['_Languages'],y['_Features_Extra'],y['_Battery_Extra'],y['_Stand_by'],y['_Talk_time'],y['_Music_play'],y['_Price_Group'],y['_SAR_US'],y['_SAR_EU'],y['_MISC_Extra']])
			disconnect_to_db()
			y.clear()
			dict_y_update()
	except:
		connect_to_db()
		cur.execute("INSERT INTO errorlogs (error_link) VALUES (%s)",[link4])
		disconnect_to_db()
		y.clear()
		dict_y_update()
Example #3
def gettingOldData():
    # open the saved history file and parse it (the original parsed the filename string itself)
    with open("historyUser.html", "r") as fp:
        droup = BeautifulSoup(fp, "html.parser")
    oldDiv = droup.div("target")
    return oldDiv
Example #4
    def parse_commit(self, branch):
        N=test_last_page(baseURL+branch.commit_url)
        print 'Branch: %s' % branch.branch_name
        print 'Total pages:%s' % N
        for i in range(N, 0, -1):
            try:
                req=urllib2.urlopen(baseURL+branch.commit_url+'?page='+str(i))
                result=req.read()
                soup=BeautifulSoup(result)
                commit_list=[]
                for d in soup.div():
                    if d.has_attr('class') and 'js-navigation-container' in d.attrs['class']:
                        h3_list=d.findAll('h3')
                        ol_list=d.findAll('ol')
                        if len(h3_list)==len(ol_list):
                            for index in range(len(h3_list)):
                                h3_date=datetime.datetime.strptime(h3_list[index].string, '%b %d, %Y').date()
                                for li in ol_list[index].findAll('li'):
                                    commit=Commit(li.p.a['href'], h3_date)
                                    commit.parse_parent_info()
                                    sys.stderr.write('Parent info %s\n' % '\t'.join(commit.parent_sha_list))
                                    commit_list.append(commit)
                        else:
                            print 'Error! h3 and ol do not match!'
                commit_list.reverse()
                for commit in commit_list:
#                    self.branch_commit_fp.write('%s %s %s %s\n' % (branch.branch_name, commit.commit_sha, commit.commit_date.strftime('%m/%d/%Y'), '\t'.join(commit.parent_sha_list)))
                    self.logger.info('Commit:%s (%s) in Branch:%s Parent:%s' % (commit.commit_sha, commit.commit_date.strftime('%m/%d/%Y'), branch.branch_name, '\t'.join(commit.parent_sha_list)))
                    if commit not in self.visited_commit:
#                        self.retrieve_commit(commit)
                        self.visited_commit.add(commit)
            except urllib2.HTTPError, e:
                print e
def scrape(serial_number):
    scraper = PyScraper()
    #scraper.get('http://www.cpic-cipc.ca/English/searchformbikes.cfm')
    
    url = 'http://app.cpic-cipc.ca/English/searchFormResultsbikes.cfm'
    raw_params = {
        'ser': serial_number,
        #'sType': 'Bicycles',
        'Submit': 'Begin Search',
    }
    
    params = urllib.urlencode(raw_params)
    data = scraper.post(url, params)

    soup = BeautifulSoup(data)
    entries = []
    main = soup.div(id='wb-main-in')
    hrs = soup.findAll('hr',title="")
    for hr in hrs:
        entry = {}
        p = hr.find_next_sibling("p")
        entry = {
          'Status': p.find("strong", text="Status:").find_all_next(text=True)[1],
          'Serial': p.find("strong", text="Status:").find_all_next(text=True)[4],
          'Make'  : p.find("strong", text="Status:").find_all_next(text=True)[7],
          'Model' : p.find("strong", text="Status:").find_all_next(text=True)[10],
          'Colour': p.find("strong", text="Status:").find_all_next(text=True)[13],
          'Speeds': p.find("strong", text="Status:").find_all_next(text=True)[16]
        }
        #print entry
        entries.append(entry)

    return entries
Example #6
def parse_company_urls(html):
    soup = BeautifulSoup(html, 'lxml')
    pages = [{
        'company': div.a.text,
        'url': div.a.attrs['href']
    } for div in soup.div(class_='mp_cassette_title')]
    return pages
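Because parse_company_urls only needs a string of HTML, it is easy to exercise with a small hand-written fragment; the markup and company names below are invented to mirror the mp_cassette_title structure the function expects, not taken from the real site:

from bs4 import BeautifulSoup

def parse_company_urls(html):
    # copied verbatim from the example above
    soup = BeautifulSoup(html, 'lxml')
    pages = [{
        'company': div.a.text,
        'url': div.a.attrs['href']
    } for div in soup.div(class_='mp_cassette_title')]
    return pages

# Invented sample markup; the outer <div> matters because soup.div(...) searches
# inside the first <div> of the document.
sample = """
<div id="listing">
  <div class="mp_cassette_title"><a href="/company/1">Acme Corp</a></div>
  <div class="mp_cassette_title"><a href="/company/2">Globex</a></div>
</div>
"""
print(parse_company_urls(sample))
# [{'company': 'Acme Corp', 'url': '/company/1'}, {'company': 'Globex', 'url': '/company/2'}]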
Example #7
def function1(link3):
	'''returns all the pages to be scraped for a particular company, e.g. http://www.gsmarena.com/samsung-phones-9.php
	It extracts all the navigation links present at the bottom of the page'''
	data2 = requests.get(link3)
	soup3 = BeautifulSoup(data2.text)
	soup4= BeautifulSoup(str(soup3.div(class_="nav-pages")))
	if soup4.get_text() == '[]': # some pages have no navigation links, hence this check
		print link3
		function2(link3)
		print "----------------------------------------------"
	else:
		print link3
		function2(link3)
		link1=  "http://www.gsmarena.com/"+ soup4.a['href']
		print link1
		function2(link1)
		for links in soup4.find_all('a'):
			link2 = "http://www.gsmarena.com/" + links['href']
			if link2 == link1:
				pass
			else:
				link5 = "http://www.gsmarena.com/" + links['href']
				print link5
				function2(link5)
		print "-----------------------------------------------------------------"
Example #8
def main():
    target = "https://www.biqubao.com/book/17570/"
    save_path = 'E:/' \
                ''
    index_path = 'https://www.biqubao.com'
    req = requests.get(url=target)
    req.encoding = 'gbk'
    # gbk is the site's encoding
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.div(id="list")
    print(type(list_tag))
    #find title
    title = list_tag[0].dl.dt.string
    path = save_path + '/' + title
    if not os.path.exists(path):
        os.path.join(save_path, title)
        os.mkdir(path)
    print(1)

    for tag in list_tag[0].dl.find_all('dd'):
        chapter_name = tag.string
        print(2)
        chapter_url = index_path + tag.a.get("href")
        chapter_req = requests.get(url=chapter_url)
        chapter_req.encoding = "gbk"
        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
        text = chapter_soup.div.find(id="content")
        print(type(text))
        content_text = str(text.text.replace('\xa0', '\n'))
        with open(path + '/' + chapter_name + '.txt', 'w') as f:
            f.write('本文网址:' + chapter_url)
            f.write(content_text)
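The req.encoding = 'gbk' line above (and in several later examples) exists because requests guesses the encoding from the HTTP headers and the guess does not match what this site actually sends. A hedged alternative, assuming charset detection on the response body is reliable enough for these pages, is to fall back on Response.apparent_encoding instead of hard-coding the codec:

import requests

resp = requests.get("https://www.biqubao.com/book/17570/")  # URL from the example above
# requests sets resp.encoding from the Content-Type header; when that is missing or
# the generic ISO-8859-1 default, use the encoding detected from the response body.
if not resp.encoding or resp.encoding.lower() == "iso-8859-1":
    resp.encoding = resp.apparent_encoding  # expected to come back as GBK/GB2312 here
print(resp.encoding)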
Example #9
def function1(link3):
    '''returns all the pages to be scraped for a particular company, e.g. http://www.gsmarena.com/samsung-phones-9.php
    It extracts all the navigation links present at the bottom of the page'''
    data2 = requests.get(link3)
    soup3 = BeautifulSoup(data2.text, "lxml")
    soup4 = BeautifulSoup(str(soup3.div(class_="nav-pages")), "lxml")
    if soup4.get_text() == '[]':  # some pages have no navigation links, hence this check
        #print("NoNav")
        #print (link3)
        function2(link3)
        #print ("-------------------No1--------------------------")
    else:
        #print("Nav")
        #print (link3)
        function2(link3)
        link1 = "http://www.gsmarena.com/" + soup4.a['href']
        #print (link1)
        function2(link1)
        for links in soup4.find_all('a'):
            link2 = "http://www.gsmarena.com/" + links['href']
            if link2 == link1:
                pass
            else:
                link5 = "http://www.gsmarena.com/" + links['href']
                print(link5)
                function2(link5)
Example #10
def get_url(url):
    print('GET URL')
    try:
        contents = urllib.request.urlopen(url).read()
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, "html5lib")
        print('CONTENIDOS:', contents)
        print('PAGINA:', page)

        # Title
        titulo = soup.title.string
        print('TITULO:', titulo)
        # print('PARSE:', soup.div(id='contenedor_central'))
        # print('PARSE:', soup.div(id='principal'))
        # print('PARSE:', soup.div(id='fecha_creditos'))
        # Date
        fecha = soup.div(id='fecha_actividad')
        print('FECHA:', fecha)
        # Type
        tipo = soup.div(id='online')
        print('TIPO:', tipo)
        # Centre
        centro = soup.div(id='centro')
        print('CENTRO:', centro)

        # Speakers
        ponentes = soup.find_all(['a'], href=re.compile('idponente'))
        print('PONENTES:', ponentes)

        exit(0)

        print('PARSE:', soup.div(id='actividad'))
        print('PARSE0:',
              soup.find_all(['div'], attrs={"class": "contenedor_actividad"}))
        print(
            'PARSE0:',
            soup.find_all(['div'], attrs={"class":
                                          'cabeceraDetalleActividad'}))
        print('PARSE0:',
              soup.find_all(['div'], attrs={"class": 'cajasActividad'}))
        #print('PARSE LIMPIO:', soup.prettify())

        return soup.title.string
    except Exception as e:
        return e
Example #11
def function2(phn_links):
	'''returns all the phone links on a maker page, e.g. http://www.gsmarena.com/amazon-phones-76.php'''
	#phn_links = "http://www.gsmarena.com/amazon-phones-76.php"
	phn_links_page = requests.get(phn_links)
	phn_soup = BeautifulSoup(phn_links_page.text)
	phn_soup2 = BeautifulSoup(str(phn_soup.div(class_="makers")))
	for link in phn_soup2.find_all('a'):
		link = "http://www.gsmarena.com/" + link['href']
		function3(link)
Example #12
    def fetch_data():
        """
        从小说网站直接获取小说
        :return:
        """
        # homepage of the novel to crawl
        target = "https://www.biqubao.com/book/17570/"
        # local save path
        save_path = "E:/爬虫练习/spider_data/small_story"

        # root URL of the site to crawl
        index_path = "https://www.biqubao.com"

        req = requests.get(target)
        # requests' default encoding does not match the site's response, so switch to the gbk the site uses
        print(req.headers)
        exit(0)
        print(req.encoding)
        req.encoding = 'gbk'

        # parse the HTML
        soup_object = BeautifulSoup(req.text, "html.parser")
        list_tag = soup_object.div(id="list")
        # print the link elements pointing to each chapter
        print('list_tag:', list_tag)

        # get the novel title
        story_title = list_tag[0].dl.dt.string
        # create a folder named after the novel if it does not exist yet
        dir_path = save_path + '/' + story_title
        if not os.path.exists(dir_path):
            os.path.join(save_path, story_title)
            os.mkdir(dir_path)

        i = 0
        # loop over every chapter, collecting its name and URL
        for ddTag in list_tag[0].dl.find_all('dd'):
            i += 1
            # chapter name
            chapter_name = ddTag.string
            # chapter URL
            chapter_url = index_path + ddTag.a.get('href')
            # request the chapter page and scrape its body text
            chapter_req = requests.get(url=chapter_url)
            chapter_req.encoding = 'gbk'
            chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")

            # locate the tag that holds the body text:
            content_tag = chapter_soup.div.find(id="content")
            # get the body text, replacing non-breaking spaces with newlines
            content_text = str(content_tag.text.replace('\xa0', '\n'))

            # write the chapter to a txt file named after it
            with open(dir_path + '/' + chapter_name + '.txt', 'w') as f:
                f.write('本文网址: ' + chapter_url)
                f.write(content_text)
Example #13
def topaqu(target, index_path, type):
    global q, all_novel
    try:
        req = requests.get(url=target)
        # requests' default encoding does not match the site's response, so switch to the gbk the site uses
        print(req.encoding)
        req.encoding = 'gbk'
        # parse the html
        soup = BeautifulSoup(req.text, "html.parser")
        list_tag = soup.div(id="list")
        if len(list_tag) < 1:
            list_tag = soup.findAll(name="div", attrs={"class": "listmain"})
        print('list_tag:', list_tag)
        # get the novel title
        story_title = list_tag[0].dl.dt.string
        # create a folder named after the novel if it does not exist yet
        dir_path = save_path + '/' + story_title
        if not os.path.exists(dir_path):
            os.path.join(save_path, story_title)
            os.mkdir(dir_path)
        # loop over every chapter, collecting its name and URL
        q = collections.deque()
        num = 0
        for dd_tag in list_tag[0].dl.find_all('dd'):
            num = num + 1
            # chapter name
            chapter_name = dd_tag.string
            # chapter URL
            chapter_url = index_path + dd_tag.a.get('href')
            # novelList[str(dd_tag.a.get('href')).split("/")[-1].split(".")[0]] = chapter_name + ';' + chapter_url
            print(str(dd_tag.a.get('href')).split("/")[-1].split(".")[0])
            print(chapter_name + ';' + chapter_url)
            q.append(
                str(dd_tag.a.get('href')).split("/")[-1].split(".")[0] + ';' +
                chapter_name + ';' + chapter_url)
        print("一共_____:" + str(num))
        threads = []
        all_novel = {}
        for i in range(0, 6):
            t = threading.Thread(target=get_zj, args=(i, ))
            threads.append(t)
            t.start()
        for j in threads:
            j.join()
        print("********所有线程执行完毕************")
        novel = sorted(all_novel.items(), key=lambda x: x[0])
        txt = open(dir_path + '/' + story_title + '.txt',
                   'a',
                   encoding="utf-8")
        for a in novel:
            # write the current chapter into the txt file
            txt.write(a[1])
            txt.write('\n')
        txt.close()
    except Exception as e:
        print("发送异常" + str(e))
Example #14
def add_column_timestamp(db_conn, alter_table=False):
    """
	Agrega columna timestamp a la tabla según la fecha
	de consumo parseada del url_review del usuario
	"""

    c = db_conn.cursor()

    table_name = 'user_reviews'
    col_timestamp = 'timestamp'
    reviews_path = "/mnt/f90f82f4-c2c7-4e53-b6af-7acc6eb85058/crawling_data/goodreads_crawl/user_reviews/"

    if alter_table:
        c.execute("ALTER TABLE {0} ADD COLUMN {1} {2}".format(
            table_name, col_timestamp, 'INTEGER'))

    c.execute("SELECT * FROM {0}".format(table_name))
    all_rows = c.fetchall()

    i = 0
    for tupl in all_rows:
        logging.info(
            "-> Viendo tupla {0} de {1}. Usuario: {2}, Review: {3}".format(
                i, len(all_rows), tupl[0], tupl[1]))
        i += 1

        try:
            with open(reviews_path + tupl[1] + '.html', 'r') as fp:
                soup = BeautifulSoup(fp, 'html.parser')
        except Exception as e:
            logging.info("No se pudo abrir HTML {0}. Error: {1}".format(
                tupl[1], e))
            continue

        try:
            date = int(
                soup.div(class_='dtreviewed')[0].find_all(
                    'span', class_='value-title')[0]['title'].replace('-', ''))
        except Exception as e:
            logging.info("No se pudo parsear fecha")
            continue

        try:
            c.execute( "UPDATE {0} SET {1} = '{2}' WHERE user_id = {3} AND url_review = '{4}'"\
             .format( table_name,
                  col_timestamp,
                  date,
                  tupl[0],
                  tupl[1] ))
        except sqlite3.IntegrityError:
            logging.info('ERROR ACTUALIZANDO VALORES: {0}'.format(tupl[1]))
            continue

    db_conn.commit()
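The UPDATE above builds its SQL by formatting the values directly into the string. A safer sketch, assuming db_conn is the same sqlite3 connection used by add_column_timestamp (the helper name update_timestamp is mine), binds the values as parameters so the driver handles quoting; only the identifiers, which cannot be bound, still go through format():

import sqlite3

def update_timestamp(db_conn, table_name, col_timestamp, date, user_id, url_review):
    # hypothetical helper: same statement as above, but with bound parameters (?)
    c = db_conn.cursor()
    c.execute(
        "UPDATE {0} SET {1} = ? WHERE user_id = ? AND url_review = ?".format(
            table_name, col_timestamp),
        (date, user_id, url_review))
    db_conn.commit()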
Example #15
 def crawler_trending(self, href):
     t=''.join([baseURL, '/trending?', href])
     print t
     try:
         req=urllib2.urlopen(t)
         result=req.read()
         soup=BeautifulSoup(result)
         for d in soup.div():
             if d.attrs.has_key('class') and 'leaderboard-list-content' in d.attrs['class']:
                 repos=Repository(d.a['href'])
                 self.userQueue.put(User('/'+repos.user))
     except urllib2.URLError as e:
         print e.reason
Example #16
def fetch_intern_dates(pageDict):
    interns = []
    try:
        html = urllib.request.urlopen(url=pageDict['url'])
    except urllib.error.HTTPError as e:
        print(e)
        print('this company has no internship pages')
        # empty list
        return interns
    soup = BeautifulSoup(html, 'lxml')

    # list of internship divs
    internDivs = soup.div(class_='ts-p-_internshipList-item-info')
    prefix = 'ts-p-_internshipList-item-info-row-'
    titleClassName = prefix + 'title'
    daysClassName = ' '.join(
        [prefix + 'detail-text', prefix + 'detail-text_day'])
    dateClassName = ' '.join(
        [prefix + 'detail-text', prefix + 'detail-text_place'])

    # list of deadline divs
    deadlineDivs = soup.div(
        class_='ts-p-_internshipList-item-entry js-p-entryItem-empty')
    deadlineClassName = 'ts-p-_internshipList-item-entry-deadline'

    for iDiv, dDiv in zip(internDivs, deadlineDivs):
        intern = {
            'company': pageDict['company'],
            'title': iDiv.div()[0].text,
            'days': iDiv.find_all('div', class_=daysClassName)[0].text,
            'date': iDiv.find_all('div', class_=dateClassName)[0].text
        }
        intern['deadline'] = re.sub(
            'エントリー締切:', '',
            dDiv.find_all('div', class_=deadlineClassName)[0].text)
        interns.append(intern)

    return interns
Example #17
def scrape_for_vine(query1,query2=""):
	
	url = "https://twitter.com/search/realtime?q=vine.co%2Fv%2F+%2B+"+query1+query2+"&src=typd"
	html = urllib2.urlopen(url).read()
	soup = BeautifulSoup(html)
	vine_url_array=[]
	vine_dict={}

	for instance in soup.find_all('span',{'class' : 'js-display-url'}):
		vine_url = instance.get_text()
		vine_url_array.append(vine_url)
		#print vine_url_array	
	
	for i in vine_url_array:
		i='http://'+i
		soupe = BeautifulSoup( urllib2.urlopen(i).read() )
		link = soupe.source['src']
		title = soupe.p.get_text()
		vine_dict[title]=link
		print soupe.div(attrs={'class': 'user'})[0].img['src']


	'''
Example #18
 def crawling_repos_contributors(self, repos, item):
     failure=True
     while failure:
         try:
             print baseURL+repos.href+item
             req=urllib2.urlopen(baseURL+repos.href+item)
             result=req.read()
             soup=BeautifulSoup(result)
             for d in soup.div():
                 if d.attrs.has_key('id') and d.attrs['id']=='contributors':
                     print d
             failure=False
         except urllib2.URLError as e:
             sys.stderr.write('%s when crawling %s' % (e, repos.href+item))
Example #19
    def parse_branch_name(self):
        self.branches=[]
        try:
#            print baseURL+self.target_repos.href
            req=urllib2.urlopen(baseURL+self.target_repos.href)
            result=req.read()
            soup=BeautifulSoup(result)
            for d in soup.div():
                if d.has_attr('class') and 'select-menu-list' in d.attrs['class'] and d.has_attr('data-tab-filter') and d['data-tab-filter']=='branches':
                    for item in d.div():
                        if item.has_attr('class') and 'select-menu-item' in item.attrs['class']:
                            branch=Branch(item.a['href'])
                            self.branches.append(branch)
                            self.logger.info('Branch %s' % branch.branch_name)
        except urllib2.HTTPError, e:
            print e
Example #20
 def crawling_repos_followers(self, repos, item):
     failure=True
     while failure:
         try:
             print baseURL+repos.href+item
             req=urllib2.urlopen(baseURL+repos.href+item)
             result=req.read()
             soup=BeautifulSoup(result)
             for d in soup.div():
                 if d.attrs.has_key('class') and 'follow-list-container' in d.attrs['class']:
                     user=User(d.a['href'])
                         #self.crawler_user(user)
                     self.userQueue.put(user)
                     self.logger.info('Repository:%s Lang:%s %s:%s' % (repos.href, repos.lang, item.split('/')[1], user.user))
             failure=False
         except urllib2.URLError as e:
             sys.stderr.write('%s when crawling %s' % (e, repos.href+item))
Example #21
def add_column_book_url(db_conn, alter_table=False):

    db_conn.row_factory = lambda cursor, row: row[0]

    c = db_conn.cursor()

    table_name = 'user_reviews'
    col_book = 'url_book'
    reviews_path = "/mnt/f90f82f4-c2c7-4e53-b6af-7acc6eb85058/crawling_data/goodreads_crawl/user_reviews/"

    # Create the column that holds the book URLs in the consumption table
    if alter_table:
        c.execute("ALTER TABLE {0} ADD COLUMN {1} {2}".format(
            table_name, col_book, 'TEXT'))

    c.execute("SELECT url_review FROM {0}".format(table_name))
    all_rows = c.fetchall()

    i = 0
    for url_review in all_rows:
        logging.info("Viendo fila {0} de {1}".format(i, len(all_rows)))
        i += 1

        with open(reviews_path + url_review + '.html', 'r') as fp:
            soup = BeautifulSoup(fp, 'html.parser')

        try:
            url_book = soup.div(class_='bookTitle')[0].get('href')
        except Exception as e:
            logging.info("URL DE LIBRO NO ENCONTRADO: {}".format(e))
            logging.info("Encontrado HTML conflictivo: {}".format(url_review))
            with open("non_user_reviews_htmls.txt", 'a+') as f:
                f.write("{0}\n".format(url_review))
            continue

        try:
            c.execute( "UPDATE {0} SET {1} = '{2}' WHERE url_review = '{3}'"\
             .format(table_name,
                 col_book,
                  url_book,
                  url_review))
        except sqlite3.IntegrityError:
            logging.info('ERROR ACTUALIZANDO VALORES: {0}'.format(url_review))

    db_conn.commit()
Example #22
    def parse_parent_info(self):
        # crawling the parent commit of current commit
        self.parent_sha_list=[]
        failure=True
        while failure:
            try:
                req=urllib2.urlopen(baseURL+self.href)
                result=req.read()
                soup=BeautifulSoup(result)
                for d in soup.div():
                    if d.has_attr('class') and 'commit-meta' in d['class'] and 'clearfix' in d['class']:
                        for s in d.findAll('span'):
                            for a in s.findAll('a'):
                                if a.has_attr('data-hotkey'):
                                    self.parent_sha_list.append(a['href'].strip().split('/')[-1])
#                            self.parent_sha=a['href'].strip().split('/')[-1]
                failure=False
            except urllib2.HTTPError, e:
                sys.stderr.write('%s when crawling %s\n' % (e, self.href))
Example #23
def load_to_db(item_links, month, year):
    for url in item_links:
        print('[FETCH] Loading an item from \'{0}\'...'.format(url))
        try:
            page = BeautifulSoup(urlopen(url).read(), 'html5lib')
            page.div(class_='StockCodeSrp')[0].strong.extract()
            try:
                dollars = float(page.div(class_='StockCodeSrp')[0].text.strip().replace('$', ''))
            except:
                continue
            if dollars > 8.00 or not page.div(class_='StockCodeDescription'):
                continue
            publisher = page.div(class_='StockCodePublisher')[0].text.strip()
            item = [
                page.div(class_='StockCodeDescription')[0].text,
                str(dollars),
                publisher[publisher.index('\xa0') + 1:],
                url,
                'http://previewsworld.com' +
                page.div(class_='StockCodeImage')[0].a.get('href')
                if page.div(class_='StockCodeImage')[0].a.get('href') else '',
                month,
                year,
            ]
            try:
                Preview.objects.create(
                    name=item[0],
                    dollars=item[1],
                    rubles=usd_to_rub(float(item[1])),
                    publisher=item[2],
                    src_url=item[3],
                    cover_url=item[4],
                    month=item[5],
                    year=item[6],
                )
            except IntegrityError:
                continue
        except IndexError:
            print('[FETCH] Error loading from \'{0}\''.format(url))
Example #24
def cnt_story():
    global _dir_path
    req = requests.get(url=target)
    # requests' default encoding does not match the site's response, so switch to the gbk the site uses
    print(req.encoding)
    req.encoding = 'gbk'
    # parse the html
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.div(id="list")
    #print('list_tag:', list_tag)
    # get the novel title
    story_title = list_tag[0].dl.dt.string
    # create a folder named after the novel if it does not exist yet
    _dir_path = save_path + '/' + story_title
    if not os.path.exists(_dir_path):
        os.path.join(save_path, story_title)
        os.mkdir(_dir_path)
    cnt = len(list_tag[0].dl.find_all('dd'))
    story_content = list_tag[0].dl.find_all('dd')
    print("章节数量:" + str(cnt))
    print(story_content)
    return story_content
Example #25
def DownloadBook(target):
    # local root path for saving the scraped text
    save_path = 'Book'
    # root URL of the biquge site
    global index_path
    req = requests.get(url=target)
    # requests' default encoding does not match the site's response, so switch to the gbk the site uses
    print(req.encoding)
    req.encoding = 'gbk'
    # parse the html
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.div(id="list")
    print('list_tag:', list_tag)
    # get the novel title
    story_title = list_tag[0].dl.dt.string
    # create a folder named after the novel if it does not exist yet
    dir_path = save_path + '/' + story_title
    if not os.path.exists(dir_path):
        os.path.join(save_path, story_title)
        os.mkdir(dir_path)
    # loop over every chapter, collecting its name and URL
    for dd_tag in list_tag[0].dl.find_all('dd'):
        # chapter name
        chapter_name = dd_tag.string
        # chapter URL
        chapter_url = index_path + dd_tag.a.get('href')
        # request the chapter page and scrape its body text
        chapter_req = requests.get(url=chapter_url)
        chapter_req.encoding = 'gbk'
        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
        # locate the tag that holds the body text
        content_tag = chapter_soup.div.find(id="content")
        # get the body text, replacing non-breaking spaces with newlines
        content_text = str(content_tag.text.replace('\xa0', '\n'))
        # write the chapter to a txt file named after it
        with open(dir_path + '/' + chapter_name + '.txt', 'w') as f:
            f.write('本文网址:' + chapter_url)
            f.write(content_text)
Example #26
def detail_page(response,pattern):
	""""获取岗位要求和岗位工作内容"""
	soup = BeautifulSoup(response,'lxml') #构造soup
	target_tag = soup.div(class_ = "pos-ul")#截取div class:"pos-ul"标签
	if target_tag:
		tag_str = target_tag[0].get_text()
		tag_str = pattern.sub('-',tag_str)
		job_describe = tag_str
		#print(job_describe)
		job_demand = ''
		# get the salary: max-min
		salary_str = soup.strong.text
		if "面议" in salary_str:
			salary_down = 0
			salary_up = 0
		elif "以上" in salary_str:
			salary_down = re.sub("\D","",salary_str)
			salary_up = 0
		elif "以下" in salary_str:
			salary_down = 0
			salary_up = re.sub("\D","",salary_str)
		else:
			salary_num = salary_str.find('-')
			salary_down = int(re.sub("\D","",salary_str[:salary_num]))
			salary_up = int(re.sub("\D","",salary_str[salary_num:]))
		job_detail = [job_describe,job_demand,salary_down,salary_up]
	else:
		job_detail = ['','',0,0]

	try:
		industry = soup.select('ul.promulgator-ul  a')[0].text
		job_detail.append(industry)
	except Exception as e:
		job_detail.append('未知')
		print(e)
	return job_detail
Example #27
def extract(dirname):
        processed_content = []
        with open("/home/liki/old/warehouse/complete_info") as f:
            lines = f.readlines()
        for i in range(0,len(lines)):
            processed_content.append(lines[i].split('\t')[0])
        f.close()
        os.chdir(dirname)
#	target = open("/home/liki/old/warehouse/complete_info",'a+')
#	html = BeautifulSoup(open(filename))
        for filename in os.listdir("."):
                print filename
                if filename in processed_content:
                    continue;
                else:
                    html = BeautifulSoup(open(filename))
                    target = open("/home/liki/old/warehouse/complete_info",'a+')
                    if len(html.find_all(attrs={"class": "dv-meta-info size-small"}))!=0:
                    #tabular html page
                            table = html.find("table")
                            row = table.findAll('tr')
                            #movie_name
                            movie_name = html.h1.contents[0].strip()
                            #Starring info
                            #	if html.find_all(attrs={"class": "dv-meta-info size-small"})[0].dt.string == "Starring:" :
                            #	starring = html.find_all(attrs={"class": "dv-meta-info size-small"})[0].dd.string.strip()
                            #else:
                            #	starring = "NULL_STARRING"
                            #include format/genre/releasetime
                            header = []
                            content = []
                            for chld in row:
                                    header.extend(chld.findAll('th'))
                                    content.extend(chld.findAll('td'))
                            for i in range(0,len(header)):
                                    header[i] = (''.join(x for x in (header[i].findAll(text = True)))).split(',')
                                    content[i] = (''.join(x for x in (content[i].findAll(text = True)))).split(',')
                                    for k in range(len(header[i])):
                                            header[i][k] = (header[i][k].strip('\n')).strip()
                                    for k in range(len(content[i])):
                                            content[i][k]  = (re.sub(r'\n','',content[i][k].strip('\n'))).strip()
                            tmp=[]
                            for elem in header:
                                    tmp.append(elem[0])
                            header = tmp

                            if ("Genres" in header):
                                    i = header.index("Genres")
                                    genres = ','.join(content[i])
                            else:
                                    genres = "NULL_GENRE"
                            if ("Director" in header):
                                    i = header.index("Director")
                                    director = ','.join(content[i])
                            else:
                                    director = "NULL_DIRECTOR"
                            if ("Starring" in header):
                                    i = header.index("Starring")
                                    starring = ','.join(content[i])
                            else:
                                    starring = "NULL_STARRING"
                            if ("Supporting actors" in header):
                                    i = header.index("Supporting actors")
                                    actor = ','.join(content[i])
                            else:
                                    actor = "NULL_actor"
                            if ("Format" in header):
                                    i = header.index("Format")
                                    movie_format = ','.join(content[i])
                            else:
                                    movie_format = "NULL_FORMAT"
                            if ("time" in header):
                                    time = ','.join(content[i])
                            else:
                                    time = "NULL_SHOWTIME"
            #	target.write(filename+'\t'+movie_name+'\t'+genres+'\t'+director+'\t'+starring+'\t'+actor+'\t'+movie_format+'\t'+time)
            #        target.write('\n')
            #	target.close()
            #	return header,content	
                    else:
                    ##normal kind of page
                            #movie name 
                            if len(html.div(id = "titleSection")) > 0:
                                    movie_name = html.div(id = "titleSection")[0].find(id = "productTitle").string
                                    #Format Info
                                    if len(html.div(id = "byline")) > 0 :
                                            index = len(html.div(id = "byline")[0].find_all("span"))-1
                                            if index > 0:
                                                    movie_format = html.div(id = "byline")[0].find_all("span")[index].string or u""
                                                    #lead actor
                                                    if len(html.div(id = "byline")[0].find_all("span")[0]("span")) > 0:
                                                            if ("Actor" in  html.div(id = "byline")[0].find_all("span")[0]("span")[1].string):
                                                                    starring = html.div(id = "byline")[0].find_all("span")[0].a.string or u"NULL_STARRING"
                                                            else:
                                                                    starring = "NULL_STARRING"
                                                    else:
                                                            starring = "NULL_STARRING"
                                            else:
                                                    movie_format = "NULL_FORMAT"
                                                    starring = "NULL_STARRING"
                                    else:
                                            movie_format = "NULL_FORMAT"
                                            starring = "NULL_STARRING"
                            else:        #other info
                                    movie_name = "NULL_MOVIENAMW"
                                    movie_format = "NULL_FORMAT"
                                    starring = "NULL_STARRING"
                            if html.find("div",{"id":"detail-bullets"}) != None:
                                    tag = html.find("div",{"id":"detail-bullets"}).findAll('li')
                                    header = []
                                    for elem in tag:
                                            header.append(elem.b.string)
                                    
                                    if "Actors:" in header:
                                            i = header.index("Actors:")
                                            tmp_1=tag[i].find_all("a")
                                            tmp = [elem.string for elem in tmp_1]
                                            actor = ",".join(tmp) or u"NULL_actor"
                                    else:
                                            actor = "NULL_actor"
                                    if "Directors:" in header:
                                            i = header.index("Directors:")
                                            tmp_1=tag[i].find_all("a")
                                            tmp = [elem.string for elem in tmp_1]
                                            if tmp != [None]:
                                                director = ",".join(tmp)
                                            else:
                                                director = "NULL_DIRECTOR"
                                    else:
                                            director = "NULL_DIRECTOR"
                                            #	for x in tag[i].find_all("a"):
                                            #		director = director + x.string + ","
                                    #tag[i].a.string 
                                    if ("DVD Release Date:" in header) or ("VHS Release Date:" in header):
                                            if ("DVD Release Date:" in header):
                                                i = header.index("DVD Release Date:")
                                            else:
                                                i = header.index("VHS Release Date:")
                                            tmp_1=str(tag[i])
                                            mm=re.compile('>(.*?)<',re.S)
                                            tmp = mm.findall(tmp_1)
                                            time = (tmp[len(tmp)-1]).strip() or u"NULL_SHOWTIME"
                                    else:
                                            time = "NULL_SHOWTIME"
                                    #if "VHS Release Date:" in header:
                                     #       i = header.index("VHS Release Date:")
                                      #      tmp_1=str(tag[i])
                                       #     mm=re.compile('>(.*?)<',re.S)
                                        #    tmp = mm.findall(tmp_1)
                                         #   time = (tmp[len(tmp)-1]).strip() or u"NULL_SHOWTIME"
                                    #else:
                                    #        time = "NULL_SHOWTIME"
                            else:
                                    actor = "NULL_actor"
                                    director = "NULL_DIRECTOR"
                                    time = "NULL_SHOWTIME"
                            #else:
                            #        movie_name = "NULL_MOVIENAMW"
                            #        movie_format = "NULL_FORMAT"
                            #        actor = "NULL_actor"
                            #        director = "NULL_DIRECTOR"
                            #        time = "NULL_SHOWTIME"
                            #        starring = "NULL_STARRING"
                            #        time = "NULL_SHOWTIME"
                            genres = "NULL_GENRE"
                    target.write(filename+'\t'+movie_name+'\t'+genres+'\t'+director+'\t'+starring+'\t'+actor+'\t'+movie_format+'\t'+time)
                    target.write('\n')
                    target.close()
Example #28
def get_url_actividades(url):
    print('GET URL ACTIVIDADES')
    print('###################')
    try:
        contents = urllib.request.urlopen(url).read()
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, "html5lib")
        print('CONTENIDOS:', contents)
        print('PAGINA:', page)

        # Title
        titulo = soup.title.string
        print('TITULO:', titulo)
        # print('PARSE:', soup.div(id='contenedor_central'))
        # print('PARSE:', soup.div(id='principal'))
        # print('PARSE:', soup.div(id='fecha_creditos'))

        # Activities
        actividades = soup.find_all(['a'],
                                    href=re.compile('actividad/idactividad'))
        #print('ACTIVIDADES:', actividades)
        for actividad in actividades:
            # activity ID
            link = actividad.get('href')
            idregistro = actividad.get('href').split('/')[3]
            # Activity
            titulo = actividad.get_text()
            print('ACTIVIDAD:', idregistro, titulo, link)

            # activity DIV
            actividad_completa = soup.find(['div'],
                                           idregistro=re.compile(idregistro))
            # Date
            fecha = "'Sólo disponible en los tres primeros cursos del listado'"
            print('FECHA:', fecha)
            # Type
            tipo = "'Sólo disponible en los tres primeros cursos del listado'"
            print('TIPO:', tipo)
            # Centre
            centro = actividad_completa.find(
                ['a'], href=re.compile('indice/idcentro')
            )  # actividad.find_all(['a'], href=re.compile('indice/idcentro'))
            centro = centro.get_text()
            print('CENTRO:', centro)

            # course web page
            get_url_curso(web_extension + link)

        print('PARSE0:',
              soup.find_all(['div'], attrs={"class": "lista_mas_actividades"}))

        exit(0)

        print('PARSE:', soup.div(id='actividad'))
        print('PARSE0:',
              soup.find_all(['div'], attrs={"class": "contenedor_actividad"}))
        print(
            'PARSE0:',
            soup.find_all(['div'], attrs={"class":
                                          'cabeceraDetalleActividad'}))
        print('PARSE0:',
              soup.find_all(['div'], attrs={"class": 'cajasActividad'}))
        #print('PARSE LIMPIO:', soup.prettify())

        return soup.title.string
    except Exception as e:
        return e
Example #29
if __name__=='__main__':
    # homepage of the novel to crawl; change this URL for each run and make sure the local save root exists
    target="https://www.biqubao.com/book/17570/"
    # local root path for saving the scraped text
    save_path = 'd:/'
    # root URL of the biquge site
    index_path='https://www.biqubao.com'

    req=requests.get(url=target)
    # requests' default encoding does not match the site's response, so switch to the gbk the site uses
    print(req.encoding)
    req.encoding = 'gbk'
    # parse the html
    soup=BeautifulSoup(req.text,"html.parser")
    list_tag=soup.div(id="list")
    print('list_tag:',list_tag)
    # get the novel title
    story_title=list_tag[0].dl.dt.string
    # create a folder named after the novel if it does not exist yet
    dir_path=save_path+'/'+story_title
    if not os.path.exists(dir_path):
        os.path.join(save_path,story_title)
        os.mkdir(dir_path)
    # loop over every chapter, collecting its name and URL
    for dd_tag in list_tag[0].dl.find_all('dd'):
        # chapter name
        chapter_name=dd_tag.string
        # chapter URL
        chapter_url=index_path+dd_tag.a.get('href')
        # request the chapter page and scrape its body text
Example #30
def appmain(request):
    youtubeUrl = random.choice(youtube_list_selectOne)
    zd = a = tdColor = character = luckyNum = zdName = ''
    t3 = 'https://www.youtube.com/'
    bgColor = bgColorList[bgColorNum[0]]
    totalNum = 0
    if request.method == "GET":
        form = PostForm(request.GET)
        if form.is_valid():
            zd = form.cleaned_data["zodiac"]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]
        if zd == '':
            zdTotal = zdLove = zdMoney = zdWork = ''
        elif zd == 'zodiac1':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/aries_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/aries'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/ohitsuji/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # get today's score
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac2':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/taurus_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/taurus'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/oushi/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # get today's score
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac3':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/gemini_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/gemini'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/hutago/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # get today's score
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac4':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/cancer_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/cancer'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/kani/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # get today's score
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac5':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/leo_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/leo'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/shishi/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # fetch today's score block
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac6':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/virgo_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/virgo'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/otome/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # fetch today's score block
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac7':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/libra_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/libra'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/tenbin/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # fetch today's score block
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac8':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/scorpio_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/scorpio'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/sasori/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # fetch today's score block
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac9':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/sagittarius_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/sagittarius'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/ite/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # fetch today's score block
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac10':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/capricorn_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/capricorn'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/yagi/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # fetch today's score block
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac11':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/aquarius_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/aquarius'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/mizugame/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # fetch today's score block
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        elif zd == 'zodiac12':
            character = 'https://www.vogue.co.jp/assets/commons/img/horoscope/daily/pisces_banner.jpg'
            zdName = zdNameDic[zd]
            url1 = 'https://fortune.yahoo.co.jp/12astro/pisces'
            req1 = urllib.request.Request(url1)
            response1 = urllib.request.urlopen(req1)
            html1 = response1.read()
            soup1 = BeautifulSoup(html1, "lxml")
            images = soup1.find_all('img')
            for img in images:
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_tot'
                ):
                    zdTotalNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdTotal = fortuneList[int(zdTotalNum)]

                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_lov'
                ):
                    zdLoveNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdLove = fortuneList[int(zdLoveNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_mny'
                ):
                    zdMoneyNum = int(re.search(r'\d+',
                                               img['src']).group()) / 10
                    zdMoney = fortuneList[int(zdMoneyNum)]
                if img['src'].startswith(
                        'https://s.yimg.jp/images/fortune/images/common/yftn_param_wrk'
                ):
                    zdWorkNum = int(re.search(r'\d+', img['src']).group()) / 10
                    zdWork = fortuneList[int(zdWorkNum)]

            url2 = 'https://uranai.nifty.com/f12seiza/uo/'
            req2 = urllib.request.Request(url2)
            response2 = urllib.request.urlopen(req2)
            html2 = response2.read()
            soup2 = BeautifulSoup(html2, "lxml")
            a = str(soup2.div(class_='hako'))  # fetch today's score block
            totalNum = int(re.search(r'\d+', a).group())  # extract the score
            bgColor = bgColorList[bgColorNum[totalNum % 6]]
            tdColor = bgColorNum[totalNum % 6]
            luckyNum = totalNum % 9 + 1
            if totalNum < 50:
                youtubeUrl = youtube_list_selectOne[
                    (luckyNum + 123) % len(youtube_list_selectOne)]
            else:
                youtubeUrl = youtube_list_selectTwo[
                    (luckyNum + 123) % len(youtube_list_selectTwo)]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]

        return render(
            request, 'demo/appname.html', {
                'url': t3,
                'zdTotal': zdTotal,
                'zdLove': zdLove,
                'zdMoney': zdMoney,
                'zdWork': zdWork,
                'bgColor': bgColor,
                'tdColor': tdColor,
                'character': character,
                'luckyNum': luckyNum,
                'zdName': zdName
            })
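# A possible refactor (hedged sketch, not the author's implementation): the
# branches above differ only in the sign slugs used on each site, so the
# per-sign data can be table-driven. ZODIAC_SOURCES and fetch_zodiac_scores
# are hypothetical names; fortuneList, bgColorList, bgColorNum and the two
# YouTube lists from the view above would still be used by the caller to turn
# the returned numbers into template context, exactly as in the branches.
import re
import urllib.request
from bs4 import BeautifulSoup

ZODIAC_SOURCES = {
    # zd key: (English sign name used by Yahoo and the Vogue banner, nifty slug)
    'zodiac4': ('cancer', 'kani'),
    'zodiac5': ('leo', 'shishi'),
    'zodiac6': ('virgo', 'otome'),
    'zodiac7': ('libra', 'tenbin'),
    'zodiac8': ('scorpio', 'sasori'),
    'zodiac9': ('sagittarius', 'ite'),
    'zodiac10': ('capricorn', 'yagi'),
    'zodiac11': ('aquarius', 'mizugame'),
    'zodiac12': ('pisces', 'uo'),
    # ...extend with the remaining signs handled earlier in this view
}

PARAM_PREFIX = 'https://s.yimg.jp/images/fortune/images/common/yftn_param_'


def fetch_zodiac_scores(zd):
    """Return (banner_url, Yahoo 0-10 scores, nifty total score) for one sign."""
    sign, nifty_slug = ZODIAC_SOURCES[zd]
    banner = ('https://www.vogue.co.jp/assets/commons/img/horoscope/daily/'
              + sign + '_banner.jpg')

    # Yahoo: the tot/lov/mny/wrk parameter images encode the four scores.
    html1 = urllib.request.urlopen(
        'https://fortune.yahoo.co.jp/12astro/' + sign).read()
    soup1 = BeautifulSoup(html1, 'lxml')
    scores = {}
    for img in soup1.find_all('img'):
        src = img.get('src', '')
        for key in ('tot', 'lov', 'mny', 'wrk'):
            if src.startswith(PARAM_PREFIX + key):
                scores[key] = int(re.search(r'\d+', src).group()) // 10

    # nifty: today's score sits in the div with class "hako".
    html2 = urllib.request.urlopen(
        'https://uranai.nifty.com/f12seiza/' + nifty_slug + '/').read()
    soup2 = BeautifulSoup(html2, 'lxml')
    total_num = int(re.search(r'\d+', str(soup2.div(class_='hako'))).group())

    return banner, scores, total_num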
Ejemplo n.º 31
0
def create_user_reviews_table(path_jsons, db_conn):
    """
	Recibe direccion de directorio de documentos JSON 
	y el objeto de la conexión de la BD 
	"""

    c = db_conn.cursor()

    # Create the table in the DB: user_reviews(user_id, url_review, rating)
    table_name = 'user_reviews'
    col_user_id = 'user_id'
    col_url = 'url_review'
    col_rating = 'rating'

    c.execute( 'CREATE TABLE IF NOT EXISTS {0} ({1} {2}, {3} {4} PRIMARY KEY, {5} {6})'\
    .format(table_name, \
        col_user_id, 'INTEGER', \
        col_url, 'TEXT', \
        col_rating, 'INTEGER') )

    # List the contents of the <path_jsons>/ directory
    json_titles = [
        f for f in listdir(path_jsons) if isfile(join(path_jsons, f))
    ]

    for i in range(0, len(json_titles)):

        with open(path_jsons + json_titles[i], 'r') as f:
            # Load the full contents of the document
            data_json = json.load(f)

        for j in range(0, len(data_json)):
            # Save the tweet text
            tweet = data_json[j]['text']

            # Save the URL of the user's review on GR
            try:
                url_review = data_json[j]['entities']['urls'][-1][
                    'expanded_url']
                url_review = unshorten_url(url_review)
            except Exception as e:
                logging.info("¡Tweet con contenido NO predefinido!")
                continue

            # Save the user's Twitter username
            screen_name = data_json[j]['user']['screen_name']

            # Save the user's Twitter ID
            user_id = data_json[j]['user']['id']

            logging.info(
                "Fetching HTML for tweet {1}/{2}. User: {0}, {3}/{4}.".
                format(screen_name, j, len(data_json), i, len(json_titles)))

            # Save the HTML crawled from url_review to disk
            file_name = url_review.split('/')[
                -1]  # Cut after the last '/' of the URL
            file_name = file_name.split('?')[
                0]  # Cut after the first '?' of the URI
            save_path = "/mnt/f90f82f4-c2c7-4e53-b6af-7acc6eb85058/crawling_data/goodreads_crawl/user_reviews/" + file_name + ".html"

            # Try to access the URL.
            # If it is not reachable, or it is not a GR review path,
            # move on to the next tweet
            if "goodreads.com/review" in url_review:
                try:
                    urllib.request.urlretrieve(url_review, save_path)
                except Exception as e:
                    logging.info("No se pudo ingresar al sitio!")
                    continue
            else:
                logging.info("Enlace no es ruta de review de GR")
                continue

            # Open the freshly saved HTML to capture the rating
            with open(save_path) as fp:
                soup = BeautifulSoup(fp, 'html.parser')

            # Save the rating.
            # Sometimes GR does not render the HTML that contains the rating (why? dunno),
            # but the rating does appear in the tweet ("1 out of 5 stars to [...]")..
            # ..in those cases a regex captures the rating from the tweet text.
            # If everything fails the rating is stored as 0, only indicating that the user
            # consumed that item (on the assumption that if the review URL appears in the
            # tweet, the item was consumed)
            try:
                rating = int(
                    soup.div(class_='rating')[0].find_all(
                        'span', class_='value-title')[0]['title'])
            # If no rating is found on the review page (because the review cannot be found
            # or there are no stars where the rating should be)...
            except Exception as e:
                try:
                    # ..capture it with a regex from the tweet
                    match = re.search(r"(\d+) of (\d+) stars", tweet.lower())
                    rating = int(match.group(1))
                    if rating > 5 or rating < 0:
                        rating = 0
                except Exception as er:
                    rating = 0

            # Insert the tuple (user_id, url_review, rating) into the DB
            try:
                c.execute( "INSERT INTO {0} ({1}, {2}, {3}) VALUES (?, ?, ?)" \
                 .format(table_name, col_user_id, col_url  , col_rating), \
                            (user_id    , file_name, rating) )
            except sqlite3.IntegrityError:
                logging.info(
                    'ERROR: review URI already exists: {}'.format(file_name))

        # Commit the changes after going through all of each user's tweets
        db_conn.commit()
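# Hedged usage sketch for create_user_reviews_table(): it expects a directory
# of tweet JSON dumps plus an open sqlite3 connection, and it concatenates the
# directory path with each file name, so the trailing '/' matters. The file
# and directory names below are placeholders, and the helpers the function
# relies on (unshorten_url, logging setup, the json/os imports) are assumed to
# be defined earlier in this example.
import sqlite3

if __name__ == '__main__':
    conn = sqlite3.connect('user_reviews.db')  # placeholder DB file
    try:
        create_user_reviews_table('tweets_json/', conn)  # placeholder directory
    finally:
        conn.close()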
Ejemplo n.º 32
0
from bs4 import BeautifulSoup
import requests

html = requests.get('https://ip.cn/').content
print(html)
soup = BeautifulSoup(html, 'lxml', from_encoding='utf-8')
result = soup.div(id='result')[0].p.code.get_text()
print(result)
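# Hedged defensive variant of the lookup above: soup.div(id='result') returns
# a ResultSet that may be empty if ip.cn changes its layout, so check before
# indexing into it.
result_divs = soup.div(id='result')
if result_divs and result_divs[0].p and result_divs[0].p.code:
    print(result_divs[0].p.code.get_text())
else:
    print('result block not found')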
Ejemplo n.º 33
0
if __name__ == '__main__':
    # Home page of the novel to crawl; change this URL for each run and make sure the local save root path exists
    target = "https://www.biqubao.com/book/9062/"
    # Local root path for saving the crawled text
    save_path = 'F:/P'
    # Root URL of the biqubao site
    index_path = 'https://www.biqubao.com'

    req = requests.get(url=target)
    # requests' default encoding does not match the site's response, so switch to the gbk encoding the site uses
    print(req.encoding)
    req.encoding = 'gbk'
    # Parse the html
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.div(id="list")
    print('list_tag:', list_tag)
    # Get the novel title
    story_title = list_tag[0].dl.dt.string
    # Create a folder named after the novel if it does not already exist
    dir_path = save_path + '/' + story_title
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    # Loop over every chapter to get its name and the URL it points to

    for dd_tag in list_tag[0].dl.find_all('dd'):
        # Chapter name
        chapter_name = dd_tag.string

        # Chapter URL
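        # Hedged continuation sketch; the original example is truncated here.
        # It mirrors the similar biqubao example later in this document:
        # build the absolute chapter URL and fetch it with gbk encoding.
        chapter_url = index_path + dd_tag.a.get('href')
        chapter_req = requests.get(url=chapter_url)
        chapter_req.encoding = 'gbk'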
Ejemplo n.º 34
0
def crawling_branch(branch, baseURL, local_repos_dir):
    sys.stderr.write('%s %s %s\n' % (branch.branch_name, baseURL, local_repos_dir))
#    logging.basicConfig(filename='crawler-threadpool.log', level = logging.DEBUG, format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s')
    logger=logging.getLogger('-'.join(['Branch', branch.branch_name]))

    if os.path.isdir(os.path.join(local_repos_dir, 'branches', branch.branch_name.replace('/', '~'))):
        os.system(' '.join(['rm', '-rf', os.path.join(local_repos_dir, 'branches', branch.branch_name.replace('/', '~'))]))
    sys.stderr.write('Start parsing %s\n' % branch.branch_name)
    os.mkdir(os.path.join(local_repos_dir, 'branches', branch.branch_name.replace('/', '~')))
    os.system(' '.join(['git', 'clone', '-b', branch.branch_name, baseURL+branch.repos.href, os.path.join(local_repos_dir, 'branches', branch.branch_name.replace('/', '~'))]))
    N=test_last_page(baseURL+branch.commit_url)
    fp=open(os.path.join(local_repos_dir, 'logs', branch.branch_name.replace('/', '~')), 'w')
    logger.info('Total pages:%s' % N)
    visit_commit_set=set()
    for i in range(N, 0, -1):
        sys.stderr.write('Branch:%s\tPage:%s\n' % (branch, i))
        failure=True
        while failure:
            try:
                req=urllib2.urlopen(baseURL+branch.commit_url+'?page='+str(i))
                result=req.read()
                soup=BeautifulSoup(result)
                commit_list=[]
                for d in soup.div():
                    if d.has_attr('class') and 'js-navigation-container' in d.attrs['class']:
                        h3_list=d.findAll('h3')
                        ol_list=d.findAll('ol')
                        if len(h3_list)==len(ol_list):
                            for index in range(len(h3_list)):
                                h3_date=datetime.datetime.strptime(h3_list[index].string, '%b %d, %Y').date()
                                for li in ol_list[index].findAll('li'):
                                    for c_a in li.p.findAll('a', {'class':'message'}):
                                        commit=Commit(c_a['href'], h3_date)
                                        if commit.commit_sha not in visit_commit_set:
#                                        sys.stderr.write('Parent info %s\n' % '\t'.join(commit.parent_sha_list))
                                            commit_list.append(commit)
                                            visit_commit_set.add(commit.commit_sha)
#                                        if i==N:
#   #                                       tracing_parent=[commit]
#    #                                       while len(tracing_parent)>0:
#    #                                           com=tracing_parent.pop()
#    #                                           com.parse_parent_info()
#    #                                           for parent_sha in com.parent_sha_list:
#    ##                                               print 'Deep parent %s' % parent_sha
#    #                                               if parent_sha not in visit_commit_set:
#    #                                                   parent_commit=Commit(os.path.join(branch.repos.href, 'commit', parent_sha), datetime.datetime(2000, 1, 1))
#    ##                                                   parent_commit.parse_parent_info()
#    #                                                   commit_list.append(parent_commit)
#    #                                                   visit_commit_set.add(parent_sha)
#    #                                                   tracing_parent.append(parent_commit)
                        else:
                            print 'Error! h3 and ol do not match!'
                commit_list.reverse()
                for commit in commit_list:
                    commit.parse_parent_info()
                    fp.write('%s %s %s %s\n' % (branch.branch_name, commit.commit_sha, commit.commit_date.strftime('%m/%d/%Y'), '\t'.join(commit.parent_sha_list)))
                    logger.info('Commit:%s (%s) in Branch:%s Parent:%s' % (commit.commit_sha, commit.commit_date.strftime('%m/%d/%Y'), branch.branch_name, '\t'.join(commit.parent_sha_list)))
    #                if not os.path.isdir(os.path.join(local_repos_dir, 'previous_commits', commit.commit_sha)):
    #                    os.mkdir(os.path.join(local_repos_dir, 'previous_commits', commit.commit_sha))
    #                    clone_commit(commit, os.path.join(local_repos_dir, 'branches', branch.branch_name.replace('/', '~')))
                failure=False
            except urllib2.HTTPError, e:
                print e, baseURL+branch.commit_url+'?page='+str(i)
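# Note: the example above is Python 2 (urllib2, print statements, old except
# syntax). A minimal Python 3 sketch of just its fetch-with-retry step could
# look like this; fetch_commit_page is a hypothetical helper and page_url
# stands for baseURL + branch.commit_url.
import urllib.error
import urllib.request
from bs4 import BeautifulSoup

def fetch_commit_page(page_url, page):
    """Retry one commit-listing page until it loads, then return its soup."""
    while True:
        try:
            html = urllib.request.urlopen(page_url + '?page=' + str(page)).read()
            return BeautifulSoup(html, 'html.parser')
        except urllib.error.HTTPError as e:
            print(e, page_url + '?page=' + str(page))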
Ejemplo n.º 35
0
import requests
from bs4 import BeautifulSoup
import os

# Chapter-list page of the novel and the site's main page
target = 'https://www.biqubao.com/book/13991/'
server = 'https://www.biqubao.com'

# Mind the encoding conversion
req = requests.get(url=target)
req.encoding = 'gbk'
html = req.text
# Locate the chapter list
div = BeautifulSoup(html, "html.parser")
list_tag = div.div(id='list')
# Novel title
title = list_tag[0].dl.dt.string
# Target folder
save_path = 'F:/Python/novel/new'
dir_path = save_path + '/' + title
if not os.path.exists(dir_path):
    os.mkdir(dir_path)

for dd_tag in list_tag[0].dl.find_all('dd'):
    # Chapter name
    chapter_name = dd_tag.string
    # Chapter URL
    chapter_url = server + dd_tag.a.get('href')
    c_req = requests.get(url=chapter_url)
    c_req.encoding = 'gbk'
Ejemplo n.º 36
0
def parse(county_page_html):
    soup = BeautifulSoup(county_page_html, 'html5lib')
    block = soup.div(class_='zsg-lg-1-2 zsg-sm-1-1')[0]
    lis = block.find_all("a")
    return [k.text for k in lis]
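# Hedged usage sketch for parse(): fetch one county listing page and feed its
# HTML in; the function returns the link texts inside the block with the
# 'zsg-lg-1-2 zsg-sm-1-1' classes. The URL below is a placeholder, and
# html5lib must be installed for the parser used above.
import requests

county_html = requests.get('https://example.com/county-page').text  # placeholder URL
print(parse(county_html))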
Ejemplo n.º 37
0
# Import modules
import urllib2
from bs4 import BeautifulSoup
import sqlite3

# Parse box office results
html = urllib2.urlopen("http://www.boxofficemojo.com/weekend/chart/").read()
rawpage = BeautifulSoup(html)
rawbody = rawpage.div(id="body")[0].findAll('tr')[4]
rawtable = rawbody.findAll('tr')[1:44] # Range will change every weekend

rawdata = [x.findAll('td') for x in rawtable]
tempdata = [str(i.string) for x in rawdata for i in x]
findata = [tempdata[x: x+12] for x in range(0, len(tempdata), 12)]

# Create SQL table and enter data
conn = sqlite3.connect("test.db")
c = conn.cursor()

# All data saved as text due to unpredictable use of '-' symbols
c.execute('''create table boxoffice
(tw text, lw text, title text, studio text, weekend_gross text, 
gross_change text, headcount text, headcount_change text,
average text, total_gross text, budget text, week text)
''')

for i in range(len(findata)):
    c.execute('''insert into boxoffice values 
    (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', findata[i])

# Query results and print first row
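# Hedged sketch of the step announced by the comment above: persist the
# inserts, then read back and print the first stored row.
conn.commit()
c.execute("select * from boxoffice limit 1")
print(c.fetchone())
conn.close()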
Ejemplo n.º 38
0
		function2(link1)
		for links in soup4.find_all('a'):
			link2 = "http://www.gsmarena.com/" + links['href']
			if link2 == link1:
				pass
			else:
				link5 = "http://www.gsmarena.com/" + links['href']
				print link5
				function2(link5)
		print "-----------------------------------------------------------------"


def function2(phn_links):
	'''return all the phone links of the page ie http://www.gsmarena.com/amazon-phones-76.php'''
	#phn_links = "http://www.gsmarena.com/amazon-phones-76.php"
	phn_links_page = requests.get(phn_links)
	phn_soup = BeautifulSoup(phn_links_page.text)
	phn_soup2 = BeautifulSoup(str(phn_soup.div(class_="makers")))
	for link in phn_soup2.find_all('a'):
		link = "http://www.gsmarena.com/" + link['href']
		function3(link)

link = "http://www.gsmarena.com/makers.php3" #start link to scrap gsm contains all the phone maker company
data = requests.get(link)
soup = BeautifulSoup(data.text)
soup2= BeautifulSoup(str(soup.div(id="main")))
lis = soup2.find_all('a')
for i in range(0,len(lis),2):
	link2 = "http://www.gsmarena.com/"+ lis[i]['href']
	function1(link2)