Code Example #1
File: scraper.py Project: slarrain/MachineLearning
def data_per_case (single_case):
    year = single_case['citation']['year']
    docket = single_case['docket_number']
    facts = single_case['facts_of_the_case']
    question = single_case['question']

    f = bs4(facts).text.replace('\n', '').replace('\xa0', '')
    q = bs4(question).text.replace('\n', '').replace('\xa0', '')

    return [year, docket, f, q]
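
Throughout the examples on this page, bs4 is called directly on HTML markup, so it is presumably an alias for the BeautifulSoup class rather than the bs4 module itself. A minimal sketch of the assumed import and a trivial call (the alias and the sample markup are assumptions for illustration, not taken from any of the projects shown here):

from bs4 import BeautifulSoup as bs4

html = "<html><body><p>Hello, world</p></body></html>"
soup = bs4(html, "html.parser")  # parse the markup with the stdlib parser
print(soup.p.text)               # prints: Hello, world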
Code Example #2
File: scrapelr.py Project: tled/dumplr
 def __SetupPage(self):
     """Inits the bs4 property"""
     page = bs4(self.__FetchPage(self.url),HTML_PARSER) 
     # put iframes into the page object, some blog themes require this
     for frame in page.find_all('iframe'):
         try:
             frame_content = bs4(self.__FetchPage(frame['src']),HTML_PARSER)
             frame.replace_with(frame_content.body)
         except KeyError:
             pass
     self.page = page
Code Example #3
File: scrape.py Project: bensw/etrescrape
def main():

	now = str(datetime.datetime.now())
	conn = sqlite3.connect(DIR + "scrape.db")
	conn.row_factory=sqlite3.Row
	
	c = conn.cursor()
	changes = {}
	new_beers = []
	items = []

	# Add beers to items from each url
	for url in URLS:
		soup = bs4(urllib2.urlopen(url).read())
		print "Found %s Items..." % (total_items(soup))
		items += get_items(soup)

		# See if there are multiple pages		
		page = 2
		while (int(total_items(soup)) > len(items)):
			items += get_items(bs4(urllib2.urlopen(url + "&sort=20a&page=%d" % page ).read()))
			page += 1

	# Loop over beers found
	for item in items:
		# See if the beer exists in the database
		entry = c.execute("SELECT * FROM beers WHERE name = ?", [item['name']]).fetchall()
		if (len(entry) == 0): # If it doesn't insert it into the data base 
			c.execute("INSERT INTO beers (last_updated, name, qty, price, etreId) VALUES (?, ?, ?, ?, ?)", [now, item['name'], item['qty'], item['price'], item['etreId']])
			new_beers.append({"name":item['name'], "qty":item['qty'], "price":item['price'], "etreId":item['etreId']})
			# print "New beer found! name: %s qty: %d price: %f" % (item['name'], item['qty'], item['price'])
		elif (len(entry) == 1): # If it does exist
			e = entry[0]
			# Loop over the keys that are important (not id, time, etreId)
			#print e.keys()
			for key in e.keys()[1:-2]: 
				if e[key] != item[key]:
					if item['name'] in changes.keys():
						changes[item['name']][key] = [str(e[key]), str(item[key])]
					else:
						changes[item['name']] = {key:[str(e[key]), str(item[key])], 'etreId': item['etreId']}
			c.execute("UPDATE beers SET name=?, qty=?, price=?, last_updated=?, etreId=? WHERE id = ?", [item['name'], item['qty'], item['price'], now, item['etreId'], entry[0][0]])
		

	# Rendering
	#print changes, new_beers
	render(changes, new_beers)

	# Commit and close the db cursor
	conn.commit()
	conn.close()
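
Many of the examples here, including Example #3 above, call bs4(...) without naming a parser, which leaves BeautifulSoup to pick whichever parser happens to be installed and, in recent versions, to emit a "no parser was explicitly specified" warning. A minimal sketch of the same fetch-and-parse step with the parser named explicitly; the URL and variable names are illustrative only:

import requests
from bs4 import BeautifulSoup as bs4

resp = requests.get("http://example.com")  # placeholder URL
# Naming the parser ("html.parser", "lxml", or "html5lib") keeps the
# parse tree consistent across machines and silences the warning.
soup = bs4(resp.text, "html.parser")
print(soup.title.text if soup.title else "no <title> found")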
Code Example #4
File: scriptunio.py Project: jotacor/tradunio
def get_new_players():
    ''' Gets the league's signings and sales '''
    session = requests.session()
    session.get('http://stats.comunio.es/transfers.php', headers=headers)
    soup = bs4(session.get('http://stats.comunio.es/transfers.php', headers=headers).content)
    new_members = True
    for table in soup.find_all('table', {'class': 'rangliste'}):
        if new_members:
            for row in table.find_all('tr', re.compile(r"r[1-2]"))[1:]:
                nuna = re.search("([0-9]+)-(.*)", row.find('a', {'class': 'nowrap'})['href'])
                number = nuna.group(1)
                name = nuna.group(2).replace("+", " ").strip()
                club = row.find('td', {'class': 'clubPic'}).a['href']
                club_id = re.search("([0-9]+)-(.*)", club).group(1)
                club_name = re.search("([0-9]+)-(.*)", club).group(2).replace("+", " ")
                position = _position_translation(row.contents[6].text)
                db.commit_query('INSERT IGNORE INTO players (idp, name, position, idcl) VALUES (%s, "%s", %s, %s)' % (
                number, name, position, club_id))
                get_all_prices(name, incremental=True)
                print 'Alta jugador %s (%s) en el club %s (%s) como %s (%s)' % (
                name, number, club_name, club_id, row.contents[6].text, position)
            new_members = False
        else:
            for row in table.find_all('tr', re.compile(r"r[1-2]"))[1:]:
                nuna = re.search("([0-9]+)-(.*)", row.find('a', {'class': 'nowrap'})['href'])
                number = nuna.group(1)
                name = nuna.group(2).replace("+", " ").strip()
                club = row.find('td', {'class': 'clubPic'}).a['href']
                club_id = re.search("([0-9]+)-(.*)", club).group(1)
                club_name = re.search("([0-9]+)-(.*)", club).group(2).replace("+", " ")
                db.commit_query('UPDATE players SET idcl=NULL WHERE idp=%s' % (number))
                print 'Baja jugador %s (%s) del club %s (%s)' % (name, number, club_name, club_id)
Code Example #5
 def __format_bidding(self, bidding):
     """Convert bidding data to properly formatted HTML table."""
     log.getLogger('b_format').debug('formatting bidding: %s', bidding)
     bid_match = re.compile(r'(\d)([SHDCN])')
     html_output = bs4('<table>', 'lxml')
     header_row = html_output.new_tag('tr')
     html_output.table.append(header_row)
     for direction in self.__directions:
         header_cell = html_output.new_tag('th')
         header_cell.string = direction
         header_row.append(header_cell)
     for bid_round in bidding:
         bidding_row = html_output.new_tag('tr')
         html_output.table.append(bidding_row)
         for bid in bid_round:
             bid_cell = html_output.new_tag('td')
             call_match = re.match(bid_match, bid)
             if call_match:
                 bid_cell.append(call_match.group(1))
                 bid_icon = html_output.new_tag(
                     'img', src='images/' + call_match.group(2) + '.gif')
                 bid_cell.append(bid_icon)
             else:
                 if bid == 'SkipBid':
                     bid = '( - )'
                 bid_cell.append(bid)
             bidding_row.append(bid_cell)
         log.getLogger('b_format').debug('%5s' * 4, *bid_round)
     return html_output.table.prettify()
Code Example #6
    def query(self, query):
        result = {
            'result': [],
        }

        http = ModuleManager.call_module_method(
            'http_lib',
            'get',
            'https://encrypted.google.com/search?q=%s&num=10000' % quote(query)
        )

        if not 'html' in http:
            result['error'] = 'No server responce'
            return result

        soup = bs4(http['html'])

        for title in soup.find_all('h3', attrs={'class': 'r'}):
            element = title.find('a')
            # parse google url
            g_url = urlparse(element['href'])
            g_args = parse_qs(g_url.query)
            url = g_args['q'][0]

            result['result'].append({
                'title': element.text,
                'url': url,
            })

        return result
Code Example #7
	def post(self, request):
		symbols = request.POST['symbols'].split(',')
		for value in range(len(symbols)):
			symbols[value] = symbols[value].strip()
		roc = []
		pe_ratio = []
		for value in symbols:
			value = value.upper()
			# result = r.get(self.lookup_url + value).json()
			url = r.get("http://174.129.18.141/companies/" + value + "/pe_ratio")
			soup = bs4(url.text)
			pe_text = soup.select('span#pgNameVal')
			pe_text = pe_text[0].text
			pe_text = re.split('\s+', pe_text)
			for i in range(1):
				pe_ratio.append(float(pe_text[i]))
			url = r.get("http://www.gurufocus.com/term/ROC_JOEL/" + value + "/Return%252Bon%252BCapital%252B%252B-%252BJoel%252BGreenblatt/")
			soup = bs4(url.text)
			for div in soup.select('.data_value'):
				roc.append(float(div.get_text()[:-19]))
		magic_dict = {}
		counter = -1
		for value in symbols:
			counter+=1
			magic_dict[value] = {"magic number":roc[counter]-pe_ratio[counter]}
		print(magic_dict)
		return JsonResponse({'magic_dict':magic_dict})
Code Example #8
File: kou.py Project: madhatter1605/fun
    def do_grades(self, ln):
        if self.is_login:
            try:
                self.response['grades'] = self.session.get(GRADE_URL)
                current_term = bs4(self.response['grades'].text, 'lxml') \
                    .find('option', selected=True)['value']

                self.request['Donem'] = current_term
                self.request['Ara'] = 'Listele'
                self.response['grades'] = self.session.post(GRADE_URL,
                                                            data=self.request)

                table = bs4(self.response['grades'].text, 'lxml') \
                    .find_all('table',
                              'table table-bordered '
                              'table-condensed')[0]

                parsed_table = table.findChildren(['th', 'tr'])
                out_table = PrettyTable(
                    ['DERS', 'AKTS', 'VIZE', 'FIN', 'BÜT', 'BN'])

                for i in range(1, len(parsed_table)):
                    a = [str(n.text).strip()
                         for n in parsed_table[i].findChildren('td')]
                    a = [a[2].split('(')[0]] + a[3:8]
                    out_table.add_row(a)
                print(out_table)
            except:
                print(colored(">>>> Notlar işlenirken hata oluştu", 'red'))
        else:
            self.do_login(ln)
            self.do_grades(ln)
Code Example #9
File: module_steamsale.py Project: jmbjr/pyfibot
def command_price(bot, user, channel, args):
    """.price [Steam game name] - Find whether a currently on-sale Steam game has ever been on sale for cheaper"""
    search = args.replace(" ","+")
#     req = urllib2.Request("http://store.steampowered.com/search/?term=%s&category1=998" % search, headers={"User-Agent":"Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
#     db = bs4(urllib2.urlopen(req).read())
    db = bs4(urllib.urlopen("http://store.steampowered.com/search/?term=%s&category1=998" % search))
    row = db.find(class_ = "search_result_row")
    appid = row['data-ds-appid']
    xml = requests.get("http://steamsales.rhekua.com/xml/sales/app_%s.xml" % appid)
    pricehist = bs4(xml.text)
    
    name = row.find("span", {'class': 'title'}).string
    price = row.find(class_ = "search_price").text
    price = price[price.rfind('$'):]

    current = float(price[1:])
    
    lowest = current
    date = "never"
    for entry in pricehist.find_all('set'):
        price = float(entry['value'])
        if price < lowest:
            lowest = price
            date = entry['name']
        elif price == lowest:
            if not date == "never":
                date = entry['name']
    
    if lowest == current:
        bot.say(channel, name + " has never been cheaper than the $" + str(current) + " it is right now!")
    else:
        bot.say(channel, name + " is $" + str(current) + " now, but was $" + str(lowest) + " on " + date)
Code Example #10
File: crawl.py Project: pakoo/shantie
def get_all_city_info():
    r = requests.get('http://www.pm25.in/rank')
    soup = bs4(r.content)
    citys = soup.find_all('tr')[1:]
    today = datetime.datetime.now()
    now = datetime.datetime(today.year,today.month,today.day,today.hour)
    for c in citys:
        td = c.find_all('td')
        print 
        data = {'name':td[1].a.string,'name_py':td[1].a['href'][1:]}
        data['rank'] = td[0].string
        data['AQI'] = td[2].string
        data['level'] = td[3].string
        data['PM25'] = td[5].string
        data['PM10'] = td[6].string
        data['CO'] = td[7].string
        data['SO2'] = td[11].string
        data['time'] = now
        print data
        mdb.con['air'].pmcn.update(
                    {'name':data['name']},
                    {'$set':{
                    'AQI':data['AQI'],
                    'PM25':data['PM25'],
                    'PM10':data['PM10'],
                    'CO':data['CO'],
                    'SO2':data['SO2'],
                    'rank':data['rank'],
                    'level':data['level'],
                    'time':data['time']
                    }
                    },
                    upsert=True)
Code Example #11
def get_image_link_and_title():
    try:
        page = urllib2.urlopen(
            "http://photography.nationalgeographic.com/photography/photo-of-the-day/")
    except urllib2.URLError:
        # Looks like it didn't work, just return from the function 
        # and try again at the next interval
        print "there was an error opening the url"
        return None, None
    soup = bs4(page)
    try:
        link = soup.find('div', class_='download_link').find('a').attrs['href']
    except AttributeError:
        #looks like there wasn't a download link. 
        # just grab the low-res image instead
        try:
            link = soup.find('div', class_='primary_photo').find('img').attrs['src']
        except AttributeError:
            # couldn't find the right div, 
            # try again next interval (not a robust solution)
            print "there was an error parsing the html"
            return None, None


    # now, prepend http onto the url
    link = 'http:{0}'.format(link)
    title = soup.find('div', id='page_head').find('h1').contents[0].encode('ascii', 'ignore')
    return link, title
Code Example #12
File: get_data.py Project: herch0/avito_data
def get_details_apprt(url):
	r_details = requests.get(url)
	html = bs4(r_details.text, 'html.parser')
	titre = re.sub('\s+', ' ', html.select('.page-header.mbm')[0].text)
	prix = html.select('.vi-price-label .amount')
	if len(prix) == 0:
		return None
	prix = prix[0]['title']
	aside_infos = html.select('aside.panel.panel-body.panel-info')[0].text
	m = re.match(r'.*?Nombre de pièces: ([0-9]+).*?', aside_infos, re.DOTALL)
	if m:
		nb_pieces = m.group(1)

	m = re.match('.*?Surface: (.*? m²).*?', aside_infos, re.DOTALL)
	if m:
		surface = m.group(1)
	m = re.match(r'.*?Secteur: ([\w_-]+).*?', aside_infos, re.DOTALL)
	if m:
		secteur = m.group(1)
	m = re.match(r".*?Adresse: ([\w'_\s-]+).*?Type", aside_infos, re.DOTALL)
	if m:
		adresse = re.sub(r"\s+", " ", m.group(1))

	date = html.select('.date.dtstart.value')[0]['title']
	annee_mois = extract_date_infos(date)
	annee = annee_mois[0]
	mois = annee_mois[1]
	annonce_details = {'titre': titre, 'prix': prix, 'nb_pieces': nb_pieces, 'surface': surface, 'secteur': secteur, 'adresse': adresse, 'annee': annee, 'mois': mois}
	return annonce_details
Code Example #13
File: findfeed.py Project: alabarga/SocialLearning
def findfeed(site):
    raw = requests.get(site).text
    result = []
    possible_feeds = []
    html = bs4(raw)
    feed_urls = html.findAll("link", rel="alternate")
    if len(feed_urls) > 1:
        for f in feed_urls:
            t = f.get("type",None)
            if t:
                if "rss" in t or "xml" in t:
                    href = f.get("href",None)
                    if href:
                        possible_feeds.append(href)
    parsed_url = urllib.parse.urlparse(site)
    base = parsed_url.scheme+"://"+parsed_url.hostname
    atags = html.findAll("a")
    for a in atags:
        href = a.get("href",None)
        if href:
            if "xml" in href or "rss" in href or "feed" in href:
                possible_feeds.append(base+href)
    for url in list(set(possible_feeds)):
        f = feedparser.parse(url)
        if len(f.entries) > 0:
            if url not in result:
                result.append(url)
    return(result)
Code Example #14
def categorieIgnore(url, ignoreCategories):
    page = bs4(get(url).text, "lxml")
    categories = page.select("#content ul")[1].select("a")
    for categorie in categories:
        if categorie.get_text().split("/")[0] in ignoreCategories:
            return True
    return False
Code Example #15
def get_area(url):
    base_url =  'https://www.govtrack.us'

    resp = requests.get(url)
    soup = bs4(resp.text, 'lxml')
    div = soup.find('div', {'id': 'vote_explainer'})
    links = div.findAll('a')
    if len(links) == 0:
        return "Nomination" 

    for a in links:
        #print(a)
        if is_absolute(a['href']):
            #print('is absolute')
            next_url = a['href']
        else:
            #print('relative')
            next_url = base_url + a['href']
        #print("current url: %s\nnext url: " %url, next_url)
        if 'members' in next_url:
            continue
        elif 'govtrack.us' in next_url:
            return get_from(next_url)
        else:
            continue
    raise Exception("Couldnt find link to follow!", url)
Code Example #16
File: reference.py Project: jamesgarfield/MongoBot
    def ud(self):
        if not self.values:
            self.chat("Whatchu wanna know, bitch?")
            return

        try:
            request = pageopen('http://www.urbandictionary.com/define.php',
                               params={'term': ' '.join(self.values)})
            soup = bs4(request.text)
        except:
            self.chat("parse error")
            return

        elem = soup.find('div', {'class': 'meaning'})

        try:
            defn = []
            for string in elem.stripped_strings:
                defn.append(string)
        except:
            self.chat("couldn't find anything")


        if defn:
            # Unfortunately, BeautifulSoup doesn't parse hexadecimal HTML
            # entities like &#x27; so use the parser for any stray entities.
            for paragraph in defn:
                wrapped = textwrap.wrap(paragraph, 200)
                for line in wrapped:
                    self.chat(unescape(line))
        else:
            self.chat("couldn't find anything")
Code Example #17
File: movie.py Project: kannibalox/PTPAPI
 def load_html_data(self):
     """Scrape all data from a movie's HTML page"""
     soup = bs4(session.base_get("torrents.php", params={'id': self.ID}).text, "html.parser")
     self.data['Cover'] = soup.find('img', class_='sidebar-cover-image')['src']
     # Title and Year
     match = re.match(r'(.*) \[(\d{4})\]', soup.find('h2', class_='page__title').encode_contents())
     self.data['Title'] = match.group(1)
     self.data['Year'] = match.group(2)
     # Genre tags
     self.data['Tags'] = []
     for tagbox in soup.find_all('div', class_="box_tags"):
         for tag in tagbox.find_all("li"):
             self.data['Tags'].append(tag.find('a').string)
     self.data['Directors'] = []
     for director in soup.find('h2', class_='page__title').find_all('a', class_='artist-info-link'):
         self.data['Directors'].append({'Name': director.string})
     # File list & trumpability
     for tor in self['Torrents']:
         # Get file list
         filediv = soup.find("div", id="files_%s" % tor.ID)
         tor.data['Filelist'] = {}
         basepath = re.match(r'\/(.*)\/', filediv.find("thead").find_all("div")[1].get_text()).group(1)
         for elem in filediv.find("tbody").find_all("tr"):
             bytesize = elem("td")[1]("span")[0]['title'].replace(",", "").replace(' bytes', '')
             filepath = os.path.join(basepath, elem("td")[0].string)
             tor.data['Filelist'][filepath] = bytesize
         # Check if trumpable
         if soup.find(id="trumpable_%s" % tor.ID):
             tor.data['Trumpable'] = [s.get_text() for s in soup.find(id="trumpable_%s" % tor.ID).find_all('span')]
         else:
             tor.data['Trumpable'] = []
Code Example #18
File: module_giantbomb.py Project: jmbjr/pyfibot
def command_gb(bot, user, channel, args):
    """.gb upcoming - Returns any posted upcoming items at GiantBomb.com (it's a website about video games)"""
    global videos
    if args:
        cmds = args.split()
        subcommand = cmds[0]
        if subcommand == "ql":
            bot.say(channel, "Latest QL: %s" % videos['ql'])
        elif subcommand == "feature":
            bot.say(channel, "Latest Feature: %s" % videos['feature'])
        elif subcommand == "sub":
            bot.say(channel, "Latest Subscriber Content: %s" % videos['sub'])
        elif subcommand == "article":
            bot.say(channel, "Latest Article: %s" % videos['article'])
        elif subcommand == "review":
            bot.say(channel, "Latest Review: %s" % videos['review'])
        elif subcommand == "bombastica":
            bot.say(channel, "Latest Bombastica: %s" % videos['bombastica'])
        elif subcommand == "upcoming":
            page = bs4(urllib.urlopen("http://www.giantbomb.com/"))
            upcoming = page.find("dl", {"class": "promo-upcoming"})
            slots = upcoming.find_all("dd")
            if len(slots) == 0:
                bot.say(channel, "No items on the upcoming list! Alert @GiantBombStats!")
            else:
                if len(cmds) > 1 and cmds[1] == "nopat":
                    before = len(slots)
                    slots = [slot for slot in slots if not str(slot.find("h4").text).__contains__("Scoops")]
                    bot.say(channel, "NOPAT MODE ACTIVATED - %s ITEMS ELIMINATED" % (before - len(slots)))
                bot.say(channel, "%d Upcoming Items (times in EST):" % len(slots))
                for slot in slots:
                    text = slot.find("h4").text
                    time = slot.find("p").text
                    bot.say(channel, "%s - %s" % (text, time))
Code Example #19
 def extract_mosque(self, mosque, page):
     """Extract Mosque."""
     fields = [field.attname.replace('_id', '')
               for field in Mosque._meta.fields
               if field.attname != 'id']
     mosque_text = re.sub(u"(\u2018|\u2019)", "'", mosque.text)
     mosque_link = '/'.join([root_url, mosque.find('a').get('href').split('../')[-1]])
     log_text = '\nWriting {} to file, from page {}'
     self.stdout.write(log_text.format(mosque_text, page))
     mosque_page = bs4(requests.get(mosque_link).content, 'html.parser')
     rows_selector = '#mosque_info_contents table table tr'
     mosque_info_rows = mosque_page.select(rows_selector)
     values = {}
     # page is a giant table, so go over the rows
     for row in mosque_info_rows:
         cells = row.find_all('td')
         # check we have the right fields
         try:
             key = cells[0].text.replace(':', '').lower().strip().replace(' ', '_')
         except (IndexError, AttributeError):
             import pdb; pdb.set_trace()
             # if there is no key or 'replace' attribute, we probably don't want it
             continue
         if len(cells) == 2 and key in fields:
             values[key] = cells[1].text
     name_address = mosque_page.select('#results h1')
     matches = re.match(r'(?P<name>[^(]*)\(', name_address[0].text)
     values['name'] = matches.group('name').strip()
     values['rating'] = mosque_page.select('.star_rating strong')[0].text
     values['mdpk'] = mosque_link.split('/')[-1]
     self.stdout.write(str(set(values.keys()) ^ set(fields)))
Code Example #20
File: main.py Project: jataggart/xkcd-dl
def update_dict():
    """
    getting the info from the archive page. url="http://xkcd.com/archive/" 
    """
    archive_page = requests.get(ARCHIVE_URL)
    if archive_page.status_code == 200:
        page_content = archive_page.content
        archive_soup = bs4(page_content, "html.parser")

        ## now get all the <a> tags under the div '<div class="box" id="middleContainer">' from the soup object
        for data in archive_soup.find_all("div", {"id": "middleContainer"}):
            ## this gets all the contents inside "<div class="box" id="middleContainer">"
            ## now to get the individual links
            for alinks in data.find_all("a"):  ## tries to get all the <a> tags from the 'data' object
                href = alinks.get("href").strip("/")  ## the href stored is in form of eg: "/3/". So make it of form "3"
                date = alinks.get("title")
                description = alinks.contents[0]
                make_keyvalue_list(href, date, description)

        with open(xkcd_dict_filename, "w") as f:
            json.dump(XKCD_DICT, f)
            print(
                "XKCD link database updated\nStored it in '{file}'. You can start downloading your XKCD's!\nRun 'xkcd-dl --help' for more options".format(
                    file=xkcd_dict_filename
                )
            )

    else:
        print("Something bad happened!")
Code Example #21
def getAllLaunches():
  launchEntry = 0
  tableDepth = 0
  launchEvents = list()
  global listCount
  try:
    # Grab the entire page
    launchCalHandle = urllib.request.urlopen('http://www.spaceflightinsider.com/launch-schedule/')
    launchCalHtml = launchCalHandle.read()
    soup = bs4(launchCalHtml, 'html.parser')
    # Cleanup the Launch Entries as a string with consistent spacing, allows
    # better modularization of the script.
    for launchEvent in soup.body.find_all(launch_table)[1:]:
      # Increment the list counter
      listCount += 1
      launchFields = list()
      launchString = re.sub(' +', ' ', launchEvent.prettify().replace('\n', ' ').replace('\r', ''))
      # print(launchString)
      # Get the launchID
      launchFields.append(launchString.split('"launchcalendar" id="')[1].split('"> <tr>')[0].strip())
      # Get the date, bypass non-hard-scheduled launches
      launchFields.append(launchString.split('</span> <span>')[1].split(' </span>')[0].strip())
      if(
      not('Jan' in launchFields[-1]) and not('Feb' in launchFields[-1]) and
      not('Mar' in launchFields[-1]) and not('Apr' in launchFields[-1]) and
      not('May' in launchFields[-1]) and not('Jun' in launchFields[-1]) and
      not('Jul' in launchFields[-1]) and not('Aug' in launchFields[-1]) and
      not('Sep' in launchFields[-1]) and not('Oct' in launchFields[-1]) and
      not('Nov' in launchFields[-1]) and not('Dec' in launchFields[-1])):
        continue
      # Get the time, bypass non-hard-scheduled launches
      if("Time" in launchString):
        if("TBD" in launchString.split('<th> Time </th> <td>')[1].split(' </td>')[0].strip()):
          continue
        else:
          tempTime = splitTimeFields(launchString.split('<th> Time </th> <td>')[1].split(' </td>')[0].strip())
          for timeField in tempTime:
            launchFields.append(timeField)
      else:
        continue
      # Get the Location
      launchFields.append(launchString.split('<th> Location </th> <td>')[1].split('</td>')[0].strip())
      # Get the Satellite
      launchFields.append(launchString.split('<th colspan="2">')[1].split('</th>')[0].strip())
      # Get the Launch Vehicle
      if("<wbr>" in launchString.split('<br/>')[1].split('</td>')[0].strip()):
        launchFields.append(re.sub(' </wbr>', '', re.sub(' <wbr> ', '', launchString.split('<br/>')[1].split('</td>')[0].strip())))
      else:
        launchFields.append(launchString.split('<br/>')[1].split('</td>')[0].strip())
      # Get the description
      launchFields.append(launchString.split('"description" colspan="2"> <p>')[1].split('</p>')[0].strip())
      # Convert Stored Data to writeEvent()
      writeEvent(convertLaunchData(launchFields))
  except urllib.error.HTTPError:
    print("There was an error accessing the Space Flight Insider Launch Schedule.")
    print("The server could be down or having issues. Try again.")
  except urllib.error.URLError:
    print("There was an error decoding the URL for the Space Flight Insider Launch Schedule. :::nodename not known :::")
    print("Check that your computer has access to the Internet.")
Code Example #22
File: api.py Project: kannibalox/PTPAPI
 def contest_leaders(self):
     """Get data on who's winning"""
     LOGGER.debug("Fetching contest leaderboard")
     soup = bs4(session.base_get("contestleaders.php").content, "html.parser")
     ret_array = []
     for cell in soup.find('table', class_='table--panel-like').find('tbody').find_all('tr'):
         ret_array.append((cell.find_all('td')[1].get_text(), cell.find_all('td')[2].get_text()))
     return ret_array
Code Example #23
File: d99.py Project: wenLiangcan/d99
 def _get_volumes_0_1(self, html):
     soup = bs4(html, 'lxml')
     lis = soup.select_one('.vol > .bl').find_all('li')
     vols = [
         (li.a.text, 'http://{}{}'.format(self.domain, li.a.attrs['href']))
         for li in lis
     ]
     return self._sort_vol_by_title(vols)
Code Example #24
def categorie_ignore(url, ignore_categories):
    """ ignores defined categories of wallappers  """
    page = bs4(get(url).text, "lxml")
    categories = page.select('#content ul')[1].select('a')
    for categorie in categories:
        if categorie.get_text().split('/')[0] in ignore_categories:
            return True
    return False
Code Example #25
File: api.py Project: kannibalox/PTPAPI
 def log(self):
     """Gets the PTP log"""
     soup = bs4(session.base_get('/log.php').content, "html.parser")
     ret_array = []
     for message in soup.find('table').find('tbody').find_all('tr'):
         ret_array.append((message.find('span', class_='time')['title'],
                           message.find('span', class_='log__message').get_text().lstrip().encode('UTF-8')))
     return ret_array
Code Example #26
File: app.py Project: eunicekokor/devfest16
def check_review(query):
  results = []
  soup = bs4(requests.get(query).text, 'html.parser')
  soup = soup.findAll('p')
  for paragraph in soup:
    paragraph = str(paragraph).replace('<p>','')
    paragraph = str(paragraph).replace('</p>','')
  return soup
Code Example #27
File: DMiner.py Project: revizor1/DMiner
def MainSearch(keyword, base="http://seeker.dice.com", tail="/jobsearch/servlet/JobSearch?op=100&NUM_PER_PAGE=5&FREE_TEXT="):
    """
    Get job listings from main keyword search and returns bs
    """
    url = base + tail + keyword
    resp = urllib.request.urlopen(url)
    soup = bs4(resp.read(), from_encoding=resp.info().get_param('charset'))
    return soup
Code Example #28
File: dumpJson.py Project: spk921/scrapers
def getCareerHistory(url):
    req = re.get(url)
    soup = bs4(req.text, 'html.parser')
    careerHistory        = soup.findAll("div", { "class" : "markets_module bio_career" })
    corporateInformation = soup.findAll("div", { "class" : "markets_module corporate_info" })
    memberShips          = soup.findAll("div", { "class" : "markets_module bio_membership" })

    return str(careerHistory), str(corporateInformation), str(memberShips)
Code Example #29
File: arconai.py Project: josefson/dotfiles
 def shows(self):
     """Read shows into list of Shows."""
     response = requests.get(self.base_url)
     source = response.text
     soup = bs4(source, "html.parser")
     show_tags = soup.select("a[title]")
     show_tags = [tag for tag in show_tags if tag.has_attr("class") is False]
     shows = [Show(tag["title"], tag["href"]) for tag in show_tags]
     return shows
Code Example #30
def html_to_text(html):
	soup = bs4(html)
	b = soup.find(id="bodyContent")
	if b:
		#wikipedia page
		return b.get_text()
	else:
		#fall back onto just grabbing all text
		return soup.get_text()
Code Example #31
    def stats(self):
        """
        Return all stats associated with a user

        :rtype: A dictionary of stat names and their values, both in string format.
        """
        soup = bs4(
            session.base_get('user.php', params={
                'id': self.ID
            }).text, "lxml")
        stats = {}
        for li in soup.find('span', text='Stats').parent.parent.find_all('li'):
            stat, value = self.__parse_stat(li.text)
            stats[stat] = value
        for li in soup.find('span',
                            text='Personal').parent.parent.find_all('li'):
            stat, value = self.__parse_stat(li.text)
            if value:
                stats[stat] = value
        for li in soup.find('span',
                            text='Community').parent.parent.find_all('li'):
            stat, value = self.__parse_stat(li.text)
            if stat == "Uploaded":
                match = re.search(r'(.*) \((.*)\)', value)
                stats["UploadedTorrentsWithDeleted"] = match.group(1)
                value = match.group(2)
                stat = "UploadedTorrents"
            elif stat == "Downloaded":
                stat = "DownloadedTorrents"
            elif stat == "SnatchesFromUploads":
                match = re.search(r'(.*) \((.*)\)', value)
                stats["SnatchesFromUploadsWithDeleted"] = match.group(1)
                value = match.group(2)
            elif stat == "AverageSeedTime(Active)":
                stat = "AverageSeedTimeActive"
            stats[stat] = value
        return stats
Code Example #32
    def getContent(self, ARTICLE_List, record):
        newsLists = []
        articleIDList = []
        driver = webdriver.PhantomJS()
        for articleURL in ARTICLE_List:
            if articleURL in record:
                continue
            sys.stdout.write('\r             ' + ' ' * 65)
            sys.stdout.write('\r    URL: ' + articleURL[:69])
            t.sleep(random.randint(5, 8))
            r = driver.get(articleURL)
            pageSource = driver.page_source
            soup = bs4(pageSource, 'html.parser')
            news = soup.find(class_='main-container')
            content = ""
            title = str(news.find('p').text)
            time = re.split('/', news.find(class_='date-display-single').text)
            datetime = '/'.join(time[:3]) + ' 00:00'
            article = news.find(
                class_='node node-post node-promoted clearfix').findAll('p')

            #filter fault news
            if t.strftime('%Y/%m/%d', t.localtime()) not in datetime:
                continue
            else:
                pass

            for contents in article:
                content += contents.text

            articleID = ''.join(time) + '0000000'
            while articleID in articleIDList:
                articleID = str(int(articleID) + 1)
            articleIDList.append(articleID)
            articleID = 'cld' + articleID
            newsLists.append([articleID, articleURL, title, datetime, content])
        return newsLists
Code Example #33
    def APOPND_Ovencoffee_crawler(self):
        ContentList = []

        for hundreds in range(10):
            for tens in range(10):
                for units in range(10):
                    page = str(hundreds) + str(tens) + str(units)
                    url = "http://www.ovencoffee.com.tw/store_list.asp?storeid=" + page
                    '''Set up a retry/return mechanism, since multiple connections may be created'''
                    res = requests.Session()
                    res.keep_alive = False
                    retry = Retry(connect=5, backoff_factor=0.5)
                    adapter = HTTPAdapter(max_retries=retry)
                    res.mount('https://', adapter)
                    res = res.get(url, headers=headers)
                    res.encoding = ("utf-8")

                    soup = bs4(res.text, 'html.parser')
                    # print(soup)
                    data = soup.findAll('p')

                    if data[1].text != "" and data[1].text is not None:
                        name = data[1].text
                        phone = re.split(':', data[2].text)[1]
                        business_time = str.join('',
                                                 re.split(':', data[3].text))
                        ContentList.append([name, phone, business_time])

        Filename = 'APOPND_Ovencoffee.csv'  # change this as needed
        storage_dir = "data/csv/"  # change this as needed

        df = pd.DataFrame(data=ContentList,
                          columns=['Name', 'Phone', 'Business_time'])
        df.to_csv(storage_dir + Filename,
                  sep=',',
                  encoding='utf_8_sig',
                  index=False)
Code Example #34
File: sisal.py Project: simonemastella/SureBetFinder
def scrap(link):
    try:    
        session = HTMLSession()
        with session.get(link) as res:
            res.html.render() 
            soup = bs4(res.html.html, 'html5lib')
            risultato=pd.DataFrame(columns=['tipo','casoF','casoV'])
            tags= soup.findAll("div",{"class":"TabellaEsitiRow-hvzh1w-0 jxwiSe"})
            esdop=tags[0].findAll("div",{"class":"EsitoButton-mp5c0x-0 dQZBRx"})
            new_row = {'tipo':"1X-2", 'casoF':esdop[3].getText(), 'casoV':esdop[2].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"12-X", 'casoF':esdop[5].getText(), 'casoV':esdop[1].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"2X-1", 'casoF':esdop[4].getText(), 'casoV':esdop[0].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            gng=tags[1].findAll("div",{"class":"EsitoButton-mp5c0x-0 dQZBRx"})
            new_row = {'tipo':"GOL/NOGOL", 'casoF':gng[1].getText(), 'casoV':gng[0].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            over=tags[2].findAll("div",{"class":"TabellaColumn-nrcwsc-0 iJTAjk"})
            under=tags[3].findAll("div",{"class":"TabellaColumn-nrcwsc-0 iJTAjk"})
            new_row = {'tipo':"UNDER/OVER 0.5", 'casoF':under[0].getText(), 'casoV':over[0].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"UNDER/OVER 1.5", 'casoF':under[1].getText(), 'casoV':over[1].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"UNDER/OVER 2.5", 'casoF':under[2].getText(), 'casoV':over[2].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"UNDER/OVER 3.5", 'casoF':under[3].getText(), 'casoV':over[3].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"UNDER/OVER 4.5", 'casoF':under[4].getText(), 'casoV':over[4].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"UNDER/OVER 5.5", 'casoF':under[5].getText(), 'casoV':over[5].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
        return risultato
    except:
        session.close()
        print("Errore nella ricerca DATI su SISAL, cerco di nuovo")
        return scrap(link) 
Code Example #35
File: sisal.py Project: simonemastella/SureBetFinder
def scrapCampionato(num):
    campionato=["https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:21",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:22",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:18",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:153",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:86",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:1",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:79",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:137",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:4",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:3",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:14",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:15",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:29",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:30",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:54",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:190"]
    # Italy, Champions and Europa League, England, Spain, Germany, France, Netherlands, Portugal
    risultato=pd.DataFrame(columns=['giorno','ora','match','link'])
    session = HTMLSession()
    with session.get(campionato[num]) as res:
        res.html.render() 
        soup = bs4(res.html.html, 'html5lib')
        partite= (soup.findAll("div",{"TabellaEsitiRow-hvzh1w-0 eyTFpO"}))
        for partita in partite:
            match=partita.find("span",{"class":"AvvenimentoDescription-rieyuj-0 clFosV"}).getText().strip()
            dataora=partita.find("span",{"class":"AvvenimentoDate-giucxs-0 iaSisn"}).getText().strip().split(" ")
            link ="https://www.sisal.it"+partita.find("a",{"class":"AvvenimentoDetailWrapper-w9f4wf-0 bhgtKE"}).get("href")
            ora=dataora[2]
            data=dataora[0].split("/")
            new_row = {'giorno':data[0], 'ora':ora, 'match':match,'link':link}
            risultato = risultato.append(new_row, ignore_index=True)
    if len(risultato)!=0:
        return risultato
    else:
        print("SISAL RIPROVO")
        return scrapCampionato(num)
Code Example #36
File: data.py Project: kirillmasanov/tg_price_bot
def get_data(html):
    item_list = []
    soup = bs4(html, 'lxml')
    positions = soup.find_all(
        'div',
        class_=
        'products-view-block js-products-view-block products-view-block-static'
    )
    for position in positions:
        pos_article = position.find('div', class_='col-xs-8 align-right').text
        pos_name = position.find('span', class_='products-view-name-link').text
        pos_price = position.find('div', class_='price-number').text
        data = {
            'article': norm_article(pos_article),
            'name': norm_name(pos_name),
            'price': norm_price(pos_price)
        }
        # print(f'Артикул: {norm_article(pos_article)}')
        # print(f'Наименование: {norm_name(pos_name)}')
        # print(f'Цена: {norm_price(pos_price)} руб.')
        # print('=' * 20)
        # print(data)
        item_list.append(data)
    return item_list
Code Example #37
File: functions.py Project: lunbon/rocketbot
def get_ranks_by_nikname(platform, nikname):
    first_role = 'Unranked (2vs2)'
    second_role = 'Unranked (3vs3)'
    try:
        response = requests.get(url % (platform, nikname))
        html = response.text
        soup = bs4(html, 'html.parser')
        for tab in soup.find_all('table'):
            if 'Playlist' in str(tab):
                table = tab
                break
        playTable = table
        trs = playTable.find_all('tr')
        for tr in trs[1:]:
            if 'Ranked Doubles 2v2' in str(tr.find_all('td')[1]):
                r2v2 = tr.small
                first_role = (str(r2v2).split('\n')[1] + ' (2vs2)').strip()
            if 'Ranked Standard 3v3' in str(tr.find_all('td')[1]):
                r3v3 = tr.small
                second_role = (str(r3v3).split('\n')[1] + ' (3vs3)').strip()

        return (first_role, second_role)
    except:
        return False
Code Example #38
def insta(username, gecko_path):
    """
    Download images from instagram
    """

    link = URL + username
    print("Downloading images {}...".format(username))

    with Browser("firefox", headless=True,
                 executable_path=gecko_path) as browser:
        browser.visit(link)
        html = browser.html
        soup = bs4(html, "html.parser")

    data = soup.findAll("img")

    for x in data:
        x = x.get("src")
        os.system(
            f"wget --no-check-certificate -c -N -P Images/{username} {x}")
        print("Downloaded {}".format(x))

    def rename_image_dir(foldername):
        i = 1
        dirName = os.path.join("./Images/", foldername)
        path = os.getcwd() + dirName
        for filename in os.listdir(dirName):
            if not filename.endswith(".jpg"):
                os.rename(
                    os.path.join(path, filename),
                    os.path.join(path, foldername + '_' + str(i) + ".jpg"))
            i += 1

    rename_image_dir(username)

    print("\nFiles downloaded into Images/{}".format(username))
Code Example #39
File: hospital_spider.py Project: danyuzhen/python
def get_baike_text(hospital_list,urls):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com',
        "Referer": "https://baike.baidu.com/",
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    }
    for i in range(len(urls)):
        html = req.get(urls[i], headers=headers).content.decode('utf8')
        soup=bs4(html,'lxml')
        des = soup.find('div', {'class': 'lemma-summary'})
        if not des is None:
            des=re.sub('\[[0-9]{1,2}\]', "", des.get_text()).replace('\n','').replace('\xa0','')\
                .replace('"','“').replace("'","‘")
            sql='update hospital set description="'+des+'" where name="'+hospital_list[i]+'"'
            print('剩余'+str(len(urls)-i)+':'+urls[i])
            cursor.execute(sql)
            db.commit()
        else:
            print('未收录:'+urls[i])
            continue
Code Example #40
def scrape_search_results(areas):
    '''
    scrapes search page, collects information about the cars available for sale.
    '''
    results = []  
    search_indices = np.arange(0, 300, 100)
    for area in areas:
        print area
        for i in search_indices:
            url = 'http://sfbay.craigslist.org/search/{0}/cta'.format(area)
            resp = requests.get(url, params={'hasPic': 1, 's': i})
            txt = bs4(resp.text, 'html.parser')
            cars = txt.findAll(attrs={'class': "row"})
            tags=txt.findAll('img')
            img_tags = "\n".join(set(tag['src'] for tag in tags))
            title = [rw.find('a', attrs={'class': 'hdrlnk'}).text
                          for rw in cars]
            links_raw = [rw.find('a', attrs={'class': 'hdrlnk'})['href']
                     for rw in cars]
            links = ['http://sfbay.craigslist.org'+car_link for car_link in links_raw]
            # find the time and the price
            time = [pd.to_datetime(rw.find('time')['datetime']) for rw in cars]
            price = find_prices(cars)

            # create a dataframe to store all the data
            data = np.array([time, price, title, links])
            col_names = ['time', 'price', 'title', 'link']
            df = pd.DataFrame(data.T, columns=col_names)

            # add the location variable to all entries
            df['loc'] = area
            results.append(df)

    # concatenate all the search results
    results = pd.concat(results, axis=0)
    return results
Code Example #41
def main(prefix, docnum):
    # Get the waybill number
    # prefix= '784'
    # docnum = '27400365'
    open_text()
    CZ._data['ctl00$ContentPlaceHolder1$txtPrefix'] = prefix
    CZ._data['ctl00$ContentPlaceHolder1$txtNo'] = docnum
    r = requests.post(CZ.url + CZ.prefix + prefix + CZ.awbno + docnum +
                      CZ.menuid + CZ.lang,
                      data=CZ._data)
    content = bs4(r.content, 'html.parser')
    table = content.find_all('table')
    status = list(table[2].stripped_strings)
    flight = []
    for i in range(len(status)):
        if re.search(r'[\dA-Z]{4,10}', status[i]) and re.search(
                r'[-\d]{8,10}', status[i + 1]):
            d = {
                'air_code': '%s' % status[i],
                '_dep_port': '%s' % status[i - 2],
                '_dest_port': '%s' % status[i - 1],
                'airline_comp': 'CZ',
            }
            flight.append(d)

    status = list(table[4].stripped_strings)
    j = 0
    for i in range(len(status)):
        if 'Cargo has been loaded' in status[i]:
            flight[j].update({'_atd': '%s' % status[i - 3]})
            flight[j].update({'_std': '%s' % status[i - 3]})
        if 'Flight has arrived' in status[i]:
            flight[j].update({'_ata': '%s' % status[i - 3]})
            flight[j].update({'_sta': '%s' % status[i - 3]})
            j += 1
    return flight
Code Example #42
def get_lyrics(artist, song):

    if not isinstance(artist, str):
        raise TypeError("The artist name should be a string")
    if not isinstance(song, str):
        raise TypeError("The song name should be a string")

    artist_name, song_name = _clean_names(artist, song)
    # print(artist_name, song_name)
    url = _create_url(artist_name, song_name)

    try:
        page = _get_page(url)
    except ValueError:
        return []

    soup = bs4(page, "html.parser")
    mydivs = soup.find("div", {"class": "ringtone"})
    lyrics = mydivs.find_next_sibling("div")

    # Use the .stripped_strings generator to remove all extra whitespace
    # and strings consisting only of whitespace
    lyric_list = [text for text in lyrics.stripped_strings]
    return lyric_list
Code Example #43
def scraper(pgs1, pgs2, srt):
    pages = []
    prices = []
    stars = []
    titles = []

    data = {'Title': titles, 'Price': prices, 'Rating': stars}

    for numPgs in range(int(pgs1), int(pgs2) + 1):
        url = (
            'http://books.toscrape.com/catalogue/category/books_1/page-{}.html'
            .format(numPgs))
        pages.append(url)
    for item in pages:
        page = requests.get(item)
        soup = bs4(page.text, 'html.parser')
        for iterA in soup.findAll('h3'):
            ttl = iterA.getText()
            titles.append(ttl)
        for iterB in soup.findAll('p', class_='price_color'):
            price = iterB.getText()
            prices.append(price)
        for iterC in soup.findAll('p', class_='star-rating'):
            for key, value in iterC.attrs.items():
                star = value[1]
                stars.append(star)

    if (srt == "title"):
        titles.sort()
    elif (srt == "price"):
        prices.sort()
    elif (srt == "rating"):
        stars.sort()

    df = pd.DataFrame(data=data)
    return df
Code Example #44
File: snai.py Project: simonemastella/SureBetFinder
def scrap(link):
    try:
        session = HTMLSession()
        with session.get(link) as res:
            res.html.render() 
            soup = bs4(res.html.html, 'html5lib')
            tags= soup.findAll("table",{"class":"table table-bordered table-condensed table-striped table-hover margin-bottom-10 ng-scope"})
            risultato=pd.DataFrame(columns=['tipo','casoF','casoV'])
            esatto=tags[0].findAll("span",{"class":"ng-binding ng-scope"})
            doppia=tags[1].findAll("span",{"class":"ng-binding ng-scope"})
            new_row = {'tipo':"1X-2", 'casoF':doppia[0].getText().strip(), 'casoV':esatto[2].getText().strip()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"12-X", 'casoF':doppia[2].getText().strip(), 'casoV':esatto[1].getText().strip()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"2X-1", 'casoF':doppia[1].getText().strip(), 'casoV':esatto[0].getText().strip()}
            risultato = risultato.append(new_row, ignore_index=True)
            tipo=tags[2].find("div",{"class":"pull-left ng-binding"}).getText().strip()
            t=tags[2].findAll("span",{"class":"ng-binding ng-scope"})
            quotaF=t[1].getText().strip()
            quotaV=t[0].getText().strip()
            new_row = {'tipo':tipo, 'casoF':quotaF, 'casoV':quotaV}
            if "GOL" in tipo:
                risultato = risultato.append(new_row, ignore_index=True)
            for tag in tags[3:12]:
                tipo=tag.find("div",{"class":"pull-left ng-binding"}).getText().strip()
                t=tag.findAll("span",{"class":"ng-binding ng-scope"})
                quotaF=t[0].getText().strip()
                quotaV=t[1].getText().strip()
                new_row = {'tipo':tipo, 'casoF':quotaF, 'casoV':quotaV}
                if "UNDER" in tipo:
                    risultato = risultato.append(new_row, ignore_index=True)
            return risultato
    except:
        print("Errore nella ricerca DATI su SNAI, cerco di nuovo")
        session.close()
        return scrap(link)
Code Example #45
 def __detect_virtual_pairs(self):
     """Auto-detect virtual pairs by their record file header."""
     virtual_pairs = []
     # RegEx for matching pair number and names in pair record header
     pair_header_match = re.compile('([0-9]{1,}): (.*) - (.*), .*')
     for record_file_path in self.__pair_records_files:
         log.getLogger('detect').debug('examining record file %s',
                                       record_file_path)
         with file(record_file_path) as record_file:
             record = bs4(record_file, 'lxml')
         # first <td class="o1"> with content matching
         # pair header is what we're after
         header = [con for con
                   in record.select('td.o1')[0].contents
                   if isinstance(con, NavigableString) and re.search(
                       pair_header_match, con)]
         log.getLogger('detect').debug('detected header: %s', header)
         if len(header):
             header_match = re.match(pair_header_match, header[0])
             pair_number = int(header_match.group(1))
             names = [name for name in [header_match.group(2).strip(),
                                        header_match.group(3).strip()]
                      if len(name)]
             log.getLogger('detect').debug('parsed header: %d, %s',
                                           pair_number, names)
             # virtual pair does not have any names filled
             if len(names) == 0:
                 virtual_pairs.append(pair_number)
     if len(virtual_pairs) == 0:
         log.getLogger('detect').warning('No virtual pairs detected')
     else:
         log.getLogger('detect').info('virtual pairs: %s',
                                      ' '.join(sorted(
                                          [str(pair) for pair
                                           in virtual_pairs])))
     return sorted(virtual_pairs)
Code Example #46
def fill_pair_list_table(cells, row_cell_count=20):
    """Format cell list into well-formed rows, aligned by column count."""
    content = bs4('<table />', 'lxml')
    content.append(content.new_tag('table'))
    # first filler cell of each new row
    first_cell = content.new_tag('td', **{'class': 'n'})
    first_cell.string = u'\xa0'
    # arrange cells into rows, full rows first
    while len(cells) >= row_cell_count:
        new_row = content.new_tag('tr')
        new_row.append(copy.copy(first_cell))
        for cell in cells[0:row_cell_count]:
            new_row.append(cell)
        content.table.append(new_row)
        log.getLogger('rec_list').debug('aligning cells %s to %s in a row',
                                        cells[0].a.contents,
                                        cells[row_cell_count-1].a.contents)
        del cells[0:row_cell_count]
    # last row may or may not be full
    last_row = content.new_tag('tr')
    last_row.append(copy.copy(first_cell))
    for cell in cells:
        last_row.append(cell)
    log.getLogger('rec_list').debug('leaving cells %s to %s in last row',
                                    cells[0].a.contents,
                                    cells[-1].a.contents)
    # if it wasn't full, fill it with a col-spanned last cell
    if len(cells) < row_cell_count:
        last_cell = content.new_tag('td',
                                    colspan=row_cell_count-len(cells))
        last_cell.string = u'\xa0'
        last_row.append(last_cell)
        log.getLogger('rec_list').debug('filling last row with: %s',
                                        last_cell)
    content.table.append(last_row)
    return content.table.contents[:]
Code Example #47
def videoDetail(videoUrl):
    try:
        itemPage = bs4(req.get(videoUrl).text, 'html.parser')
    except Exception as e:
        # print(e)
        print('=====> request failed/ check network connection!')
    choices = [
        i['aria-label'].split(' ')[-1]
        for i in itemPage.select('.menu-list .link a')
    ]
    downloadLinks = {}
    for itemLink in itemPage.select('.menu-list .link a'):
        downloadLinks[itemLink['aria-label'].split(' ')[-1]] = itemLink['href']
    questions = [
        inq.List(
            'quality',
            message="\U0001F914  Select quality",
            choices=choices,
        ),
    ]
    answer = inq.prompt(questions)
    itemPageDownloadLink = downloadLinks[answer['quality']]
    itemTitle = f"{itemPage.select('#videoTitle')[0].text}-{answer['quality']}"
    return itemTitle, itemPageDownloadLink
Code Example #48
    def inbox(self, page=1):
        """Fetch a list of messages from the user's inbox
        Incidentally update the number of messages"""
        soup = bs4(
            session.base_get('inbox.php', params={
                'page': page
            }).text, "html.parser")

        self.new_messages = self.__parse_new_messages(soup)

        for row in soup.find(id="messageformtable").tbody.find_all('tr'):
            yield {
                'Subject':
                row.find_all('td')[1].text.encode('UTF-8').strip(),
                'Sender':
                row.find_all('td')[2].text,
                'Date':
                row.find_all('td')[3].span['title'],
                'ID':
                re.search(r'id=(\d+)',
                          row.find_all('td')[1].a['href']).group(1),
                'Unread':
                True if 'inbox-message--unread' in row['class'] else False
            }
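
Because inbox() is a generator yielding one dict per message row, a caller simply iterates it; a quick sketch, where user stands for an instance of the snippet's class and the module-level session it uses is already authenticated (both assumptions):

# Hypothetical caller: `user` is an instance of the snippet's class.
for message in user.inbox(page=1):
    marker = '*' if message['Unread'] else ' '
    print(marker, message['Date'], message['Sender'], message['Subject'])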
コード例 #49
0
ファイル: stock.py プロジェクト: imtkusd20172018/InsurMan
def getStock(code, query):
    reply = ''
    stock = [[
        'Time', 'Market Price', 'Buy', 'Sell', 'Rise&Fall', 'Volume',
        'Previous Close', 'Open', 'High', 'Low'
    ]]
    driver = webdriver.PhantomJS(
        executable_path=
        'D:\\Anaconda3.6\\Scripts\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe'
    )
    x = code
    r = driver.get('https://tw.stock.yahoo.com/q/q?s=' + x)
    pageSource = driver.page_source
    soup = bs4(pageSource, 'html.parser')
    tables = soup.find_all('table')
    row = tables[5].find_all('td')[0].findAll('tr')[1].findAll('td')[1:-1]
    for item in row:
        row[row.index(item)] = item.text.strip()
    if query == '市價':
        reply = row[1]
    elif query == '買價':
        reply = row[2]
    elif query == '賣價':
        reply = row[3]
    elif query == '成交量':
        reply = row[5]
    elif query == '前一天收盤價':
        reply = row[6]
    elif '開盤' in query:
        reply = row[7]
    elif '最高' in query:
        reply = row[8]
    elif '最低' in query:
        reply = row[9]

    return (code + ' 的' + query + '是 ' + reply + 'ㄛ~~~~<3')
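
The function above presumably sits next to from selenium import webdriver and a BeautifulSoup-as-bs4 import; note that PhantomJS support was removed in Selenium 4, so a headless Chrome or Firefox driver is the usual substitute today. A hedged call, with an illustrative TWSE ticker:

# '2330' is only an illustrative ticker; the query strings mirror the elif chain above.
print(getStock('2330', '市價'))
print(getStock('2330', '成交量'))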
コード例 #50
0
ファイル: parse_tzcs.py プロジェクト: cinit/Xinshou
def parse(html):
    res = {
        '身高测量': -1,
        '体重测量': -1,
        '肺活量': -1,
        '50米跑': -1,
        '立定跳远': -1,
        '1000米跑': -1,
        '800米跑': -1,
        '坐体前屈': -1,
        '仰卧起坐': -1,
        '引体向上': -1,
        '左眼视力': -1,
        '右眼视力': -1
    }
    html = bs4(html, 'html.parser')
    tr = html.find_all('tr')
    for i in tr:
        td = i.find_all('td')
        if not td:
            continue
        if td[0].text in res:
            res[td[0].text] = td[1].text
    return res
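
A small driver for parse() above, fed an inline HTML fragment shaped like the <td>label</td><td>value</td> rows it scans for; the real page layout is not shown in the snippet, so this markup is only an assumption:

sample_html = """
<table>
  <tr><td>身高测量</td><td>175</td></tr>
  <tr><td>体重测量</td><td>65</td></tr>
  <tr><td>肺活量</td><td>4200</td></tr>
</table>
"""

result = parse(sample_html)
print(result['身高测量'], result['肺活量'])  # items missing from the table stay at -1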
コード例 #51
0
    def get_followers(self):

        time.sleep(2)
        # open the followers list on any account:
        # right-click the followers link on the page, inspect it, and copy its CSS selector
        flw_btn = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                '#react-root > section > main > div > header > section > ul > li:nth-child(2) > a'
            )))
        flw_btn.click()
        # When the followers pop-up appears the page structure changes:
        # click the followers button, inspect the pop-up, find the div that gets
        # highlighted while scrolling, and copy that CSS selector
        popup = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'body > div.RnEpo.Yx5HN > div > div.isgrP')))
        for i in range(0, 10):
            time.sleep(1)
            self.driver.execute_script(
                'arguments[0].scrollTop = arguments[0].scrollHeight', popup
            )  # execute JavaScript to scroll the followers pop-up to the bottom so more entries load
        popup = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'body > div.RnEpo.Yx5HN > div > div.isgrP')))
        print('cool')
        b_popup = bs4(popup.get_attribute('innerHTML'), 'html.parser')
        for p in b_popup.findAll('li', {'class': 'woI9H'}):  # every follower entry in the pop-up
            # each li is one follower; its first 'a' tag holds the href of that follower's profile
            print(p.findAll('a')[0]['href'])
            print('Awesome')
        print('end')
コード例 #52
0
ファイル: WorldCDC.py プロジェクト: KKodiac/Covid19_Stats
class WorldCovid:
    def __init__(self):
        self.today = ctime(time())
        self.scrape_url = "https://www.worldometers.info/coronavirus/"
        self.datafile = f"./data/World/new_covid_dat.csv"
        self.appendfile = f"./data/World/world_timeseries/{self.today[4:10]+self.today[-5:]}.csv"
        self.is_updated = path.isfile(self.appendfile)


    def getData(self) -> tuple:  # (thead, tbody)
        page = requests.get(self.scrape_url)
        html = bs4(page.text, 'html.parser')
        table = html.find(id="main_table_countries_today")

        thead_all = table.thead.find_all('th')

        thead = [th.text for th in thead_all]

        tbody_all = table.find_all('tbody')
        tr_temp = [tr for tr in tbody_all[0].find_all('tr')]
        td_temp = [td.find_all('td') for td in tr_temp]
        tbody = [[j.text.strip() for j in i] for i in td_temp]

        return thead, tbody
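
A hedged way to drive the class above and persist the scraped table, assuming requests, bs4, ctime/time and os.path are imported at module level as the constructor implies; the CSV writing below is illustrative and not part of the original class:

import csv

scraper = WorldCovid()
header, rows = scraper.getData()

# Persist the scraped table locally so the sketch does not depend on ./data existing.
with open('worldometers_snapshot.csv', 'w', newline='', encoding='utf-8') as fh:
    writer = csv.writer(fh)
    writer.writerow(header)
    writer.writerows(rows)

print('Saved', len(rows), 'rows scraped on', scraper.today)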
コード例 #53
0
def getblog(value):
    try:
        #getting all the values
        if (value == 'all'):
            x = requests.get('https://www.freecodecamp.org/news/').text
        else:
            # getting the response based on particular tag
            x = requests.get(f"https://www.freecodecamp.org/news/tag/{value}").text
        soup=bs4(x,'lxml')
        hack = soup.find_all('article',class_ = 'post-card')
        # initializing a dictionary
        val={}
        val["dic"]=[]
        for i in range(0,len(hack)):
            data={}
            data["Tag"] = hack[i].find('span',class_='post-card-tags').text.strip(' \t\n\r')
            data["Blog-Title"] = hack[i].find('h2',class_='post-card-title').text.strip(' \t\n\r')
            data["Blog-link"] = hack[i].find('a',class_='post-card-image-link').get('href')
            data["Blog-link"]="https://www.freecodecamp.org"+data["Blog-link"]
            data["Author"] = hack[i].find('a',class_='meta-item').text.strip(' \t\n\r')
            val["dic"].append(data)
        return val
    except Exception as e:
        return {"status":False,"error":e}
コード例 #54
0
    def main(self):
        params = dict(pets_cat=1, max_price=2000)
        rsp = requests.get(self.url, params=params)
        html = bs4(rsp.text, 'html.parser')
        apts = html.find_all('p', attrs={'class': 'row'})
        for apt in apts:
            #    print apt.prettify()
            size = apt.findAll(attrs={'class': 'housing'})[0].text
            sqft, beds = self.find_size_and_bdrms(size)
            self.apartment['sqft'] = sqft
            self.apartment['beds'] = beds
            self.apartment['updated_datetime'] = apt.find('time')['datetime']
            self.apartment['price'] = float(apt.find('span', {'class': 'price'}).text.strip('$'))
            self.apartment['title'] = apt.find('a', attrs={'class': 'hdrlnk'}).text
            self.apartment['url'] = 'h'+self.url.strip('/search/apa') + apt.find('a', attrs={'class': 'hdrlnk'})['href']
            info = self.get_more_info(self.apartment['url'])
            

            for k,v in self.apartment.iteritems():
                print k,v
            print '\n'

            exit()
            time.sleep(1)
コード例 #55
0
        song_list.append(song)
        i += 2
    for i in range(len(song_list)):
        print(str(i + 1) + ': ' + song_list[i]['name'])
    choice = int(input('Enter the song number to download: '))
    # todo: add choice to download entire album also
    # todo: check for invalid input
    choice -= 1
    return song_list[choice]['url'], song_list[choice]['name']


if __name__ == '__main__':
    url = 'https://www.starmusiq.fun/search/search-for-blocked-movies-starmusiq.html'
    query = input('Enter album name: ')
    search_result = requests.get(url, params={'query': query})
    soup = bs4(search_result.text, 'html.parser')
    albums_container = soup.find("div", {"id": "search_albums"})
    search_result_links = albums_container.findAll(
        'a', {'class': 'label label-danger'})
    choice = display_results(search_result_links)
    while choice[0] != 1 and choice[0] != 2:
        choice = display_results(search_result_links)
    if choice[0] == 1:
        url = choice[1]
        album_page = requests.get(url)
        soup = bs4(album_page.text, 'html.parser')
        links = soup.findAll(
            'a', {
                'style':
                'background:#cb413f;color:#fff;line-height:39px;padding:8px 6px;text-decoration:'
                + 'none;border-radius:.25em; font-weight:700;'
コード例 #56
0
#pp = pprint.PrettyPrinter(indent=4).pprint

chrome_options = Options()
#chrome_options.add_extension("proxy.zip")
chrome_options.add_argument("--incognito")
driver = webdriver.Chrome(executable_path='chromedriver.exe',
                          chrome_options=chrome_options)

url = 'https://xxxxxx'
#url2 = 'https://xxxxxx'
url3 = 'https://xxxxxx/Extracted/'

driver.get(url)
driver.get(url3)
page = driver.page_source
soup = bs4(page, 'html.parser')

parsesoup = re.findall(
    r"(.*href)*=\"(.*)(mkv).*(.mkv\")(.*)(\d{4}-[a-zA-Z]{3}-\d{2} \d{2}:\d{2})",
    page)

tables = soup.findChildren('table')
tables1 = tables[0]
rows = tables1.findChildren(['tr'])
dates = []

for row in rows:
    cells = row.findChildren('td')
    for cell in cells:
        value = cell.string
        dates.append(value)
コード例 #57
0
def addnewtemplate():
    def templatelookup():
        business = str(session['business'])
        con = sqlite.connect('db/db1.db')
        with con:
            cur = con.cursor()
            cur.execute('PRAGMA key = ' + dbkey + ';')
            templatelist = []
            for row in cur.execute(
                    'select name from templates where business = (?) or shared = 1;',
                (business, )):
                templatelist.append(row[:][0])
            con.close()
        return templatelist

    Path("./templates/businesses/" + str(session['business'])).mkdir(
        parents=True, exist_ok=True)

    if request.method == "POST":
        print(request.form.get('templateview'))
        if str(request.form.get('templateview')) != 'None':
            searchtemplates = templatelookup()
            print(searchtemplates)
            print(request.form.get('templateview'))
            templateview = request.form.get('templateview')
            if templateview == 'prototype2':
                templateview = '/templates/prototype2.html'
                searchtemplates = templatelookup()
                return render_template('addtemplate.html',
                                       searchtemplates=searchtemplates,
                                       templateview=templateview)
            else:
                templatecustom = 'businesses+^+' + session[
                    'business'] + '+^+' + templateview + '.html'
                searchtemplates = templatelookup()
                return render_template('addtemplate.html',
                                       searchtemplates=searchtemplates,
                                       templatecustom=templatecustom)

        if request.form.get('editordata') != None:
            try:
                savehtml = request.form.get('editordata')
                soup = bs4(savehtml, 'html.parser')
                for a in soup.findAll('a'):
                    a['href'] = "replacelink"
                    a['data-saferedirecturl'] = 'replacelink'
                savehtml = str(soup)
                savehtmlnam = str(request.form.get('templatename'))
                savehtmlnam = savehtmlnam.replace(' ', '_')
                savehtmlname = savehtmlnam + '.html'
                templatesubject = request.form.get('templatesubject')
                if os.path.isfile('./templates/businesses/' +
                                  session['business'] + '/' + savehtmlname):
                    flash('A template with this name already exists',
                          'category2')
                    return render_template("addtemplate.html",
                                           searchtemplates=searchtemplates)
                else:
                    with open(
                            './templates/businesses/' + session['business'] +
                            '/' + savehtmlname, 'w') as f:
                        f.write(savehtml)
                    con = sqlite.connect('db/db1.db')
                    with con:
                        cur = con.cursor()
                        cur.execute('PRAGMA key = ' + dbkey + ';')
                        cur.execute(
                            'insert into templates (business, name, emailsubject) VALUES (?,?,?);',
                            (session['business'], savehtmlnam,
                             templatesubject))
                        con.commit()
                    con.close()
                    flash('Submitted!', 'category2')
                    return render_template("addtemplate.html",
                                           searchtemplates=searchtemplates)
            except:
                searchtemplates = templatelookup()

        if request.form.get('selecttemplate') != 'Templates':
            if request.form.get('selecttemplate') != None:
                selecttemplate = request.form.get('selecttemplate')
                if selecttemplate == 'prototype2':
                    flash('No deleting default templates', 'category2')
                else:
                    con = sqlite.connect('db/db1.db')
                    with con:
                        cur = con.cursor()
                        cur.execute('PRAGMA key = ' + dbkey + ';')
                        cur.execute(
                            'delete from templates where business LIKE (?) and name LIKE (?);',
                            (
                                session['business'],
                                selecttemplate,
                            ))
                    con.close()
                    os.remove('./templates/businesses/' + session['business'] +
                              '/' + selecttemplate + '.html')
                    flash('Deleted!', 'category2')

    searchtemplates = templatelookup()
    print(searchtemplates)

    return render_template("addtemplate.html", searchtemplates=searchtemplates)
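
The core of the editordata branch above is the link-neutralising pass; isolated, and with the parser named explicitly, it is only a few lines (the HTML here is a made-up sample):

from bs4 import BeautifulSoup as bs4

raw_html = '<p>Hi <a href="https://example.com" data-saferedirecturl="x">link</a></p>'

soup = bs4(raw_html, 'html.parser')
for a in soup.findAll('a'):
    # Neutralise outbound links before the template is stored, as the route does.
    a['href'] = 'replacelink'
    a['data-saferedirecturl'] = 'replacelink'

print(str(soup))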
コード例 #58
0
def download_one(xkcd_dict, xkcd_num):
    if not xkcd_dict:
        return None

    xkcd_number = str(xkcd_num)
    if xkcd_number in excludeList:
        downloadImage = False
        print('{num} is special. It does not have an image.'.format(
            num=xkcd_number))
        '''
        [2] Some comics are special and either don't have an image or have a dynamic one.
            The full list is the array excludeList and needs to be manually updated upon the release
            of such a comic.
        '''
    else:
        downloadImage = True
    if xkcd_number in xkcd_dict:
        date = xkcd_dict[xkcd_number]['date-published']
        description = xkcd_dict[xkcd_number]['description']

        new_description = sanitize_description(description)

        new_folder = '{current_directory}/xkcd_archive/{name}'.format(
            current_directory=WORKING_DIRECTORY, name=xkcd_number)

        to_download_single = "{base}/{xkcd_num}/".format(base=BASE_URL,
                                                         xkcd_num=xkcd_number)
        print(
            "Downloading xkcd from '{img_url}' and storing it under '{path}'".
            format(img_url=to_download_single, path=new_folder))
        alt = requests.get(to_download_single + 'info.0.json').json()['alt']
        if os.path.exists(new_folder):
            print("xkcd  number '{num}' has already been downloaded!".format(
                num=xkcd_number))
        else:
            os.makedirs(new_folder)
            os.chdir(new_folder)
            with open('description.txt', 'w') as f:
                content = """title : {description}
date-published: {date}
url: {url}
alt: {altText} \n""".format(description=description,
                            date=date,
                            url=to_download_single,
                            altText=alt)
                f.write(content)

            image_page = requests.get(to_download_single, stream=True)
            if downloadImage:
                if image_page.status_code == 200:
                    image_page_content = image_page.content
                    image_page_content_soup = bs4(image_page_content,
                                                  'html.parser')

                    for data in image_page_content_soup.find_all(
                            "div", {"id": "comic"}):
                        for img_tag in data.find_all('img'):
                            img_link = img_tag.get('src')

                    complete_img_url = "http:{url}".format(url=img_link)

                    file_name = "{description}.jpg".format(
                        description=new_description)
                    r = requests.get(complete_img_url, stream=True)
                    if r.status_code == 200:
                        with open(file_name, 'wb') as f:
                            r.raw.decode_content = True
                            shutil.copyfileobj(r.raw, f)
                    else:
                        print("Error with connectivity. HTTP error {}".format(
                            r.status_code))
                    magic_response = str(magic.from_file(file_name, mime=True))
                    if 'png' in magic_response:
                        os.rename(
                            file_name, "{description}.png".format(
                                description=new_description))
                    elif 'jpeg' in magic_response:
                        os.rename(
                            file_name, "{description}.jpeg".format(
                                description=new_description))
                    elif 'gif' in magic_response:
                        os.rename(
                            file_name, "{description}.gif".format(
                                description=new_description))

    else:
        print("{} does not exist! Please try with a different option".format(
            xkcd_number))
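
The image-extraction step in the middle of the function above can be exercised on its own; a minimal sketch that fetches one comic page and pulls the #comic image URL (assuming the usual requests import, and using the public xkcd site as the base):

import requests
from bs4 import BeautifulSoup as bs4

comic_url = 'https://xkcd.com/353/'   # any existing comic number works
soup = bs4(requests.get(comic_url).content, 'html.parser')

img_link = None
for div in soup.find_all('div', {'id': 'comic'}):
    for img_tag in div.find_all('img'):
        img_link = img_tag.get('src')

if img_link:
    print('image url:', 'https:' + img_link)   # src is protocol-relative
else:
    print('no image found (some comics are interactive-only)')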
コード例 #59
0
def truePeopleSearch():
    
    print("\n STEP 2: Webscraping")
    
    # create url variable of web address
    url='https://www.truepeoplesearch.com/'
    
    # prepare df for webscrape
    
    print('\n Starting truepeoplesearch.com')
    
    #zone = [', AL',', AR',', GA',', LA',', MS',', NC',', TN',', WV']
    
    # could possibly add an area that would encompass possible results
    #area = ['AL','AR','GA','LA','MS','NC','TN','WV']
    
    # define dedup function to dedup lists
    def dedup(seq):
        """
        removes duplicate values from a list or 
        duplicate characters from a string 
        """
        if type(seq) == list:
            seen = set()
            seen_add = seen.add
            return [x for x in seq if not (x in seen or seen_add(x))]    
        elif type(seq) == str:
            seen = set()
            seen_add = seen.add
            return ''.join([x for x in seq if not (x in seen or seen_add(x))])    
        else:
            print("Currently function can only handle lists and strings")
    
    # create empty dictionary and list  
    # PS: hardest webscrape I have ever done; the captcha problem makes it even worse
    fullDict = {}
    nameList = []
    for key, value in beneDict.items(): 
        nameDict = {}
        counter = -1
        if value[1] not in nameList:
            print('Searching TruePeopleSearch for %s'%  value[1])
            nameList.append(value[1])
            splitName = '%20'.join(value[1].split())
        #    for k, kvalue in enumerate(replaceList)
            cleanName = value[1]
            first = value
        #    splitName = 'john%20smith'
    #        print(splitName)
            flag = 'Y'
            pageCount = 0
            while flag == 'Y':
    #            print(flag)
                pageCount +=1
                tempURL = url+'results?name='+splitName+'&citystatezip=%s&page=%s' % (value[2], pageCount)
                pageContent = requests.get(tempURL).content
        #        time.sleep(1)
    #            print(tempURL)
                if 'captchasubmit?returnUrl' in str(pageContent): 
    #                print('first')    
                    while 'captchasubmit?returnUrl' in str(pageContent):
                    
                        print('captcha found')
                        def afterCaptcha():
                            global captcha, pageContent
                            pageContent = requests.get(tempURL).content
                            print("Program will continue until another captcha is requested")
                            captcha.destroy()
                            
                        def quitTkinterBox():
                            global captcha
                            captcha.destroy()
                            print("raise SystemExit")
                        
                        captcha = Tk()
                        captcha.lift()
                        
                        captcha.attributes('-topmost',True)
                        
                        captcha.after_idle(captcha.attributes,'-topmost',False)
        
    #                    messagebox.showinfo("*** Warning ***", "The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n  solve the captcha")
                        
                        Label(captcha, text="*** Warning ***, The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n solve the captcha. When you have finished, \n come back to this window and please press continue.").grid(row=0, sticky = W)
                        
                        Button(captcha, text='Continue', command=afterCaptcha).grid(row=4, column=1, sticky=W, pady=1)
                        
                        Button(captcha, text='Quit Program and Exit', command= quitTkinterBox).grid(row=4, column=2, sticky=W, pady=4)
                    
                        captcha.mainloop()  
                        
                    print("Continuing")
                    pageContent = requests.get(tempURL).content
        #                time.sleep(1)
                    soup = bs4(pageContent, "html.parser")
    #                print(soup)
                    linkList = []
                    diffList = []
                    if str(soup).find('btnNextPage') == -1:
                        flag = 'N'
                    for card in soup.find_all(attrs= {'class':'card card-block shadow-form card-summary'}):
                        if str(value[5]) in card.text: 
                            for h4 in card.find_all(attrs= {'class':'h4'}):
    #                            print(value[1])
    #                            print(h4.text.strip().upper())
                                result = difflib.SequenceMatcher(None, value[1], h4.text.strip().upper()).ratio()
    #                            print(result)
                            for a in card.find_all('a'):
                                if 'name' in a['href']:
                                    if 'page' not in a['href']:
                                        if a['href'] not in linkList:
        #                                    if result > .5:
                                                diffList.append(result)
                                                linkList.append(a['href'])
                        else:
                            None
                        
                            
                else:
                    soup = bs4(pageContent, "html.parser")
    #                print(soup)
                    linkList = []
                    diffList = []
                    if str(soup).find('btnNextPage') == -1:
                        flag = 'N'
                    for card in soup.find_all(attrs= {'class':'card card-block shadow-form card-summary'}):
                        if str(value[5]) in card.text: 
                            for h4 in card.find_all(attrs= {'class':'h4'}):
    #                            print(value[1])
    #                            print(h4.text.strip().upper())
                                result = difflib.SequenceMatcher(None, value[1], h4.text.strip().upper()).ratio()
    #                            print(result)
                            for a in card.find_all('a'):
                                if 'name' in a['href']:
                                    if 'page' not in a['href']:
                                        if a['href'] not in linkList:
        #                                    if result > .5:
                                                diffList.append(result)
                                                linkList.append(a['href'])
                        else:
                            None
        
                for i, ivalue in enumerate(linkList):
                    counter += 1
                    infoDict = {}
                    tempURL = url+linkList[i]
                    pageContent2 = requests.get(tempURL).content
        #            time.sleep(1)
                    if 'captchasubmit?returnUrl' in str(pageContent2): 
    #                    print('first')
                        while 'captchasubmit?returnUrl' in str(pageContent2):
                        
                            print('captcha found')
                            def afterCaptcha():
                                global captcha, pageContent2
                                pageContent2 = requests.get(tempURL).content
        #                        time.sleep(1)
                #                print("Failed to solve captcha. Ending program. Please try again.")
                                print("Program will continue until another captcha is requested")
                                captcha.destroy()
                                
                            def quitTkinterBox():
                                global captcha
                                captcha.destroy()
                                print("raise SystemExit")
                            
                            captcha = Tk()
                            captcha.lift()
                            
                            captcha.attributes('-topmost',True)
                            
                            captcha.after_idle(captcha.attributes,'-topmost',False)
            
    #                        messagebox.showinfo("*** Warning ***", "The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n  solve the captcha")
                        
                            Label(captcha, text="*** Warning ***, The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n solve the captcha. When you have finished, \n come back to this window and please press continue.").grid(row=0, sticky = W)
                        
                            
                            Button(captcha, text='Continue', command=afterCaptcha).grid(row=4, column=1, sticky=W, pady=1)
                            
                        #    Button(master, text='Run', command=importNums).grid(row=4, column=1, sticky=W, pady=1)
                            Button(captcha, text='Quit Program and Exit', command= quitTkinterBox).grid(row=4, column=2, sticky=W, pady=4)
                        
                            captcha.mainloop()  
                            
                        print("Continuing")
                        soup = bs4(pageContent2, "html.parser")
                        phoneList = []
    #                    print(ivalue)
                        infoDict['name'] = soup.find(attrs= {'class','h2'}).text.strip()
                        infoDict['age'] = soup.find(attrs= {'class','content-value'}).text.strip()
                        infoDict['address'] = soup.find(attrs= {'link-to-more','link-to-more'}).text.strip()
                        infoDict['match'] = value[0]
                        infoDict['origFullName'] = value[1]
                        for a in soup.find_all('a'):
                            if 'phoneno' in a['href']:
                                phone = a['href'][a['href'].find('=')+1:]
                                if phone not in phoneList:
                                    phoneList.append(phone)
                        infoDict['phone'] = phoneList
                        infoDict['source'] = 'TPS'
                        infoDict['diff'] = diffList[i]
    
        
                    else:
                        
                        soup = bs4(pageContent2, "html.parser")
                        phoneList = []
    #                    print(ivalue)
                        infoDict['name'] = soup.find(attrs= {'class','h2'}).text.strip()
                        infoDict['age'] = soup.find(attrs= {'class','content-value'}).text.strip()
                        infoDict['address'] = soup.find(attrs= {'link-to-more','link-to-more'}).text.strip()
                        infoDict['match'] = value[0]
                        infoDict['origFullName'] = value[1]
    
                        for a in soup.find_all('a'):
                            if 'phoneno' in a['href']:
                                phone = a['href'][a['href'].find('=')+1:]
                                if phone not in phoneList:
                                    phoneList.append(phone)
                        infoDict['phone'] = phoneList
                        infoDict['source'] = 'TPS'
                        infoDict['diff'] = diffList[i]
                    nameDict[counter] = infoDict
                fullDict[value[0]] = nameDict 
        else:
            None
    return fullDict
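
The recurring captcha handling above boils down to one pattern: detect the captcha marker in the response and block until a human has cleared it, then re-fetch. A simplified sketch of that idea, using a plain input() pause instead of the Tkinter dialog the original builds:

import requests

def fetch_with_captcha_pause(url):
    """Fetch url, pausing for manual captcha solving whenever the site asks for one."""
    content = requests.get(url).content
    while 'captchasubmit?returnUrl' in str(content):
        print('captcha found - solve it in a browser at https://www.truepeoplesearch.com/')
        input('press Enter to retry once the captcha is cleared...')
        content = requests.get(url).content
    return content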
コード例 #60
0
 driver = webdriver.PhantomJS(executable_path=r'S:\DA_work_files\DA_Work_Python\phantomjs-2.1.1-windows\bin\phantomjs.exe')
 driver.set_window_size(1124, 850) # set browser size.
 # use driver to get url
 driver.get(url)
 
 #Find the search box and input the name
 nameInput = driver.find_element_by_id('fullName')
 nameInput.send_keys(name)
 
 # click on search button
 submit = driver.find_element_by_css_selector('button.btn.btn-lg.btn-block').click()
 
  
 #get current page
 page_content = requests.get(driver.current_url).content
 soup = bs4(page_content, "html.parser")
 
 
 try:
     #finding the number or total results
     for line in soup.find_all("span",class_="ThatsThem-results-preheader"):
         results1 = ''.join(line.find_all(text=True))
         
     results=results1[7:10]
     results=int(results.strip())
     
     # Grab name, Address, Phone number, Email for each result
     name=[]
     address1=[]
     address2=[]
     address3=[]