Code Example #1
File: scraper.py Project: slarrain/MachineLearning
def data_per_case (single_case):
    year = single_case['citation']['year']
    docket = single_case['docket_number']
    facts = single_case['facts_of_the_case']
    question = single_case['question']

    f = bs4(facts).text.replace('\n', '').replace('\xa0', '')
    q = bs4(question).text.replace('\n', '').replace('\xa0', '')

    return [year, docket, f, q]
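
Throughout the examples on this page, bs4 is called directly on HTML markup, so it is presumably an alias for the BeautifulSoup class rather than the bs4 module itself. A minimal sketch of the assumed import and a trivial call (the alias and the sample markup are assumptions for illustration, not taken from any of the projects shown here):

from bs4 import BeautifulSoup as bs4

html = "<html><body><p>Hello, world</p></body></html>"
soup = bs4(html, "html.parser")  # parse the markup with the stdlib parser
print(soup.p.text)               # prints: Hello, world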
Code Example #2
File: scrapelr.py Project: tled/dumplr
 def __SetupPage(self):
     """Inits the bs4 property"""
     page = bs4(self.__FetchPage(self.url),HTML_PARSER) 
     # put iframes into the page object, some blog themes require this
     for frame in page.find_all('iframe'):
         try:
             frame_content = bs4(self.__FetchPage(frame['src']),HTML_PARSER)
             frame.replace_with(frame_content.body)
         except KeyError:
             pass
     self.page = page
Code Example #3
File: scrape.py Project: bensw/etrescrape
def main():

	now = str(datetime.datetime.now())
	conn = sqlite3.connect(DIR + "scrape.db")
	conn.row_factory=sqlite3.Row
	
	c = conn.cursor()
	changes = {}
	new_beers = []
	items = []

	# Add beers to items from each url
	for url in URLS:
		soup = bs4(urllib2.urlopen(url).read())
		print "Found %s Items..." % (total_items(soup))
		items += get_items(soup)

		# See if there are multiple pages		
		page = 2
		while (int(total_items(soup)) > len(items)):
			items += get_items(bs4(urllib2.urlopen(url + "&sort=20a&page=%d" % page ).read()))
			page += 1

	# Loop over beers found
	for item in items:
		# See if the beer exists in the database
		entry = c.execute("SELECT * FROM beers WHERE name = ?", [item['name']]).fetchall()
		if (len(entry) == 0): # If it doesn't insert it into the data base 
			c.execute("INSERT INTO beers (last_updated, name, qty, price, etreId) VALUES (?, ?, ?, ?, ?)", [now, item['name'], item['qty'], item['price'], item['etreId']])
			new_beers.append({"name":item['name'], "qty":item['qty'], "price":item['price'], "etreId":item['etreId']})
			# print "New beer found! name: %s qty: %d price: %f" % (item['name'], item['qty'], item['price'])
		elif (len(entry) == 1): # If it does exist
			e = entry[0]
			# Loop over the keys that are important (not id, time, etreId)
			#print e.keys()
			for key in e.keys()[1:-2]: 
				if e[key] != item[key]:
					if item['name'] in changes.keys():
						changes[item['name']][key] = [str(e[key]), str(item[key])]
					else:
						changes[item['name']] = {key:[str(e[key]), str(item[key])], 'etreId': item['etreId']}
			c.execute("UPDATE beers SET name=?, qty=?, price=?, last_updated=?, etreId=? WHERE id = ?", [item['name'], item['qty'], item['price'], now, item['etreId'], entry[0][0]])
		

	# Rendering
	#print changes, new_beers
	render(changes, new_beers)

	# Commit and close the db cursor
	conn.commit()
	conn.close()
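
Many of the examples here, including Example #3 above, call bs4(...) without naming a parser, which leaves BeautifulSoup to pick whichever parser happens to be installed and, in recent versions, to emit a "no parser was explicitly specified" warning. A minimal sketch of the same fetch-and-parse step with the parser named explicitly; the URL and variable names are illustrative only:

import requests
from bs4 import BeautifulSoup as bs4

resp = requests.get("http://example.com")  # placeholder URL
# Naming the parser ("html.parser", "lxml", or "html5lib") keeps the
# parse tree consistent across machines and silences the warning.
soup = bs4(resp.text, "html.parser")
print(soup.title.text if soup.title else "no <title> found")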
Code Example #4
File: scriptunio.py Project: jotacor/tradunio
def get_new_players():
    ''' Gets the league's signings and sales '''
    session = requests.session()
    session.get('http://stats.comunio.es/transfers.php', headers=headers)
    soup = bs4(session.get('http://stats.comunio.es/transfers.php', headers=headers).content)
    new_members = True
    for table in soup.find_all('table', {'class': 'rangliste'}):
        if new_members:
            for row in table.find_all('tr', re.compile(r"r[1-2]"))[1:]:
                nuna = re.search("([0-9]+)-(.*)", row.find('a', {'class': 'nowrap'})['href'])
                number = nuna.group(1)
                name = nuna.group(2).replace("+", " ").strip()
                club = row.find('td', {'class': 'clubPic'}).a['href']
                club_id = re.search("([0-9]+)-(.*)", club).group(1)
                club_name = re.search("([0-9]+)-(.*)", club).group(2).replace("+", " ")
                position = _position_translation(row.contents[6].text)
                db.commit_query('INSERT IGNORE INTO players (idp, name, position, idcl) VALUES (%s, "%s", %s, %s)' % (
                number, name, position, club_id))
                get_all_prices(name, incremental=True)
                print 'Alta jugador %s (%s) en el club %s (%s) como %s (%s)' % (
                name, number, club_name, club_id, row.contents[6].text, position)
            new_members = False
        else:
            for row in table.find_all('tr', re.compile(r"r[1-2]"))[1:]:
                nuna = re.search("([0-9]+)-(.*)", row.find('a', {'class': 'nowrap'})['href'])
                number = nuna.group(1)
                name = nuna.group(2).replace("+", " ").strip()
                club = row.find('td', {'class': 'clubPic'}).a['href']
                club_id = re.search("([0-9]+)-(.*)", club).group(1)
                club_name = re.search("([0-9]+)-(.*)", club).group(2).replace("+", " ")
                db.commit_query('UPDATE players SET idcl=NULL WHERE idp=%s' % (number))
                print 'Baja jugador %s (%s) del club %s (%s)' % (name, number, club_name, club_id)
Code Example #5
 def __format_bidding(self, bidding):
     """Convert bidding data to properly formatted HTML table."""
     log.getLogger('b_format').debug('formatting bidding: %s', bidding)
     bid_match = re.compile(r'(\d)([SHDCN])')
     html_output = bs4('<table>', 'lxml')
     header_row = html_output.new_tag('tr')
     html_output.table.append(header_row)
     for direction in self.__directions:
         header_cell = html_output.new_tag('th')
         header_cell.string = direction
         header_row.append(header_cell)
     for bid_round in bidding:
         bidding_row = html_output.new_tag('tr')
         html_output.table.append(bidding_row)
         for bid in bid_round:
             bid_cell = html_output.new_tag('td')
             call_match = re.match(bid_match, bid)
             if call_match:
                 bid_cell.append(call_match.group(1))
                 bid_icon = html_output.new_tag(
                     'img', src='images/' + call_match.group(2) + '.gif')
                 bid_cell.append(bid_icon)
             else:
                 if bid == 'SkipBid':
                     bid = '( - )'
                 bid_cell.append(bid)
             bidding_row.append(bid_cell)
         log.getLogger('b_format').debug('%5s' * 4, *bid_round)
     return html_output.table.prettify()
Code Example #6
    def query(self, query):
        result = {
            'result': [],
        }

        http = ModuleManager.call_module_method(
            'http_lib',
            'get',
            'https://encrypted.google.com/search?q=%s&num=10000' % quote(query)
        )

        if not 'html' in http:
            result['error'] = 'No server responce'
            return result

        soup = bs4(http['html'])

        for title in soup.find_all('h3', attrs={'class': 'r'}):
            element = title.find('a')
            # parse google url
            g_url = urlparse(element['href'])
            g_args = parse_qs(g_url.query)
            url = g_args['q'][0]

            result['result'].append({
                'title': element.text,
                'url': url,
            })

        return result
Code Example #7
	def post(self, request):
		symbols = request.POST['symbols'].split(',')
		for value in range(len(symbols)):
			symbols[value] = symbols[value].strip()
		roc = []
		pe_ratio = []
		for value in symbols:
			value = value.upper()
			# result = r.get(self.lookup_url + value).json()
			url = r.get("http://174.129.18.141/companies/" + value + "/pe_ratio")
			soup = bs4(url.text)
			pe_text = soup.select('span#pgNameVal')
			pe_text = pe_text[0].text
			pe_text = re.split('\s+', pe_text)
			for i in range(1):
				pe_ratio.append(float(pe_text[i]))
			url = r.get("http://www.gurufocus.com/term/ROC_JOEL/" + value + "/Return%252Bon%252BCapital%252B%252B-%252BJoel%252BGreenblatt/")
			soup = bs4(url.text)
			for div in soup.select('.data_value'):
				roc.append(float(div.get_text()[:-19]))
		magic_dict = {}
		counter = -1
		for value in symbols:
			counter+=1
			magic_dict[value] = {"magic number":roc[counter]-pe_ratio[counter]}
		print(magic_dict)
		return JsonResponse({'magic_dict':magic_dict})
Code Example #8
File: kou.py Project: madhatter1605/fun
    def do_grades(self, ln):
        if self.is_login:
            try:
                self.response['grades'] = self.session.get(GRADE_URL)
                current_term = bs4(self.response['grades'].text, 'lxml') \
                    .find('option', selected=True)['value']

                self.request['Donem'] = current_term
                self.request['Ara'] = 'Listele'
                self.response['grades'] = self.session.post(GRADE_URL,
                                                            data=self.request)

                table = bs4(self.response['grades'].text, 'lxml') \
                    .find_all('table',
                              'table table-bordered '
                              'table-condensed')[0]

                parsed_table = table.findChildren(['th', 'tr'])
                out_table = PrettyTable(
                    ['DERS', 'AKTS', 'VIZE', 'FIN', 'BÜT', 'BN'])

                for i in range(1, len(parsed_table)):
                    a = [str(n.text).strip()
                         for n in parsed_table[i].findChildren('td')]
                    a = [a[2].split('(')[0]] + a[3:8]
                    out_table.add_row(a)
                print(out_table)
            except:
                print(colored(">>>> Notlar işlenirken hata oluştu", 'red'))
        else:
            self.do_login(ln)
            self.do_grades(ln)
Code Example #9
File: module_steamsale.py Project: jmbjr/pyfibot
def command_price(bot, user, channel, args):
    """.price [Steam game name] - Find whether a currently on-sale Steam game has ever been on sale for cheaper"""
    search = args.replace(" ","+")
#     req = urllib2.Request("http://store.steampowered.com/search/?term=%s&category1=998" % search, headers={"User-Agent":"Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
#     db = bs4(urllib2.urlopen(req).read())
    db = bs4(urllib.urlopen("http://store.steampowered.com/search/?term=%s&category1=998" % search))
    row = db.find(class_ = "search_result_row")
    appid = row['data-ds-appid']
    xml = requests.get("http://steamsales.rhekua.com/xml/sales/app_%s.xml" % appid)
    pricehist = bs4(xml.text)
    
    name = row.find("span", {'class': 'title'}).string
    price = row.find(class_ = "search_price").text
    price = price[price.rfind('$'):]

    current = float(price[1:])
    
    lowest = current
    date = "never"
    for entry in pricehist.find_all('set'):
        price = float(entry['value'])
        if price < lowest:
            lowest = price
            date = entry['name']
        elif price == lowest:
            if not date == "never":
                date = entry['name']
    
    if lowest == current:
        bot.say(channel, name + " has never been cheaper than the $" + str(current) + " it is right now!")
    else:
        bot.say(channel, name + " is $" + str(current) + " now, but was $" + str(lowest) + " on " + date)
Code Example #10
File: crawl.py Project: pakoo/shantie
def get_all_city_info():
    r = requests.get('http://www.pm25.in/rank')
    soup = bs4(r.content)
    citys = soup.find_all('tr')[1:]
    today = datetime.datetime.now()
    now = datetime.datetime(today.year,today.month,today.day,today.hour)
    for c in citys:
        td = c.find_all('td')
        print 
        data = {'name':td[1].a.string,'name_py':td[1].a['href'][1:]}
        data['rank'] = td[0].string
        data['AQI'] = td[2].string
        data['level'] = td[3].string
        data['PM25'] = td[5].string
        data['PM10'] = td[6].string
        data['CO'] = td[7].string
        data['SO2'] = td[11].string
        data['time'] = now
        print data
        mdb.con['air'].pmcn.update(
                    {'name':data['name']},
                    {'$set':{
                    'AQI':data['AQI'],
                    'PM25':data['PM25'],
                    'PM10':data['PM10'],
                    'CO':data['CO'],
                    'SO2':data['SO2'],
                    'rank':data['rank'],
                    'level':data['level'],
                    'time':data['time']
                    }
                    },
                    upsert=True)
Code Example #11
def get_image_link_and_title():
    try:
        page = urllib2.urlopen(
            "http://photography.nationalgeographic.com/photography/photo-of-the-day/")
    except urllib2.URLError:
        # Looks like it didn't work, just return from the function 
        # and try again at the next interval
        print "there was an error opening the url"
        return None, None
    soup = bs4(page)
    try:
        link = soup.find('div', class_='download_link').find('a').attrs['href']
    except AttributeError:
        #looks like there wasn't a download link. 
        # just grab the low-res image instead
        try:
            link = soup.find('div', class_='primary_photo').find('img').attrs['src']
        except AttributeError:
            # couldn't find the right div, 
            # try again next interval (not a robust solution)
            print "there was an error parsing the html"
            return None, None


    # now, prepend http onto the url
    link = 'http:{0}'.format(link)
    title = soup.find('div', id='page_head').find('h1').contents[0].encode('ascii', 'ignore')
    return link, title
Code Example #12
File: get_data.py Project: herch0/avito_data
def get_details_apprt(url):
	r_details = requests.get(url)
	html = bs4(r_details.text, 'html.parser')
	titre = re.sub('\s+', ' ', html.select('.page-header.mbm')[0].text)
	prix = html.select('.vi-price-label .amount')
	if len(prix) == 0:
		return None
	prix = prix[0]['title']
	aside_infos = html.select('aside.panel.panel-body.panel-info')[0].text
	m = re.match(r'.*?Nombre de pièces: ([0-9]+).*?', aside_infos, re.DOTALL)
	if m:
		nb_pieces = m.group(1)

	m = re.match('.*?Surface: (.*? m²).*?', aside_infos, re.DOTALL)
	if m:
		surface = m.group(1)
	m = re.match(r'.*?Secteur: ([\w_-]+).*?', aside_infos, re.DOTALL)
	if m:
		secteur = m.group(1)
	m = re.match(r".*?Adresse: ([\w'_\s-]+).*?Type", aside_infos, re.DOTALL)
	if m:
		adresse = re.sub(r"\s+", " ", m.group(1))

	date = html.select('.date.dtstart.value')[0]['title']
	annee_mois = extract_date_infos(date)
	annee = annee_mois[0]
	mois = annee_mois[1]
	annonce_details = {'titre': titre, 'prix': prix, 'nb_pieces': nb_pieces, 'surface': surface, 'secteur': secteur, 'adresse': adresse, 'annee': annee, 'mois': mois}
	return annonce_details
Code Example #13
File: findfeed.py Project: alabarga/SocialLearning
def findfeed(site):
    raw = requests.get(site).text
    result = []
    possible_feeds = []
    html = bs4(raw)
    feed_urls = html.findAll("link", rel="alternate")
    if len(feed_urls) > 1:
        for f in feed_urls:
            t = f.get("type",None)
            if t:
                if "rss" in t or "xml" in t:
                    href = f.get("href",None)
                    if href:
                        possible_feeds.append(href)
    parsed_url = urllib.parse.urlparse(site)
    base = parsed_url.scheme+"://"+parsed_url.hostname
    atags = html.findAll("a")
    for a in atags:
        href = a.get("href",None)
        if href:
            if "xml" in href or "rss" in href or "feed" in href:
                possible_feeds.append(base+href)
    for url in list(set(possible_feeds)):
        f = feedparser.parse(url)
        if len(f.entries) > 0:
            if url not in result:
                result.append(url)
    return(result)
Code Example #14
def categorieIgnore(url, ignoreCategories):
    page = bs4(get(url).text, "lxml")
    categories = page.select("#content ul")[1].select("a")
    for categorie in categories:
        if categorie.get_text().split("/")[0] in ignoreCategories:
            return True
    return False
Code Example #15
def get_area(url):
    base_url =  'https://www.govtrack.us'

    resp = requests.get(url)
    soup = bs4(resp.text, 'lxml')
    div = soup.find('div', {'id': 'vote_explainer'})
    links = div.findAll('a')
    if len(links) == 0:
        return "Nomination" 

    for a in links:
        #print(a)
        if is_absolute(a['href']):
            #print('is absolute')
            next_url = a['href']
        else:
            #print('relative')
            next_url = base_url + a['href']
        #print("current url: %s\nnext url: " %url, next_url)
        if 'members' in next_url:
            continue
        elif 'govtrack.us' in next_url:
            return get_from(next_url)
        else:
            continue
    raise Exception("Couldnt find link to follow!", url)
Code Example #16
File: reference.py Project: jamesgarfield/MongoBot
    def ud(self):
        if not self.values:
            self.chat("Whatchu wanna know, bitch?")
            return

        try:
            request = pageopen('http://www.urbandictionary.com/define.php',
                               params={'term': ' '.join(self.values)})
            soup = bs4(request.text)
        except:
            self.chat("parse error")
            return

        elem = soup.find('div', {'class': 'meaning'})

        try:
            defn = []
            for string in elem.stripped_strings:
                defn.append(string)
        except:
            self.chat("couldn't find anything")


        if defn:
            # Unfortunately, BeautifulSoup doesn't parse hexadecimal HTML
            # entities like &#x27; so use the parser for any stray entities.
            for paragraph in defn:
                wrapped = textwrap.wrap(paragraph, 200)
                for line in wrapped:
                    self.chat(unescape(line))
        else:
            self.chat("couldn't find anything")
Code Example #17
File: movie.py Project: kannibalox/PTPAPI
 def load_html_data(self):
     """Scrape all data from a movie's HTML page"""
     soup = bs4(session.base_get("torrents.php", params={'id': self.ID}).text, "html.parser")
     self.data['Cover'] = soup.find('img', class_='sidebar-cover-image')['src']
     # Title and Year
     match = re.match(r'(.*) \[(\d{4})\]', soup.find('h2', class_='page__title').encode_contents())
     self.data['Title'] = match.group(1)
     self.data['Year'] = match.group(2)
     # Genre tags
     self.data['Tags'] = []
     for tagbox in soup.find_all('div', class_="box_tags"):
         for tag in tagbox.find_all("li"):
             self.data['Tags'].append(tag.find('a').string)
     self.data['Directors'] = []
     for director in soup.find('h2', class_='page__title').find_all('a', class_='artist-info-link'):
         self.data['Directors'].append({'Name': director.string})
     # File list & trumpability
     for tor in self['Torrents']:
         # Get file list
         filediv = soup.find("div", id="files_%s" % tor.ID)
         tor.data['Filelist'] = {}
         basepath = re.match(r'\/(.*)\/', filediv.find("thead").find_all("div")[1].get_text()).group(1)
         for elem in filediv.find("tbody").find_all("tr"):
             bytesize = elem("td")[1]("span")[0]['title'].replace(",", "").replace(' bytes', '')
             filepath = os.path.join(basepath, elem("td")[0].string)
             tor.data['Filelist'][filepath] = bytesize
         # Check if trumpable
         if soup.find(id="trumpable_%s" % tor.ID):
             tor.data['Trumpable'] = [s.get_text() for s in soup.find(id="trumpable_%s" % tor.ID).find_all('span')]
         else:
             tor.data['Trumpable'] = []
Code Example #18
File: module_giantbomb.py Project: jmbjr/pyfibot
def command_gb(bot, user, channel, args):
    """.gb upcoming - Returns any posted upcoming items at GiantBomb.com (it's a website about video games)"""
    global videos
    if args:
        cmds = args.split()
        subcommand = cmds[0]
        if subcommand == "ql":
            bot.say(channel, "Latest QL: %s" % videos['ql'])
        elif subcommand == "feature":
            bot.say(channel, "Latest Feature: %s" % videos['feature'])
        elif subcommand == "sub":
            bot.say(channel, "Latest Subscriber Content: %s" % videos['sub'])
        elif subcommand == "article":
            bot.say(channel, "Latest Article: %s" % videos['article'])
        elif subcommand == "review":
            bot.say(channel, "Latest Review: %s" % videos['review'])
        elif subcommand == "bombastica":
            bot.say(channel, "Latest Bombastica: %s" % videos['bombastica'])
        elif subcommand == "upcoming":
            page = bs4(urllib.urlopen("http://www.giantbomb.com/"))
            upcoming = page.find("dl", {"class": "promo-upcoming"})
            slots = upcoming.find_all("dd")
            if len(slots) == 0:
                bot.say(channel, "No items on the upcoming list! Alert @GiantBombStats!")
            else:
                if len(cmds) > 1 and cmds[1] == "nopat":
                    before = len(slots)
                    slots = [slot for slot in slots if not str(slot.find("h4").text).__contains__("Scoops")]
                    bot.say(channel, "NOPAT MODE ACTIVATED - %s ITEMS ELIMINATED" % (before - len(slots)))
                bot.say(channel, "%d Upcoming Items (times in EST):" % len(slots))
                for slot in slots:
                    text = slot.find("h4").text
                    time = slot.find("p").text
                    bot.say(channel, "%s - %s" % (text, time))
Code Example #19
 def extract_mosque(self, mosque, page):
     """Extract Mosque."""
     fields = [field.attname.replace('_id', '')
               for field in Mosque._meta.fields
               if field.attname != 'id']
     mosque_text = re.sub(u"(\u2018|\u2019)", "'", mosque.text)
     mosque_link = '/'.join([root_url, mosque.find('a').get('href').split('../')[-1]])
     log_text = '\nWriting {} to file, from page {}'
     self.stdout.write(log_text.format(mosque_text, page))
     mosque_page = bs4(requests.get(mosque_link).content, 'html.parser')
     rows_selector = '#mosque_info_contents table table tr'
     mosque_info_rows = mosque_page.select(rows_selector)
     values = {}
     # page is a giant table, so go over the rows
     for row in mosque_info_rows:
         cells = row.find_all('td')
         # check we have the right fields
         try:
             key = cells[0].text.replace(':', '').lower().strip().replace(' ', '_')
         except (IndexError, AttributeError):
             import pdb; pdb.set_trace()
             # if there is no key or 'replace' attribute, we probably don't want it
             continue
         if len(cells) == 2 and key in fields:
             values[key] = cells[1].text
     name_address = mosque_page.select('#results h1')
     matches = re.match(r'(?P<name>[^(]*)\(', name_address[0].text)
     values['name'] = matches.group('name').strip()
     values['rating'] = mosque_page.select('.star_rating strong')[0].text
     values['mdpk'] = mosque_link.split('/')[-1]
     self.stdout.write(str(set(values.keys()) ^ set(fields)))
Code Example #20
File: main.py Project: jataggart/xkcd-dl
def update_dict():
    """
    getting the info from the archive page. url="http://xkcd.com/archive/" 
    """
    archive_page = requests.get(ARCHIVE_URL)
    if archive_page.status_code == 200:
        page_content = archive_page.content
        archive_soup = bs4(page_content, "html.parser")

        ## now get all the <a> tags under the div '<div class="box" id="middleContainer">' from the soup object
        for data in archive_soup.find_all("div", {"id": "middleContainer"}):
            ## this gets all the contents inside "<div class="box" id="middleContainer">"
            ## now to get the individual links
            for alinks in data.find_all("a"):  ## tries to get all the <a> tags from the 'data' object
                href = alinks.get("href").strip("/")  ## the href stored is in form of eg: "/3/". So make it of form "3"
                date = alinks.get("title")
                description = alinks.contents[0]
                make_keyvalue_list(href, date, description)

        with open(xkcd_dict_filename, "w") as f:
            json.dump(XKCD_DICT, f)
            print(
                "XKCD link database updated\nStored it in '{file}'. You can start downloading your XKCD's!\nRun 'xkcd-dl --help' for more options".format(
                    file=xkcd_dict_filename
                )
            )

    else:
        print("Something bad happened!")
Code Example #21
def getAllLaunches():
  launchEntry = 0
  tableDepth = 0
  launchEvents = list()
  global listCount
  try:
    # Grab the entire page
    launchCalHandle = urllib.request.urlopen('http://www.spaceflightinsider.com/launch-schedule/')
    launchCalHtml = launchCalHandle.read()
    soup = bs4(launchCalHtml, 'html.parser')
    # Cleanup the Launch Entries as a string with consistent spacing, allows
    # better modularization of the script.
    for launchEvent in soup.body.find_all(launch_table)[1:]:
      # Increment the list counter
      listCount += 1
      launchFields = list()
      launchString = re.sub(' +', ' ', launchEvent.prettify().replace('\n', ' ').replace('\r', ''))
      # print(launchString)
      # Get the launchID
      launchFields.append(launchString.split('"launchcalendar" id="')[1].split('"> <tr>')[0].strip())
      # Get the date, bypass non-hard-scheduled launches
      launchFields.append(launchString.split('</span> <span>')[1].split(' </span>')[0].strip())
      if(
      not('Jan' in launchFields[-1]) and not('Feb' in launchFields[-1]) and
      not('Mar' in launchFields[-1]) and not('Apr' in launchFields[-1]) and
      not('May' in launchFields[-1]) and not('Jun' in launchFields[-1]) and
      not('Jul' in launchFields[-1]) and not('Aug' in launchFields[-1]) and
      not('Sep' in launchFields[-1]) and not('Oct' in launchFields[-1]) and
      not('Nov' in launchFields[-1]) and not('Dec' in launchFields[-1])):
        continue
      # Get the time, bypass non-hard-scheduled launches
      if("Time" in launchString):
        if("TBD" in launchString.split('<th> Time </th> <td>')[1].split(' </td>')[0].strip()):
          continue
        else:
          tempTime = splitTimeFields(launchString.split('<th> Time </th> <td>')[1].split(' </td>')[0].strip())
          for timeField in tempTime:
            launchFields.append(timeField)
      else:
        continue
      # Get the Location
      launchFields.append(launchString.split('<th> Location </th> <td>')[1].split('</td>')[0].strip())
      # Get the Satellite
      launchFields.append(launchString.split('<th colspan="2">')[1].split('</th>')[0].strip())
      # Get the Launch Vehicle
      if("<wbr>" in launchString.split('<br/>')[1].split('</td>')[0].strip()):
        launchFields.append(re.sub(' </wbr>', '', re.sub(' <wbr> ', '', launchString.split('<br/>')[1].split('</td>')[0].strip())))
      else:
        launchFields.append(launchString.split('<br/>')[1].split('</td>')[0].strip())
      # Get the description
      launchFields.append(launchString.split('"description" colspan="2"> <p>')[1].split('</p>')[0].strip())
      # Convert Stored Data to writeEvent()
      writeEvent(convertLaunchData(launchFields))
  except urllib.error.HTTPError:
    print("There was an error accessing the Space Flight Insider Launch Schedule.")
    print("The server could be down or having issues. Try again.")
  except urllib.error.URLError:
    print("There was an error decoding the URL for the Space Flight Insider Launch Schedule. :::nodename not known :::")
    print("Check that your computer has access to the Internet.")
Code Example #22
File: api.py Project: kannibalox/PTPAPI
 def contest_leaders(self):
     """Get data on who's winning"""
     LOGGER.debug("Fetching contest leaderboard")
     soup = bs4(session.base_get("contestleaders.php").content, "html.parser")
     ret_array = []
     for cell in soup.find('table', class_='table--panel-like').find('tbody').find_all('tr'):
         ret_array.append((cell.find_all('td')[1].get_text(), cell.find_all('td')[2].get_text()))
     return ret_array
Code Example #23
File: d99.py Project: wenLiangcan/d99
 def _get_volumes_0_1(self, html):
     soup = bs4(html, 'lxml')
     lis = soup.select_one('.vol > .bl').find_all('li')
     vols = [
         (li.a.text, 'http://{}{}'.format(self.domain, li.a.attrs['href']))
         for li in lis
     ]
     return self._sort_vol_by_title(vols)
Code Example #24
def categorie_ignore(url, ignore_categories):
    """ ignores defined categories of wallappers  """
    page = bs4(get(url).text, "lxml")
    categories = page.select('#content ul')[1].select('a')
    for categorie in categories:
        if categorie.get_text().split('/')[0] in ignore_categories:
            return True
    return False
Code Example #25
File: api.py Project: kannibalox/PTPAPI
 def log(self):
     """Gets the PTP log"""
     soup = bs4(session.base_get('/log.php').content, "html.parser")
     ret_array = []
     for message in soup.find('table').find('tbody').find_all('tr'):
         ret_array.append((message.find('span', class_='time')['title'],
                           message.find('span', class_='log__message').get_text().lstrip().encode('UTF-8')))
     return ret_array
Code Example #26
File: app.py Project: eunicekokor/devfest16
def check_review(query):
  results = []
  soup = bs4(requests.get(query).text, 'html.parser')
  soup = soup.findAll('p')
  for paragraph in soup:
    paragraph = str(paragraph).replace('<p>','')
    paragraph = str(paragraph).replace('</p>','')
  return soup
Code Example #27
File: DMiner.py Project: revizor1/DMiner
def MainSearch(keyword, base="http://seeker.dice.com", tail="/jobsearch/servlet/JobSearch?op=100&NUM_PER_PAGE=5&FREE_TEXT="):
    """
    Get job listings from main keyword search and returns bs
    """
    url = base + tail + keyword
    resp = urllib.request.urlopen(url)
    soup = bs4(resp.read(), from_encoding=resp.info().get_param('charset'))
    return soup
Code Example #28
File: dumpJson.py Project: spk921/scrapers
def getCareerHistory(url):
    req = re.get(url)
    soup = bs4(req.text, 'html.parser')
    careerHistory        = soup.findAll("div", { "class" : "markets_module bio_career" })
    corporateInformation = soup.findAll("div", { "class" : "markets_module corporate_info" })
    memberShips          = soup.findAll("div", { "class" : "markets_module bio_membership" })

    return str(careerHistory), str(corporateInformation), str(memberShips)
Code Example #29
File: arconai.py Project: josefson/dotfiles
 def shows(self):
     """Read shows into list of Shows."""
     response = requests.get(self.base_url)
     source = response.text
     soup = bs4(source, "html.parser")
     show_tags = soup.select("a[title]")
     show_tags = [tag for tag in show_tags if tag.has_attr("class") is False]
     shows = [Show(tag["title"], tag["href"]) for tag in show_tags]
     return shows
Code Example #30
def html_to_text(html):
	soup = bs4(html)
	b = soup.find(id="bodyContent")
	if b:
		#wikipedia page
		return b.get_text()
	else:
		#fall back onto just grabbing all text
		return soup.get_text()
Code Example #31
    def stats(self):
        """
        Return all stats associated with a user

        :rtype: A dictionary of stat names and their values, both in string format.
        """
        soup = bs4(
            session.base_get('user.php', params={
                'id': self.ID
            }).text, "lxml")
        stats = {}
        for li in soup.find('span', text='Stats').parent.parent.find_all('li'):
            stat, value = self.__parse_stat(li.text)
            stats[stat] = value
        for li in soup.find('span',
                            text='Personal').parent.parent.find_all('li'):
            stat, value = self.__parse_stat(li.text)
            if value:
                stats[stat] = value
        for li in soup.find('span',
                            text='Community').parent.parent.find_all('li'):
            stat, value = self.__parse_stat(li.text)
            if stat == "Uploaded":
                match = re.search(r'(.*) \((.*)\)', value)
                stats["UploadedTorrentsWithDeleted"] = match.group(1)
                value = match.group(2)
                stat = "UploadedTorrents"
            elif stat == "Downloaded":
                stat = "DownloadedTorrents"
            elif stat == "SnatchesFromUploads":
                match = re.search(r'(.*) \((.*)\)', value)
                stats["SnatchesFromUploadsWithDeleted"] = match.group(1)
                value = match.group(2)
            elif stat == "AverageSeedTime(Active)":
                stat = "AverageSeedTimeActive"
            stats[stat] = value
        return stats
Code Example #32
    def getContent(self, ARTICLE_List, record):
        newsLists = []
        articleIDList = []
        driver = webdriver.PhantomJS()
        for articleURL in ARTICLE_List:
            if articleURL in record:
                continue
            sys.stdout.write('\r             ' + ' ' * 65)
            sys.stdout.write('\r    URL: ' + articleURL[:69])
            t.sleep(random.randint(5, 8))
            r = driver.get(articleURL)
            pageSource = driver.page_source
            soup = bs4(pageSource, 'html.parser')
            news = soup.find(class_='main-container')
            content = ""
            title = str(news.find('p').text)
            time = re.split('/', news.find(class_='date-display-single').text)
            datetime = '/'.join(time[:3]) + ' 00:00'
            article = news.find(
                class_='node node-post node-promoted clearfix').findAll('p')

            #filter fault news
            if t.strftime('%Y/%m/%d', t.localtime()) not in datetime:
                continue
            else:
                pass

            for contents in article:
                content += contents.text

            articleID = ''.join(time) + '0000000'
            while articleID in articleIDList:
                articleID = str(int(articleID) + 1)
            articleIDList.append(articleID)
            articleID = 'cld' + articleID
            newsLists.append([articleID, articleURL, title, datetime, content])
        return newsLists
Code Example #33
    def APOPND_Ovencoffee_crawler(self):
        ContentList = []

        for hundreds in range(10):
            for tens in range(10):
                for units in range(10):
                    page = str(hundreds) + str(tens) + str(units)
                    url = "http://www.ovencoffee.com.tw/store_list.asp?storeid=" + page
                    '''Set up a retry/return mechanism, since multiple connections may be created'''
                    res = requests.Session()
                    res.keep_alive = False
                    retry = Retry(connect=5, backoff_factor=0.5)
                    adapter = HTTPAdapter(max_retries=retry)
                    res.mount('https://', adapter)
                    res = res.get(url, headers=headers)
                    res.encoding = ("utf-8")

                    soup = bs4(res.text, 'html.parser')
                    # print(soup)
                    data = soup.findAll('p')

                    if data[1].text != "" and data[1].text is not None:
                        name = data[1].text
                        phone = re.split(':', data[2].text)[1]
                        business_time = str.join('',
                                                 re.split(':', data[3].text))
                        ContentList.append([name, phone, business_time])

        Filename = 'APOPND_Ovencoffee.csv'  # change this as needed
        storage_dir = "data/csv/"  # change this as needed

        df = pd.DataFrame(data=ContentList,
                          columns=['Name', 'Phone', 'Business_time'])
        df.to_csv(storage_dir + Filename,
                  sep=',',
                  encoding='utf_8_sig',
                  index=False)
Code Example #34
File: sisal.py Project: simonemastella/SureBetFinder
def scrap(link):
    try:    
        session = HTMLSession()
        with session.get(link) as res:
            res.html.render() 
            soup = bs4(res.html.html, 'html5lib')
            risultato=pd.DataFrame(columns=['tipo','casoF','casoV'])
            tags= soup.findAll("div",{"class":"TabellaEsitiRow-hvzh1w-0 jxwiSe"})
            esdop=tags[0].findAll("div",{"class":"EsitoButton-mp5c0x-0 dQZBRx"})
            new_row = {'tipo':"1X-2", 'casoF':esdop[3].getText(), 'casoV':esdop[2].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"12-X", 'casoF':esdop[5].getText(), 'casoV':esdop[1].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"2X-1", 'casoF':esdop[4].getText(), 'casoV':esdop[0].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            gng=tags[1].findAll("div",{"class":"EsitoButton-mp5c0x-0 dQZBRx"})
            new_row = {'tipo':"GOL/NOGOL", 'casoF':gng[1].getText(), 'casoV':gng[0].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            over=tags[2].findAll("div",{"class":"TabellaColumn-nrcwsc-0 iJTAjk"})
            under=tags[3].findAll("div",{"class":"TabellaColumn-nrcwsc-0 iJTAjk"})
            new_row = {'tipo':"UNDER/OVER 0.5", 'casoF':under[0].getText(), 'casoV':over[0].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"UNDER/OVER 1.5", 'casoF':under[1].getText(), 'casoV':over[1].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"UNDER/OVER 2.5", 'casoF':under[2].getText(), 'casoV':over[2].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"UNDER/OVER 3.5", 'casoF':under[3].getText(), 'casoV':over[3].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"UNDER/OVER 4.5", 'casoF':under[4].getText(), 'casoV':over[4].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"UNDER/OVER 5.5", 'casoF':under[5].getText(), 'casoV':over[5].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
        return risultato
    except:
        session.close()
        print("Errore nella ricerca DATI su SISAL, cerco di nuovo")
        return scrap(link) 
Code Example #35
File: sisal.py Project: simonemastella/SureBetFinder
def scrapCampionato(num):
    campionato=["https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:21",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:22",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:18",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:153",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:86",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:1",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:79",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:137",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:4",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:3",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:14",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:15",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:29",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:30",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:54",
    "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:190"]
    # Italy, Champions and Europa League, England, Spain, Germany, France, Netherlands, Portugal
    risultato=pd.DataFrame(columns=['giorno','ora','match','link'])
    session = HTMLSession()
    with session.get(campionato[num]) as res:
        res.html.render() 
        soup = bs4(res.html.html, 'html5lib')
        partite= (soup.findAll("div",{"TabellaEsitiRow-hvzh1w-0 eyTFpO"}))
        for partita in partite:
            match=partita.find("span",{"class":"AvvenimentoDescription-rieyuj-0 clFosV"}).getText().strip()
            dataora=partita.find("span",{"class":"AvvenimentoDate-giucxs-0 iaSisn"}).getText().strip().split(" ")
            link ="https://www.sisal.it"+partita.find("a",{"class":"AvvenimentoDetailWrapper-w9f4wf-0 bhgtKE"}).get("href")
            ora=dataora[2]
            data=dataora[0].split("/")
            new_row = {'giorno':data[0], 'ora':ora, 'match':match,'link':link}
            risultato = risultato.append(new_row, ignore_index=True)
    if len(risultato)!=0:
        return risultato
    else:
        print("SISAL RIPROVO")
        return scrapCampionato(num)
Code Example #36
File: data.py Project: kirillmasanov/tg_price_bot
def get_data(html):
    item_list = []
    soup = bs4(html, 'lxml')
    positions = soup.find_all(
        'div',
        class_=
        'products-view-block js-products-view-block products-view-block-static'
    )
    for position in positions:
        pos_article = position.find('div', class_='col-xs-8 align-right').text
        pos_name = position.find('span', class_='products-view-name-link').text
        pos_price = position.find('div', class_='price-number').text
        data = {
            'article': norm_article(pos_article),
            'name': norm_name(pos_name),
            'price': norm_price(pos_price)
        }
        # print(f'Артикул: {norm_article(pos_article)}')
        # print(f'Наименование: {norm_name(pos_name)}')
        # print(f'Цена: {norm_price(pos_price)} руб.')
        # print('=' * 20)
        # print(data)
        item_list.append(data)
    return item_list
Code Example #37
File: functions.py Project: lunbon/rocketbot
def get_ranks_by_nikname(platform, nikname):
    first_role = 'Unranked (2vs2)'
    second_role = 'Unranked (3vs3)'
    try:
        response = requests.get(url % (platform, nikname))
        html = response.text
        soup = bs4(html, 'html.parser')
        for tab in soup.find_all('table'):
            if 'Playlist' in str(tab):
                table = tab
                break
        playTable = table
        trs = playTable.find_all('tr')
        for tr in trs[1:]:
            if 'Ranked Doubles 2v2' in str(tr.find_all('td')[1]):
                r2v2 = tr.small
                first_role = (str(r2v2).split('\n')[1] + ' (2vs2)').strip()
            if 'Ranked Standard 3v3' in str(tr.find_all('td')[1]):
                r3v3 = tr.small
                second_role = (str(r3v3).split('\n')[1] + ' (3vs3)').strip()

        return (first_role, second_role)
    except:
        return False
Code Example #38
def insta(username, gecko_path):
    """
    Download images from instagram
    """

    link = URL + username
    print("Downloading images {}...".format(username))

    with Browser("firefox", headless=True,
                 executable_path=gecko_path) as browser:
        browser.visit(link)
        html = browser.html
        soup = bs4(html, "html.parser")

    data = soup.findAll("img")

    for x in data:
        x = x.get("src")
        os.system(
            f"wget --no-check-certificate -c -N -P Images/{username} {x}")
        print("Downloaded {}".format(x))

    def rename_image_dir(foldername):
        i = 1
        dirName = os.path.join("./Images/", foldername)
        path = os.getcwd() + dirName
        for filename in os.listdir(dirName):
            if not filename.endswith(".jpg"):
                os.rename(
                    os.path.join(path, filename),
                    os.path.join(path, foldername + '_' + str(i) + ".jpg"))
            i += 1

    rename_image_dir(username)

    print("\nFiles downloaded into Images/{}".format(username))
Code Example #39
File: hospital_spider.py Project: danyuzhen/python
def get_baike_text(hospital_list,urls):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'baike.baidu.com',
        "Referer": "https://baike.baidu.com/",
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    }
    for i in range(len(urls)):
        html = req.get(urls[i], headers=headers).content.decode('utf8')
        soup=bs4(html,'lxml')
        des = soup.find('div', {'class': 'lemma-summary'})
        if not des is None:
            des=re.sub('\[[0-9]{1,2}\]', "", des.get_text()).replace('\n','').replace('\xa0','')\
                .replace('"','“').replace("'","‘")
            sql='update hospital set description="'+des+'" where name="'+hospital_list[i]+'"'
            print('剩余'+str(len(urls)-i)+':'+urls[i])
            cursor.execute(sql)
            db.commit()
        else:
            print('未收录:'+urls[i])
            continue
Code Example #40
def scrape_search_results(areas):
    '''
    scrapes search page, collects information about the cars available for sale.
    '''
    results = []  
    search_indices = np.arange(0, 300, 100)
    for area in areas:
        print area
        for i in search_indices:
            url = 'http://sfbay.craigslist.org/search/{0}/cta'.format(area)
            resp = requests.get(url, params={'hasPic': 1, 's': i})
            txt = bs4(resp.text, 'html.parser')
            cars = txt.findAll(attrs={'class': "row"})
            tags=txt.findAll('img')
            img_tags = "\n".join(set(tag['src'] for tag in tags))
            title = [rw.find('a', attrs={'class': 'hdrlnk'}).text
                          for rw in cars]
            links_raw = [rw.find('a', attrs={'class': 'hdrlnk'})['href']
                     for rw in cars]
            links = ['http://sfbay.craigslist.org'+car_link for car_link in links_raw]
            # find the time and the price
            time = [pd.to_datetime(rw.find('time')['datetime']) for rw in cars]
            price = find_prices(cars)

            # create a dataframe to store all the data
            data = np.array([time, price, title, links])
            col_names = ['time', 'price', 'title', 'link']
            df = pd.DataFrame(data.T, columns=col_names)

            # add the location variable to all entries
            df['loc'] = area
            results.append(df)

    # concatenate all the search results
    results = pd.concat(results, axis=0)
    return results
Code Example #41
def main(prefix, docnum):
    # Get the waybill number
    # prefix= '784'
    # docnum = '27400365'
    open_text()
    CZ._data['ctl00$ContentPlaceHolder1$txtPrefix'] = prefix
    CZ._data['ctl00$ContentPlaceHolder1$txtNo'] = docnum
    r = requests.post(CZ.url + CZ.prefix + prefix + CZ.awbno + docnum +
                      CZ.menuid + CZ.lang,
                      data=CZ._data)
    content = bs4(r.content, 'html.parser')
    table = content.find_all('table')
    status = list(table[2].stripped_strings)
    flight = []
    for i in range(len(status)):
        if re.search(r'[\dA-Z]{4,10}', status[i]) and re.search(
                r'[-\d]{8,10}', status[i + 1]):
            d = {
                'air_code': '%s' % status[i],
                '_dep_port': '%s' % status[i - 2],
                '_dest_port': '%s' % status[i - 1],
                'airline_comp': 'CZ',
            }
            flight.append(d)

    status = list(table[4].stripped_strings)
    j = 0
    for i in range(len(status)):
        if 'Cargo has been loaded' in status[i]:
            flight[j].update({'_atd': '%s' % status[i - 3]})
            flight[j].update({'_std': '%s' % status[i - 3]})
        if 'Flight has arrived' in status[i]:
            flight[j].update({'_ata': '%s' % status[i - 3]})
            flight[j].update({'_sta': '%s' % status[i - 3]})
            j += 1
    return flight
Code Example #42
def get_lyrics(artist, song):

    if not isinstance(artist, str):
        raise TypeError("The artist name should be a string")
    if not isinstance(song, str):
        raise TypeError("The song name should be a string")

    artist_name, song_name = _clean_names(artist, song)
    # print(artist_name, song_name)
    url = _create_url(artist_name, song_name)

    try:
        page = _get_page(url)
    except ValueError:
        return []

    soup = bs4(page, "html.parser")
    mydivs = soup.find("div", {"class": "ringtone"})
    lyrics = mydivs.find_next_sibling("div")

    # Use the .stripped_strings generator to remove all extra whitespace
    # and strings consisting only of whitespace
    lyric_list = [text for text in lyrics.stripped_strings]
    return lyric_list
Code Example #43
def scraper(pgs1, pgs2, srt):
    pages = []
    prices = []
    stars = []
    titles = []

    data = {'Title': titles, 'Price': prices, 'Rating': stars}

    for numPgs in range(int(pgs1), int(pgs2) + 1):
        url = (
            'http://books.toscrape.com/catalogue/category/books_1/page-{}.html'
            .format(numPgs))
        pages.append(url)
    for item in pages:
        page = requests.get(item)
        soup = bs4(page.text, 'html.parser')
        for iterA in soup.findAll('h3'):
            ttl = iterA.getText()
            titles.append(ttl)
        for iterB in soup.findAll('p', class_='price_color'):
            price = iterB.getText()
            prices.append(price)
        for iterC in soup.findAll('p', class_='star-rating'):
            for key, value in iterC.attrs.items():
                star = value[1]
                stars.append(star)

    if (srt == "title"):
        titles.sort()
    elif (srt == "price"):
        prices.sort()
    elif (srt == "rating"):
        stars.sort()

    df = pd.DataFrame(data=data)
    return df
Code Example #44
File: snai.py Project: simonemastella/SureBetFinder
def scrap(link):
    try:
        session = HTMLSession()
        with session.get(link) as res:
            res.html.render() 
            soup = bs4(res.html.html, 'html5lib')
            tags= soup.findAll("table",{"class":"table table-bordered table-condensed table-striped table-hover margin-bottom-10 ng-scope"})
            risultato=pd.DataFrame(columns=['tipo','casoF','casoV'])
            esatto=tags[0].findAll("span",{"class":"ng-binding ng-scope"})
            doppia=tags[1].findAll("span",{"class":"ng-binding ng-scope"})
            new_row = {'tipo':"1X-2", 'casoF':doppia[0].getText().strip(), 'casoV':esatto[2].getText().strip()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"12-X", 'casoF':doppia[2].getText().strip(), 'casoV':esatto[1].getText().strip()}
            risultato = risultato.append(new_row, ignore_index=True)
            new_row = {'tipo':"2X-1", 'casoF':doppia[1].getText().strip(), 'casoV':esatto[0].getText().strip()}
            risultato = risultato.append(new_row, ignore_index=True)
            tipo=tags[2].find("div",{"class":"pull-left ng-binding"}).getText().strip()
            t=tags[2].findAll("span",{"class":"ng-binding ng-scope"})
            quotaF=t[1].getText().strip()
            quotaV=t[0].getText().strip()
            new_row = {'tipo':tipo, 'casoF':quotaF, 'casoV':quotaV}
            if "GOL" in tipo:
                risultato = risultato.append(new_row, ignore_index=True)
            for tag in tags[3:12]:
                tipo=tag.find("div",{"class":"pull-left ng-binding"}).getText().strip()
                t=tag.findAll("span",{"class":"ng-binding ng-scope"})
                quotaF=t[0].getText().strip()
                quotaV=t[1].getText().strip()
                new_row = {'tipo':tipo, 'casoF':quotaF, 'casoV':quotaV}
                if "UNDER" in tipo:
                    risultato = risultato.append(new_row, ignore_index=True)
            return risultato
    except:
        print("Errore nella ricerca DATI su SNAI, cerco di nuovo")
        session.close()
        return scrap(link)
Code Example #45
 def __detect_virtual_pairs(self):
     """Auto-detect virtual pairs by their record file header."""
     virtual_pairs = []
     # RegEx for matching pair number and names in pair record header
     pair_header_match = re.compile('([0-9]{1,}): (.*) - (.*), .*')
     for record_file_path in self.__pair_records_files:
         log.getLogger('detect').debug('examining record file %s',
                                       record_file_path)
         with file(record_file_path) as record_file:
             record = bs4(record_file, 'lxml')
         # first <td class="o1"> with content matching
         # pair header is what we're after
         header = [con for con
                   in record.select('td.o1')[0].contents
                   if isinstance(con, NavigableString) and re.search(
                       pair_header_match, con)]
         log.getLogger('detect').debug('detected header: %s', header)
         if len(header):
             header_match = re.match(pair_header_match, header[0])
             pair_number = int(header_match.group(1))
             names = [name for name in [header_match.group(2).strip(),
                                        header_match.group(3).strip()]
                      if len(name)]
             log.getLogger('detect').debug('parsed header: %d, %s',
                                           pair_number, names)
             # virtual pair does not have any names filled
             if len(names) == 0:
                 virtual_pairs.append(pair_number)
     if len(virtual_pairs) == 0:
         log.getLogger('detect').warning('No virtual pairs detected')
     else:
         log.getLogger('detect').info('virtual pairs: %s',
                                      ' '.join(sorted(
                                          [str(pair) for pair
                                           in virtual_pairs])))
     return sorted(virtual_pairs)
Code Example #46
def fill_pair_list_table(cells, row_cell_count=20):
    """Format cell list into well-formed rows, aligned by column count."""
    content = bs4('<table />', 'lxml')
    content.append(content.new_tag('table'))
    # first filler cell of each new row
    first_cell = content.new_tag('td', **{'class': 'n'})
    first_cell.string = u'\xa0'
    # arrange cells into rows, full rows first
    while len(cells) >= row_cell_count:
        new_row = content.new_tag('tr')
        new_row.append(copy.copy(first_cell))
        for cell in cells[0:row_cell_count]:
            new_row.append(cell)
        content.table.append(new_row)
        log.getLogger('rec_list').debug('aligning cells %s to %s in a row',
                                        cells[0].a.contents,
                                        cells[row_cell_count-1].a.contents)
        del cells[0:row_cell_count]
    # last row may or may not be full
    last_row = content.new_tag('tr')
    last_row.append(copy.copy(first_cell))
    for cell in cells:
        last_row.append(cell)
    log.getLogger('rec_list').debug('leaving cells %s to %s in last row',
                                    cells[0].a.contents,
                                    cells[-1].a.contents)
    # if it wasn't full, fill it with a col-spanned last cell
    if len(cells) < row_cell_count:
        last_cell = content.new_tag('td',
                                    colspan=row_cell_count-len(cells))
        last_cell.string = u'\xa0'
        last_row.append(last_cell)
        log.getLogger('rec_list').debug('filling last row with: %s',
                                        last_cell)
    content.table.append(last_row)
    return content.table.contents[:]
Code Example #47
def videoDetail(videoUrl):
    try:
        itemPage = bs4(req.get(videoUrl).text, 'html.parser')
    except Exception as e:
        # print(e)
        print('=====> request failed/ check network connection!')
    choices = [
        i['aria-label'].split(' ')[-1]
        for i in itemPage.select('.menu-list .link a')
    ]
    downloadLinks = {}
    for itemLink in itemPage.select('.menu-list .link a'):
        downloadLinks[itemLink['aria-label'].split(' ')[-1]] = itemLink['href']
    questions = [
        inq.List(
            'quality',
            message="\U0001F914  Select quality",
            choices=choices,
        ),
    ]
    answer = inq.prompt(questions)
    itemPageDownloadLink = downloadLinks[answer['quality']]
    itemTitle = f"{itemPage.select('#videoTitle')[0].text}-{answer['quality']}"
    return itemTitle, itemPageDownloadLink
Code Example #48
    def inbox(self, page=1):
        """Fetch a list of messages from the user's inbox
        Incidentally update the number of messages"""
        soup = bs4(
            session.base_get('inbox.php', params={
                'page': page
            }).text, "html.parser")

        self.new_messages = self.__parse_new_messages(soup)

        for row in soup.find(id="messageformtable").tbody.find_all('tr'):
            yield {
                'Subject':
                row.find_all('td')[1].text.encode('UTF-8').strip(),
                'Sender':
                row.find_all('td')[2].text,
                'Date':
                row.find_all('td')[3].span['title'],
                'ID':
                re.search(r'id=(\d+)',
                          row.find_all('td')[1].a['href']).group(1),
                'Unread':
                True if 'inbox-message--unread' in row['class'] else False
            }
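
Because inbox() is a generator yielding one dict per message row, a caller simply iterates it; a quick sketch, where user stands for an instance of the snippet's class and the module-level session it uses is already authenticated (both assumptions):

# Hypothetical caller: `user` is an instance of the snippet's class.
for message in user.inbox(page=1):
    marker = '*' if message['Unread'] else ' '
    print(marker, message['Date'], message['Sender'], message['Subject'])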
コード例 #49
0
ファイル: stock.py プロジェクト: imtkusd20172018/InsurMan
def getStock(code, query):
    reply = ''
    stock = [[
        'Time', 'Market Price', 'Buy', 'Sell', 'Rise&Fall', 'Volume',
        'Previous Close', 'Open', 'High', 'Low'
    ]]
    driver = webdriver.PhantomJS(
        executable_path=
        'D:\\Anaconda3.6\\Scripts\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe'
    )
    x = code
    r = driver.get('https://tw.stock.yahoo.com/q/q?s=' + x)
    pageSource = driver.page_source
    soup = bs4(pageSource, 'html.parser')
    tables = soup.find_all('table')
    row = tables[5].find_all('td')[0].findAll('tr')[1].findAll('td')[1:-1]
    for item in row:
        row[row.index(item)] = item.text.strip()
    if query == '市價':
        reply = row[1]
    elif query == '買價':
        reply = row[2]
    elif query == '賣價':
        reply = row[3]
    elif query == '成交量':
        reply = row[5]
    elif query == '前一天收盤價':
        reply = row[6]
    elif '開盤' in query:
        reply = row[7]
    elif '最高' in query:
        reply = row[8]
    elif '最低' in query:
        reply = row[9]

    return (code + ' 的' + query + '是 ' + reply + 'ㄛ~~~~<3')
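
The function above presumably sits next to from selenium import webdriver and a BeautifulSoup-as-bs4 import; note that PhantomJS support was removed in Selenium 4, so a headless Chrome or Firefox driver is the usual substitute today. A hedged call, with an illustrative TWSE ticker:

# '2330' is only an illustrative ticker; the query strings mirror the elif chain above.
print(getStock('2330', '市價'))
print(getStock('2330', '成交量'))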
コード例 #50
0
ファイル: parse_tzcs.py プロジェクト: cinit/Xinshou
def parse(html):
    res = {
        '身高测量': -1,
        '体重测量': -1,
        '肺活量': -1,
        '50米跑': -1,
        '立定跳远': -1,
        '1000米跑': -1,
        '800米跑': -1,
        '坐体前屈': -1,
        '仰卧起坐': -1,
        '引体向上': -1,
        '左眼视力': -1,
        '右眼视力': -1
    }
    html = bs4(html, 'html.parser')
    tr = html.find_all('tr')
    for i in tr:
        td = i.find_all('td')
        if not td:
            continue
        if td[0].text in res:
            res[td[0].text] = td[1].text
    return res
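
A small driver for parse() above, fed an inline HTML fragment shaped like the <td>label</td><td>value</td> rows it scans for; the real page layout is not shown in the snippet, so this markup is only an assumption:

sample_html = """
<table>
  <tr><td>身高测量</td><td>175</td></tr>
  <tr><td>体重测量</td><td>65</td></tr>
  <tr><td>肺活量</td><td>4200</td></tr>
</table>
"""

result = parse(sample_html)
print(result['身高测量'], result['肺活量'])  # items missing from the table stay at -1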
コード例 #51
0
    def get_followers(self):

        time.sleep(2)
        # open the followers list on any account:
        # right-click the followers link on the page, inspect it, and copy its CSS selector
        flw_btn = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                '#react-root > section > main > div > header > section > ul > li:nth-child(2) > a'
            )))
        flw_btn.click()
        # When the followers pop-up appears the page structure changes:
        # click the followers button, inspect the pop-up, find the div that gets
        # highlighted while scrolling, and copy that CSS selector
        popup = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'body > div.RnEpo.Yx5HN > div > div.isgrP')))
        for i in range(0, 10):
            time.sleep(1)
            self.driver.execute_script(
                'arguments[0].scrollTop = arguments[0].scrollHeight', popup
            )  # execute JavaScript to scroll the followers pop-up to the bottom so more entries load
        popup = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'body > div.RnEpo.Yx5HN > div > div.isgrP')))
        print('cool')
        b_popup = bs4(popup.get_attribute('innerHTML'), 'html.parser')
        for p in b_popup.findAll('li', {'class': 'woI9H'}):  # every follower entry in the pop-up
            # each li is one follower; its first 'a' tag holds the href of that follower's profile
            print(p.findAll('a')[0]['href'])
            print('Awesome')
        print('end')
コード例 #52
0
ファイル: WorldCDC.py プロジェクト: KKodiac/Covid19_Stats
class WorldCovid:
    def __init__(self):
        self.today = ctime(time())
        self.scrape_url = "https://www.worldometers.info/coronavirus/"
        self.datafile = f"./data/World/new_covid_dat.csv"
        self.appendfile = f"./data/World/world_timeseries/{self.today[4:10]+self.today[-5:]}.csv"
        self.is_updated = path.isfile(self.appendfile)


    def getData(self) -> tuple:  # (thead, tbody)
        page = requests.get(self.scrape_url)
        html = bs4(page.text, 'html.parser')
        table = html.find(id="main_table_countries_today")

        thead_all = table.thead.find_all('th')

        thead = [th.text for th in thead_all]

        tbody_all = table.find_all('tbody')
        tr_temp = [tr for tr in tbody_all[0].find_all('tr')]
        td_temp = [td.find_all('td') for td in tr_temp]
        tbody = [[j.text.strip() for j in i] for i in td_temp]

        return thead, tbody
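
A hedged way to drive the class above and persist the scraped table, assuming requests, bs4, ctime/time and os.path are imported at module level as the constructor implies; the CSV writing below is illustrative and not part of the original class:

import csv

scraper = WorldCovid()
header, rows = scraper.getData()

# Persist the scraped table locally so the sketch does not depend on ./data existing.
with open('worldometers_snapshot.csv', 'w', newline='', encoding='utf-8') as fh:
    writer = csv.writer(fh)
    writer.writerow(header)
    writer.writerows(rows)

print('Saved', len(rows), 'rows scraped on', scraper.today)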
コード例 #53
0
def getblog(value):
    try:
        #getting all the values
        if (value == 'all'):
            x = requests.get('https://www.freecodecamp.org/news/').text
        else:
            # getting the response based on particular tag
            x = requests.get(f"https://www.freecodecamp.org/news/tag/{value}").text
        soup=bs4(x,'lxml')
        hack = soup.find_all('article',class_ = 'post-card')
        # initializing a dictionary
        val={}
        val["dic"]=[]
        for i in range(0,len(hack)):
            data={}
            data["Tag"] = hack[i].find('span',class_='post-card-tags').text.strip(' \t\n\r')
            data["Blog-Title"] = hack[i].find('h2',class_='post-card-title').text.strip(' \t\n\r')
            data["Blog-link"] = hack[i].find('a',class_='post-card-image-link').get('href')
            data["Blog-link"]="https://www.freecodecamp.org"+data["Blog-link"]
            data["Author"] = hack[i].find('a',class_='meta-item').text.strip(' \t\n\r')
            val["dic"].append(data)
        return val
    except Exception as e:
        return {"status":False,"error":e}
コード例 #54
0
    def main(self):
        params = dict(pets_cat=1, max_price=2000)
        rsp = requests.get(self.url, params=params)
        html = bs4(rsp.text, 'html.parser')
        apts = html.find_all('p', attrs={'class': 'row'})
        for apt in apts:
            #    print apt.prettify()
            size = apt.findAll(attrs={'class': 'housing'})[0].text
            sqft, beds = self.find_size_and_bdrms(size)
            self.apartment['sqft'] = sqft
            self.apartment['beds'] = beds
            self.apartment['updated_datetime'] = apt.find('time')['datetime']
            self.apartment['price'] = float(apt.find('span', {'class': 'price'}).text.strip('$'))
            self.apartment['title'] = apt.find('a', attrs={'class': 'hdrlnk'}).text
            self.apartment['url'] = 'h'+self.url.strip('/search/apa') + apt.find('a', attrs={'class': 'hdrlnk'})['href']
            info = self.get_more_info(self.apartment['url'])
            

            for k,v in self.apartment.iteritems():
                print k,v
            print '\n'

            exit()
            time.sleep(1)
コード例 #55
0
        song_list.append(song)
        i += 2
    for i in range(len(song_list)):
        print(str(i + 1) + ': ' + song_list[i]['name'])
    choice = int(input('Enter the song number to download: '))
    # todo: add choice to download entire album also
    # todo: check for invalid input
    choice -= 1
    return song_list[choice]['url'], song_list[choice]['name']


if __name__ == '__main__':
    url = 'https://www.starmusiq.fun/search/search-for-blocked-movies-starmusiq.html'
    query = input('Enter album name: ')
    search_result = requests.get(url, params={'query': query})
    soup = bs4(search_result.text, 'html.parser')
    albums_container = soup.find("div", {"id": "search_albums"})
    search_result_links = albums_container.findAll(
        'a', {'class': 'label label-danger'})
    choice = display_results(search_result_links)
    while choice[0] != 1 and choice[0] != 2:
        choice = display_results(search_result_links)
    if choice[0] == 1:
        url = choice[1]
        album_page = requests.get(url)
        soup = bs4(album_page.text, 'html.parser')
        links = soup.findAll(
            'a', {
                'style':
                'background:#cb413f;color:#fff;line-height:39px;padding:8px 6px;text-decoration:'
                + 'none;border-radius:.25em; font-weight:700;'
コード例 #56
0
#pp = pprint.PrettyPrinter(indent=4).pprint

chrome_options = Options()
#chrome_options.add_extension("proxy.zip")
chrome_options.add_argument("--incognito")
driver = webdriver.Chrome(executable_path='chromedriver.exe',
                          chrome_options=chrome_options)

url = 'https://xxxxxx'
#url2 = 'https://xxxxxx'
url3 = 'https://xxxxxx/Extracted/'

driver.get(url)
driver.get(url3)
page = driver.page_source
soup = bs4(page, 'html.parser')

parsesoup = re.findall(
    r"(.*href)*=\"(.*)(mkv).*(.mkv\")(.*)(\d{4}-[a-zA-Z]{3}-\d{2} \d{2}:\d{2})",
    page)

tables = soup.findChildren('table')
tables1 = tables[0]
rows = tables1.findChildren(['tr'])
dates = []

for row in rows:
    cells = row.findChildren('td')
    for cell in cells:
        value = cell.string
        dates.append(value)
コード例 #57
0
def addnewtemplate():
    def templatelookup():
        business = str(session['business'])
        con = sqlite.connect('db/db1.db')
        with con:
            cur = con.cursor()
            cur.execute('PRAGMA key = ' + dbkey + ';')
            templatelist = []
            for row in cur.execute(
                    'select name from templates where business = (?) or shared = 1;',
                (business, )):
                templatelist.append(row[:][0])
            con.close()
        return templatelist

    Path("./templates/businesses/" + str(session['business'])).mkdir(
        parents=True, exist_ok=True)

    if request.method == "POST":
        print(request.form.get('templateview'))
        if str(request.form.get('templateview')) != 'None':
            searchtemplates = templatelookup()
            print(searchtemplates)
            print(request.form.get('templateview'))
            templateview = request.form.get('templateview')
            if templateview == 'prototype2':
                templateview = '/templates/prototype2.html'
                searchtemplates = templatelookup()
                return render_template('addtemplate.html',
                                       searchtemplates=searchtemplates,
                                       templateview=templateview)
            else:
                templatecustom = 'businesses+^+' + session[
                    'business'] + '+^+' + templateview + '.html'
                searchtemplates = templatelookup()
                return render_template('addtemplate.html',
                                       searchtemplates=searchtemplates,
                                       templatecustom=templatecustom)

        if request.form.get('editordata') != None:
            try:
                savehtml = request.form.get('editordata')
                soup = bs4(savehtml, 'html.parser')
                for a in soup.findAll('a'):
                    a['href'] = "replacelink"
                    a['data-saferedirecturl'] = 'replacelink'
                savehtml = str(soup)
                savehtmlnam = str(request.form.get('templatename'))
                savehtmlnam = savehtmlnam.replace(' ', '_')
                savehtmlname = savehtmlnam + '.html'
                templatesubject = request.form.get('templatesubject')
                if os.path.isfile('./templates/businesses/' +
                                  session['business'] + '/' + savehtmlname):
                    flash('A template with this name already exists',
                          'category2')
                    return render_template("addtemplate.html",
                                           searchtemplates=searchtemplates)
                else:
                    with open(
                            './templates/businesses/' + session['business'] +
                            '/' + savehtmlname, 'w') as f:
                        f.write(savehtml)
                    con = sqlite.connect('db/db1.db')
                    with con:
                        cur = con.cursor()
                        cur.execute('PRAGMA key = ' + dbkey + ';')
                        cur.execute(
                            'insert into templates (business, name, emailsubject) VALUES (?,?,?);',
                            (session['business'], savehtmlnam,
                             templatesubject))
                        con.commit()
                    con.close()
                    flash('Submitted!', 'category2')
                    return render_template("addtemplate.html",
                                           searchtemplates=searchtemplates)
            except:
                searchtemplates = templatelookup()

        if request.form.get('selecttemplate') != 'Templates':
            if request.form.get('selecttemplate') != None:
                selecttemplate = request.form.get('selecttemplate')
                if selecttemplate == 'prototype2':
                    flash('No deleting default templates', 'category2')
                else:
                    con = sqlite.connect('db/db1.db')
                    with con:
                        cur = con.cursor()
                        cur.execute('PRAGMA key = ' + dbkey + ';')
                        cur.execute(
                            'delete from templates where business LIKE (?) and name LIKE (?);',
                            (
                                session['business'],
                                selecttemplate,
                            ))
                    con.close()
                    os.remove('./templates/businesses/' + session['business'] +
                              '/' + selecttemplate + '.html')
                    flash('Deleted!', 'category2')

    searchtemplates = templatelookup()
    print(searchtemplates)

    return render_template("addtemplate.html", searchtemplates=searchtemplates)
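
The core of the editordata branch above is the link-neutralising pass; isolated, and with the parser named explicitly, it is only a few lines (the HTML here is a made-up sample):

from bs4 import BeautifulSoup as bs4

raw_html = '<p>Hi <a href="https://example.com" data-saferedirecturl="x">link</a></p>'

soup = bs4(raw_html, 'html.parser')
for a in soup.findAll('a'):
    # Neutralise outbound links before the template is stored, as the route does.
    a['href'] = 'replacelink'
    a['data-saferedirecturl'] = 'replacelink'

print(str(soup))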
コード例 #58
0
def download_one(xkcd_dict, xkcd_num):
    if not xkcd_dict:
        return None

    xkcd_number = str(xkcd_num)
    if xkcd_number in excludeList:
        downloadImage = False
        print('{num} is special. It does not have an image.'.format(
            num=xkcd_number))
        '''
        [2] Some comics are special and either don't have an image or have a dynamic one.
            The full list is the array excludeList and needs to be manually updated upon the release
            of such a comic.
        '''
    else:
        downloadImage = True
    if xkcd_number in xkcd_dict:
        date = xkcd_dict[xkcd_number]['date-published']
        description = xkcd_dict[xkcd_number]['description']

        new_description = sanitize_description(description)

        new_folder = '{current_directory}/xkcd_archive/{name}'.format(
            current_directory=WORKING_DIRECTORY, name=xkcd_number)

        to_download_single = "{base}/{xkcd_num}/".format(base=BASE_URL,
                                                         xkcd_num=xkcd_number)
        print(
            "Downloading xkcd from '{img_url}' and storing it under '{path}'".
            format(img_url=to_download_single, path=new_folder))
        alt = requests.get(to_download_single + 'info.0.json').json()['alt']
        if os.path.exists(new_folder):
            print("xkcd  number '{num}' has already been downloaded!".format(
                num=xkcd_number))
        else:
            os.makedirs(new_folder)
            os.chdir(new_folder)
            with open('description.txt', 'w') as f:
                content = """title : {description}
date-published: {date}
url: {url}
alt: {altText} \n""".format(description=description,
                            date=date,
                            url=to_download_single,
                            altText=alt)
                f.write(content)

            image_page = requests.get(to_download_single, stream=True)
            if downloadImage:
                if image_page.status_code == 200:
                    image_page_content = image_page.content
                    image_page_content_soup = bs4(image_page_content,
                                                  'html.parser')

                    for data in image_page_content_soup.find_all(
                            "div", {"id": "comic"}):
                        for img_tag in data.find_all('img'):
                            img_link = img_tag.get('src')

                    complete_img_url = "http:{url}".format(url=img_link)

                    file_name = "{description}.jpg".format(
                        description=new_description)
                    r = requests.get(complete_img_url, stream=True)
                    if r.status_code == 200:
                        with open(file_name, 'wb') as f:
                            r.raw.decode_content = True
                            shutil.copyfileobj(r.raw, f)
                    else:
                        print("Error with connectivity. HTTP error {}".format(
                            r.status_code))
                    magic_response = str(magic.from_file(file_name, mime=True))
                    if 'png' in magic_response:
                        os.rename(
                            file_name, "{description}.png".format(
                                description=new_description))
                    elif 'jpeg' in magic_response:
                        os.rename(
                            file_name, "{description}.jpeg".format(
                                description=new_description))
                    elif 'gif' in magic_response:
                        os.rename(
                            file_name, "{description}.gif".format(
                                description=new_description))

    else:
        print("{} does not exist! Please try with a different option".format(
            xkcd_number))
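
The image-extraction step in the middle of the function above can be exercised on its own; a minimal sketch that fetches one comic page and pulls the #comic image URL (assuming the usual requests import, and using the public xkcd site as the base):

import requests
from bs4 import BeautifulSoup as bs4

comic_url = 'https://xkcd.com/353/'   # any existing comic number works
soup = bs4(requests.get(comic_url).content, 'html.parser')

img_link = None
for div in soup.find_all('div', {'id': 'comic'}):
    for img_tag in div.find_all('img'):
        img_link = img_tag.get('src')

if img_link:
    print('image url:', 'https:' + img_link)   # src is protocol-relative
else:
    print('no image found (some comics are interactive-only)')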
コード例 #59
0
def truePeopleSearch():
    
    print("\n STEP 2: Webscraping")
    
    # create url variable of web address
    url='https://www.truepeoplesearch.com/'
    
    # prepare df for webscrape
    
    print('\n Starting truepeoplesearch.com')
    
    #zone = [', AL',', AR',', GA',', LA',', MS',', NC',', TN',', WV']
    
    # could possibly add an area that would encompass possible results
    #area = ['AL','AR','GA','LA','MS','NC','TN','WV']
    
    # define dedup function to dedup lists
    def dedup(seq):
        """
        removes duplicate values from a list or 
        duplicate characters from a string 
        """
        if type(seq) == list:
            seen = set()
            seen_add = seen.add
            return [x for x in seq if not (x in seen or seen_add(x))]    
        elif type(seq) == str:
            seen = set()
            seen_add = seen.add
            return ''.join([x for x in seq if not (x in seen or seen_add(x))])    
        else:
            print("Currently function can only handle lists and strings")
    
    # create empty dictionary and list  
    # PS: hardest webscrape I have ever done; the captcha problem makes it even worse
    fullDict = {}
    nameList = []
    for key, value in beneDict.items(): 
        nameDict = {}
        counter = -1
        if value[1] not in nameList:
            print('Searching TruePeopleSearch for %s'%  value[1])
            nameList.append(value[1])
            splitName = '%20'.join(value[1].split())
        #    for k, kvalue in enumerate(replaceList)
            cleanName = value[1]
            first = value
        #    splitName = 'john%20smith'
    #        print(splitName)
            flag = 'Y'
            pageCount = 0
            while flag == 'Y':
    #            print(flag)
                pageCount +=1
                tempURL = url+'results?name='+splitName+'&citystatezip=%s&page=%s' % (value[2], pageCount)
                pageContent = requests.get(tempURL).content
        #        time.sleep(1)
    #            print(tempURL)
                if 'captchasubmit?returnUrl' in str(pageContent): 
    #                print('first')    
                    while 'captchasubmit?returnUrl' in str(pageContent):
                    
                        print('captcha found')
                        def afterCaptcha():
                            global captcha, pageContent
                            pageContent = requests.get(tempURL).content
                            print("Program will continue until another captcha is requested")
                            captcha.destroy()
                            
                        def quitTkinterBox():
                            global captcha
                            captcha.destroy()
                            print("raise SystemExit")
                        
                        captcha = Tk()
                        captcha.lift()
                        
                        captcha.attributes('-topmost',True)
                        
                        captcha.after_idle(captcha.attributes,'-topmost',False)
        
    #                    messagebox.showinfo("*** Warning ***", "The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n  solve the captcha")
                        
                        Label(captcha, text="*** Warning ***, The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n solve the captcha. When you have finished, \n come back to this window and please press continue.").grid(row=0, sticky = W)
                        
                        Button(captcha, text='Continue', command=afterCaptcha).grid(row=4, column=1, sticky=W, pady=1)
                        
                        Button(captcha, text='Quit Program and Exit', command= quitTkinterBox).grid(row=4, column=2, sticky=W, pady=4)
                    
                        captcha.mainloop()  
                        
                    print("Continuing")
                    pageContent = requests.get(tempURL).content
        #                time.sleep(1)
                    soup = bs4(pageContent, "html.parser")
    #                print(soup)
                    linkList = []
                    diffList = []
                    if str(soup).find('btnNextPage') == -1:
                        flag = 'N'
                    for card in soup.find_all(attrs= {'class':'card card-block shadow-form card-summary'}):
                        if str(value[5]) in card.text: 
                            for h4 in card.find_all(attrs= {'class':'h4'}):
    #                            print(value[1])
    #                            print(h4.text.strip().upper())
                                result = difflib.SequenceMatcher(None, value[1], h4.text.strip().upper()).ratio()
    #                            print(result)
                            for a in card.find_all('a'):
                                if 'name' in a['href']:
                                    if 'page' not in a['href']:
                                        if a['href'] not in linkList:
        #                                    if result > .5:
                                                diffList.append(result)
                                                linkList.append(a['href'])
                        else:
                            None
                        
                            
                else:
                    soup = bs4(pageContent, "html.parser")
    #                print(soup)
                    linkList = []
                    diffList = []
                    if str(soup).find('btnNextPage') == -1:
                        flag = 'N'
                    for card in soup.find_all(attrs= {'class':'card card-block shadow-form card-summary'}):
                        if str(value[5]) in card.text: 
                            for h4 in card.find_all(attrs= {'class':'h4'}):
    #                            print(value[1])
    #                            print(h4.text.strip().upper())
                                result = difflib.SequenceMatcher(None, value[1], h4.text.strip().upper()).ratio()
    #                            print(result)
                            for a in card.find_all('a'):
                                if 'name' in a['href']:
                                    if 'page' not in a['href']:
                                        if a['href'] not in linkList:
        #                                    if result > .5:
                                                diffList.append(result)
                                                linkList.append(a['href'])
                        else:
                            None
        
                for i, ivalue in enumerate(linkList):
                    counter += 1
                    infoDict = {}
                    tempURL = url+linkList[i]
                    pageContent2 = requests.get(tempURL).content
        #            time.sleep(1)
                    if 'captchasubmit?returnUrl' in str(pageContent2): 
    #                    print('first')
                        while 'captchasubmit?returnUrl' in str(pageContent2):
                        
                            print('captcha found')
                            def afterCaptcha():
                                global captcha, pageContent2
                                pageContent2 = requests.get(tempURL).content
        #                        time.sleep(1)
                #                print("Failed to solve captcha. Ending program. Please try again.")
                                print("Program will continue until another captcha is requested")
                                captcha.destroy()
                                
                            def quitTkinterBox():
                                global captcha
                                captcha.destroy()
                                print("raise SystemExit")
                            
                            captcha = Tk()
                            captcha.lift()
                            
                            captcha.attributes('-topmost',True)
                            
                            captcha.after_idle(captcha.attributes,'-topmost',False)
            
    #                        messagebox.showinfo("*** Warning ***", "The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n  solve the captcha")
                        
                            Label(captcha, text="*** Warning ***, The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n solve the captcha. When you have finished, \n come back to this window and please press continue.").grid(row=0, sticky = W)
                        
                            
                            Button(captcha, text='Continue', command=afterCaptcha).grid(row=4, column=1, sticky=W, pady=1)
                            
                        #    Button(master, text='Run', command=importNums).grid(row=4, column=1, sticky=W, pady=1)
                            Button(captcha, text='Quit Program and Exit', command= quitTkinterBox).grid(row=4, column=2, sticky=W, pady=4)
                        
                            captcha.mainloop()  
                            
                        print("Continuing")
                        soup = bs4(pageContent2, "html.parser")
                        phoneList = []
    #                    print(ivalue)
                        infoDict['name'] = soup.find(attrs= {'class','h2'}).text.strip()
                        infoDict['age'] = soup.find(attrs= {'class','content-value'}).text.strip()
                        infoDict['address'] = soup.find(attrs= {'link-to-more','link-to-more'}).text.strip()
                        infoDict['match'] = value[0]
                        infoDict['origFullName'] = value[1]
                        for a in soup.find_all('a'):
                            if 'phoneno' in a['href']:
                                phone = a['href'][a['href'].find('=')+1:]
                                if phone not in phoneList:
                                    phoneList.append(phone)
                        infoDict['phone'] = phoneList
                        infoDict['source'] = 'TPS'
                        infoDict['diff'] = diffList[i]
    
        
                    else:
                        
                        soup = bs4(pageContent2, "html.parser")
                        phoneList = []
    #                    print(ivalue)
                        infoDict['name'] = soup.find(attrs= {'class','h2'}).text.strip()
                        infoDict['age'] = soup.find(attrs= {'class','content-value'}).text.strip()
                        infoDict['address'] = soup.find(attrs= {'link-to-more','link-to-more'}).text.strip()
                        infoDict['match'] = value[0]
                        infoDict['origFullName'] = value[1]
    
                        for a in soup.find_all('a'):
                            if 'phoneno' in a['href']:
                                phone = a['href'][a['href'].find('=')+1:]
                                if phone not in phoneList:
                                    phoneList.append(phone)
                        infoDict['phone'] = phoneList
                        infoDict['source'] = 'TPS'
                        infoDict['diff'] = diffList[i]
                    nameDict[counter] = infoDict
                fullDict[value[0]] = nameDict 
        else:
            None
    return fullDict
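
The recurring captcha handling above boils down to one pattern: detect the captcha marker in the response and block until a human has cleared it, then re-fetch. A simplified sketch of that idea, using a plain input() pause instead of the Tkinter dialog the original builds:

import requests

def fetch_with_captcha_pause(url):
    """Fetch url, pausing for manual captcha solving whenever the site asks for one."""
    content = requests.get(url).content
    while 'captchasubmit?returnUrl' in str(content):
        print('captcha found - solve it in a browser at https://www.truepeoplesearch.com/')
        input('press Enter to retry once the captcha is cleared...')
        content = requests.get(url).content
    return content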
コード例 #60
0
 driver = webdriver.PhantomJS(executable_path=r'S:\DA_work_files\DA_Work_Python\phantomjs-2.1.1-windows\bin\phantomjs.exe')
 driver.set_window_size(1124, 850) # set browser size.
 # use driver to get url
 driver.get(url)
 
 #Find the search box and input the name
 nameInput = driver.find_element_by_id('fullName')
 nameInput.send_keys(name)
 
 # click on search button
 submit = driver.find_element_by_css_selector('button.btn.btn-lg.btn-block').click()
 
  
 #get current page
 page_content = requests.get(driver.current_url).content
 soup = bs4(page_content, "html.parser")
 
 
 try:
     #finding the number or total results
     for line in soup.find_all("span",class_="ThatsThem-results-preheader"):
         results1 = ''.join(line.find_all(text=True))
         
     results=results1[7:10]
     results=int(results.strip())
     
     # Grab name, Address, Phone number, Email for each result
     name=[]
     address1=[]
     address2=[]
     address3=[]