def get_spider(self):
    spider_map = {
        1: KeKeSpider,         # CNN
        2: FoxSpider,          # Fox News
        3: KeKeArticleSpider,  # Article
        4: EN24Spider,         # 24en
    }
    response = requests.get(url=self.url, headers=HEADERS)
    page_type = -1
    if response.status_code == 200:
        html = response.content.decode("utf8")
        title = BeautifulSoup(html, 'lxml').find("title")
        title = str(title)
        if title.count("福克斯") and title.count("新闻"):
            page_type = 2
        elif title.count("CNN") or title.count("cnn"):
            page_type = 1
        elif self.url.count("Article"):
            page_type = 3
        elif title.count("NPR") or title.count("npr"):
            page_type = 1
        elif self.url.startswith("https://www.24en.com/voa"):
            page_type = 4
        print(page_type)
        return spider_map.get(page_type)
    else:
        return None
def countKeywords(page, keywords):
    counts = [0] * len(keywords)
    bodyText = BeautifulSoup(page, "html.parser").get_text()
    bodyText = re.sub('[.!?/(),;\'\"]', '', bodyText)
    wordsInBody = bodyText.split()
    for keyword in keywords:
        if ' ' in keyword:
            # multi-word keyword: count substring occurrences in the full text
            counts[keywords.index(keyword)] = bodyText.count(keyword)
        else:
            # single word: count exact token matches
            counts[keywords.index(keyword)] = wordsInBody.count(keyword)
    return counts
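# A minimal, self-contained sketch of calling countKeywords() above; the sample HTML and
# keyword list are made up for illustration, and re/BeautifulSoup are the same imports the
# function itself relies on.
import re
from bs4 import BeautifulSoup

sample_html = "<p>Climate change drives extreme weather. Climate policy is debated.</p>"
print(countKeywords(sample_html, ["Climate", "extreme weather"]))
# -> [2, 1]: single words are counted per token, multi-word phrases as substrings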
def parse_ria_json(path):
    with open(path, "r", encoding="utf-8") as r:
        # regex-based ria2020 parsing is commented out below
        # pat = '{\"text\": \"(.*)\", \"title\": \"(.*)\"}'
        for line in r:
            data = json.loads(line.strip())
            # data = re.search(pat, line.strip())
            # title = data.group(2).lower().strip()
            # clean_text = data.group(1).lower().replace('\xa0', ' ').replace('\n', ' ').strip()
            title = data["title"]
            text = data["text"]
            clean_text = BeautifulSoup(text, 'html.parser').text.replace(
                '\xa0', ' ').replace('\n', ' ')
            if not clean_text or not title or clean_text.count(
                    ' ') < 3 or title.count(' ') < 3:
                continue
            yield clean_text, title
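# Hedged usage sketch for parse_ria_json(): it writes a single made-up JSON-lines record
# to a temporary file and iterates the generator. Only the "text"/"title" field names are
# taken from the parser above; everything else is illustrative.
import json
import os
import tempfile

record = {"text": "<p>Some&nbsp;news body with more than three words here.</p>",
          "title": "A headline with enough words"}
with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False,
                                 encoding="utf-8") as tmp:
    tmp.write(json.dumps(record) + "\n")
for clean_text, title in parse_ria_json(tmp.name):
    print(title, "->", clean_text)
os.remove(tmp.name)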
def states(self):
    state_select = self.get_state_select()
    state_select_option_values = [
        '%s' % o.get_attribute('value') for o in state_select.options[1:]
    ]
    for v in state_select_option_values:
        state_select = self.select_state_option(v)
        text = BeautifulSoup(self.driver.page_source, "html.parser").get_text()
        meta_prices = []
        for keyword in keywords:
            prices = []
            counter = text.count(keyword)
            for z in range(counter):
                prices.append(text.rsplit(keyword, z + 1)[1].splitlines()[0])
            prices = [float(price) for price in prices]
            meta_prices.append(prices)
        yield (state_select.first_selected_option.text, meta_prices)
def nested(ScrappedPageStruct):
    # 'column' is expected to be defined at module level
    content_bs = BeautifulSoup(ScrappedPageStruct.content, 'html.parser').prettify()
    return content_bs.count(column)
def process_text(text):
    mentions = text.count('@')
    hashtags = text.count('#')
    urls = len(find_urls(text))
    # Remove links
    text = ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())
    # Remove mentions
    text = ' '.join(
        re.sub("(@[A-Za-z0-9^\w]+)", " ",
               text.replace('@ ', '@').replace('# ', '#')).split())
    # Replace hashtags with words
    if text.count('#') > 0:
        text = ' '.join(re.findall('[A-Z][^A-Z]*', text.replace('#', ' ')))
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Save content length (excluding links and mentions)
    length = len(text)
    # Remove punctuation symbols
    text = ' '.join(
        re.sub("[\.\,\¡\¿\!\?\:\;\-\=\*\(\)\[\]\"\'\“\_\+\”\%\/\‘\’]", " ",
               text).split())
    text = text.translate(remove_digits).translate(remove_punctuation)
    # Lower case to avoid case-sensitivity problems
    text = text.lower()
    # Replace emojis with names
    text = emoji.demojize(text)
    # Add space between emojis and other characters
    ind = -2
    for c in range(text.count(':')):
        ind = text.find(':', ind + 2)
        if c % 2 == 0:
            newLetter = ' :'
        else:
            newLetter = ': '
        text = "".join((text[:ind], newLetter, text[ind + 1:]))
    # Replace emoji names with their Spanish meaning
    result = []
    parts = text.split(' ')
    for part in parts:
        if part:
            if part[0] == ':':
                em = handle_emoji_tone(part)
                em = emoji_meaning(em)
                if em:
                    result.append(em)
            else:
                result.append(part)
    text = ' '.join(result)
    # Filter stop words with the NLTK tokenizer and join back into a string
    word_tokens = word_tokenize(text)
    result = [w for w in word_tokens if w not in stop_words]
    text = ' '.join(result)
    # Check if text contains at least a word
    analysis = TextBlob(text)
    try:
        # Sentiment analysis
        eng = analysis.translate(to='en')
        sentiment = eng.sentiment
        polarity = sentiment.polarity
        subjectivity = sentiment.subjectivity
    except Exception:
        polarity = 0.0
        subjectivity = 0.0
    result = {
        'no_hashtags': [hashtags],
        'no_mentions': [mentions],
        'no_urls': [urls],
        'effective_length': [length],
        'polarity': [polarity],
        'subjectivity': [subjectivity]
    }
    return result
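# Hedged usage sketch for process_text(): it only runs inside the original module, since
# the function relies on that module's own helpers and globals (find_urls, stop_words,
# remove_digits, remove_punctuation, handle_emoji_tone, emoji_meaning, ...). The tweet
# text below is made up.
sample_tweet = "Check this out https://example.com @someone #ClimateAction 🌍"
features = process_text(sample_tweet)
print(features['no_hashtags'], features['no_mentions'], features['polarity'])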
    return start  # tail of a find_nth() helper (its definition is not shown here)


for i in range(1, 53):
    pagina = i
    url = "http://www.zimmo.be/nl/panden/?status=1&type%5B0%5D=5&hash=86c443f7d824f3e29fb8377485e15a2c&priceIncludeUnknown=1&priceChangedOnly=0&bedroomsIncludeUnknown=1&bathroomsIncludeUnknown=1&constructionIncludeUnknown=1&livingAreaIncludeUnknown=1&landAreaIncludeUnknown=1&commercialAreaIncludeUnknown=1&yearOfConstructionIncludeUnknown=1&epcIncludeUnknown=1&queryCondition=and&includeNoPhotos=1&includeNoAddress=0&onlyRecent=0&onlyRecentlyUpdated=0&isPlus=0&region=list&district=MzAYBcMGGKICbOIgHkIcVTcVHAAA&pagina={}#gallery".format(
        pagina)
    url1 = url
    request1 = requests.get(url1)
    # request2 = requests.get(url2, cookies=request1.cookies)
    table = BeautifulSoup(request1.text, "html.parser")
    table = str(table)
    # print(table)
    print(table.count("id=\"pand-"))
    stop = table.count("id=\"pand-")
    for x in range(1, stop):
        # print(table)
        # rows = table.findAll('class')[2::3]
        eerste = find_nth(table, "id=\"pand-", x)
        tweede = find_nth(table, "id=\"pand-", x + 1)
        list = table[eerste:tweede]
        # print(list)
        # list = table.find("div", attrs={"class": "item "})
        list = str(list)
        # id
        list1 = list.split('\n', 1)[0]
        id = list1[9:19]
for url in urls2:
    index = urls2.index(url)
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    paraTitle = list(soup.find_all('h3'))
    for paragraph in paraTitle:
        para = str(paragraph.nextSibling.nextSibling)
        paraText = BeautifulSoup(para, "html.parser").get_text()
        paraText = paraText.strip()
        if paraText != "":
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies2[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])
            df = df.append(rowValue, ignore_index=True)

for url in urls3:
    index = urls3.index(url)
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    para = list(soup.find_all('li'))
    for paragraph in para:
        paraText = paragraph.get_text()
        paraText = paraText.strip()
        if paraText != "":
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies3[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])  # completed by analogy with the urls2 loop above
            df = df.append(rowValue, ignore_index=True)
# Run through each row and pick apart the call details
for row in soup.findAll("row"):
    call = row.findAll("cell")
    if len(call) == 6:  # If the row contains probably good call data
        calldate = str(call[0].get_text())
        calltype = str(call[5].get_text())
        address = str(call[2].get_text())
        address = address[0:address.rfind(',')]
        unitids = BeautifulSoup(call[3].get_text(), 'html.parser')
        unitids = str(unitids.get_text())
        unitids = unitids.replace("?", "")
        unitids = unitids.replace("^", "")
        latlong = str(call[4].get_text())
        lat = float(latlong.split(',')[0])
        long = float(latlong.split(',')[1])
        numunits = unitids.count(',') + 1
        callDesc = ("[" + calltype + "] " + address + ": " + unitids +
                    " (" + str(numunits) + ") @ " + calldate +
                    " http://maps.google.com/?q=" + str(lat) + "," + str(long))
        # Store the call. Return type is 0 for old, 1 for new, 2 for an updated call.
        # Footer is a string of dbid & what (if any) changed.
        savedType, savedFooter = storeCall()
        if savedType == 1:  # If it is a new call
            sendTweet(callDesc)
        if savedType > 0:  # If the call is new or updated
            if numunits > 7 or any(word in callDesc for word in COOLCALL):
                # Check if it should email
                sendEmail(savedFooter)
        elif savedType < 0:
            sendEmail("\nstoreCall() ERROR!!")
def get_keyword_frequency(html_string):
    text = BeautifulSoup(str(html_string), "html.parser").text
    if len(text) == 0:
        return 0
    return sum(text.count(kword) for kword in kwlist)
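# Minimal sketch of calling get_keyword_frequency(); kwlist is a module-level list in the
# original snippet, so it is defined explicitly here with made-up keywords.
from bs4 import BeautifulSoup

kwlist = ["python", "scraping"]
html = "<div><p>Python scraping with Python is common.</p></div>"
print(get_keyword_frequency(html))
# str.count() is case-sensitive: "python" matches 0 times, "scraping" once -> prints 1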
Ppos = int(row[3])
perA = row[4].split(" ")
Apos = int(row[5])
Aans = row[6]
perB = row[7].split(" ")
Bpos = int(row[8])
Bans = row[9]
url = row[10]

# Try to fetch the given url; counts from the page are saved in Acount and Bcount.
for trial in range(3):
    try:
        html_content = requests.get(url, timeout=(3, 10)).text
        soup = BeautifulSoup(html_content, "lxml")
        soup = soup.prettify()
        Acount = soup.count(perA[0])
        Bcount = soup.count(perB[0])
        break
    except Exception:
        Acount = 0
        Bcount = 0

# Find all positions of the preposition, nounA, and nounB.
Pord = find_ord(row[1], prep[0])
Aord = find_ord(row[1], perA[0])
Bord = find_ord(row[1], perB[0])

# Build a list of per-word information that can be extracted from the text:
# [word, all positions of the word, index of the previous element to get the wanted word]
target = [prep, Pord, get_nth(row[1], prep[0], Ppos, Pord)]
first = [perA, Aord, get_nth(row[1], perA[0], Apos, Aord)]
busName = dataInJson["results"][k]["name"]
busAdd = dataInJson["results"][k]["formatted_address"]
busId = dataInJson["results"][k]["place_id"]
url2 = urllib.request.urlopen(
    "https://maps.googleapis.com/maps/api/place/details/json?placeid=" +
    busId + '&key=' + api_key)
data2 = url2.read().decode('utf-8')
dataInJson2 = json.loads(data2)
phoneNumber = dataInJson2["result"]
phoneNumber = phoneNumber.get("formatted_phone_number")
street = dataInJson2["result"]
street = street.get('adr_address')
street = BeautifulSoup(street, "html.parser").text
commaCount = street.count(",")
testAddress = re.search("^[0-9]\d", street)
if testAddress and phoneNumber is not None and commaCount == 3:
    street = street.split(", ")
    streetName = street[0]
    City = street[1]
    State = street[2].split(" ")
    StateName = State[0]
    Zip = State[1]
    Zip = str(Zip)[:5]
    if StateName == "NC":
        new_row = {
            "Name": busName,
            "Business Name": busName,
def query(self, q, a):
    t0 = time.time()
    ua = UserAgent()
    # header = {'User-Agent': str(ua.random)}
    s = requests.Session()
    url = 'https://www.google.com/search?q={}'.format(q)
    fpage_list = []
    total_res_list = []
    nlp_list = []
    # webbrowser.open(url)
    front_page = s.get(url)
    front_soup = BeautifulSoup(front_page.text, 'html.parser')
    for script in front_soup(["script", "style"]):
        script.extract()
    front_soup = front_soup.get_text().lower()
    for ans in a:
        # naive question + answer query
        query = '{} "{}"'.format(q, ans)
        url = 'https://www.google.com/search?q={}'.format(query)
        r = s.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        '''
        entities = self.nlp(q)
        nlp_query = " ".join(['"{}"'.format(entities[i].name) for i in range(len(entities))])
        nlp_query += '" {}"'.format(ans)
        url = 'https://www.google.com/search?q={}'.format(nlp_query)
        nlp_r = s.get(url)
        nlp_soup = BeautifulSoup(nlp_r.text, 'html.parser')
        for script in nlp_soup(["script", "style"]):
            script.extract()
        nlp_soup = nlp_soup.get_text().lower()
        '''
        page_count = front_soup.count(ans.lower())
        num_results = soup.find('div', {'id': 'resultStats'}).text
        num_results = [
            el for el in num_results.split(" ") if el[0].isdigit()
        ]
        # num_results = "".join([c for c in num_results if c.isdigit()])
        num_results = 0 if num_results == [] else int(
            num_results[0].replace(",", ""))
        # nlp_count = nlp_soup.count(ans.lower())
        print('{} -- {} -- {}'.format(ans, num_results, page_count))  # nlp_count))
        fpage_list.append(page_count)
        total_res_list.append(num_results)
        # nlp_list.append(nlp_count)
    t1 = time.time()
    print('Time: {}'.format(t1 - t0))
    return {'total_res': total_res_list, 'front_page': fpage_list}
# Simple program to search for the number of occurrences of a word within any given webpage.
# Built for my brother's sentiment analysis project, so that he could check through 500 news
# articles for the word count of a preset list of words, to determine emotional bias in the
# description.
import requests
from bs4 import BeautifulSoup

keywords = input(
    "List the keywords, separated by commas (without spaces).\n").split(",")
while True:
    page = input("Input the website URL: ")
    txt = BeautifulSoup(requests.get(page).text,
                        features="lxml").get_text().upper()
    for i in keywords:
        print(i + ": " + str(txt.count(i.upper())))
# Shareholder equity -> dividends and rights issues (股东股本 -> 分红配股)
# http://quotes.money.163.com/f10/fhpg_000001.html#01d05
import requests
from bs4 import BeautifulSoup

url = 'http://quotes.money.163.com/f10/fhpg_000001.html#01d05'
wb_data = requests.get(url)
# print(wb_data.text)

# html = '''
# <ul>
#     <li class="item-0" name="one"><a href="www.baidu.com">baidu</a>
#     <li class="item-1" name="two"><a href="www.alibaba.com">alibaba</a>
# '''
# soup = BeautifulSoup(html, 'html.parser')
# print(soup.li.a.string)

soup = BeautifulSoup(wb_data.text, "html.parser")
# print(soup)
path = '.inner_box'
inner_box = soup.select_one(path)  # select() returns a list, so select_one() is used to get the tag
print(inner_box.select("table"))
print(str(inner_box).count('thead'))  # count occurrences of 'thead' in the selected markup

# print("####################################################################################")
# trrs = soup.select('tr')[2].select('td')
# print(trrs)
# for item in trrs:
#     print(item.text)
posts = api.posts.get("hot", 0)
for post in posts:
    comments = api.comments.get(post.id)
    for comment in comments:
        if 20 < len(comment.text) < 100:
            striped_comment = BeautifulSoup(comment.text, "html.parser").text
            syllables = get_syllables(striped_comment)
            count = len(syllables)
            # 5-7-5 notation
            if count == 17:
                # check if it's ours
                if comment.author == credentials.login:
                    break
                # checking for a minimum of 3 words
                if striped_comment.count(" ") >= 2:
                    haiku = haikufi(syllables)
                    if haiku:
                        print(striped_comment.encode('utf-8'))
                        print("http://pikabu.ru/story/empty_" + str(post.id) +
                              "#comment_" + str(comment.id))
                        result = ''.join(haiku)
                        try:
                            with db:
                                db.execute('''INSERT INTO comments(comment, post_id, comment_id)
                                              VALUES(?,?,?)''',
                                           (result, post.id, comment.id))
                            api.comments.add(result, post.id, comment.id)
                        except sqlite3.IntegrityError:
                            print('Record already exists')

db.close()
stocksplit = 0
dividend = 0
buyback = 0
sharerepurchase = 0
acquire = 0
acquisition = 0
bankrupt = 0
while case <= sufswitchcase:
    try:
        urllib.request.urlopen(url + str(case) + suffix2)
    except urllib.error.URLError:
        case += 1
    else:
        text = BeautifulSoup(urllib.request.urlopen(url + str(case) + suffix2),
                             "html.parser").get_text().lower()
        option = text.count("option")
        spindashoff = text.count("spin-off")
        spinoff = text.count("spinoff")
        spinspaceoff = text.count("spin off")
        spundashoff = text.count("spun-off")
        spunoff = text.count("spunoff")
        spunspaceoff = text.count("spun off")
        insider = text.count("insider")
        divest = text.count("divest")
        fraud = text.count("fraud")
        call = text.count("call")
        put = text.count("put")
        pyramid = text.count("pyramid")
        mislead = text.count("mislead")
        misled = text.count("misled")
        merger = text.count("merger")
'''
Simple script to read a web page and create a file with the
corresponding formatted ("pretty") html.

Example:
    ./pretty.py http://google.com
'''
import sys

import requests
from bs4 import BeautifulSoup
import lxml

try:
    url = sys.argv[1]
except IndexError:
    url = input('Enter url : ')

print('Retrieving: ', url)
source = requests.get(url)
pretty = BeautifulSoup(source.text, 'lxml').prettify()
print('Lines : ', pretty.count('\n'))

with open('pretty.html', 'w') as f:
    f.write(pretty)
print('pretty.html written')
exit()