def count_with_negation(fin_dict, transcript, window=3):
    """Count positive and negative words in a transcript, with a negation check.

    The transcript is lower-cased and tokenised with the regex below (plain
    alphabetic words, ``...n't`` contractions and ``...'s`` forms).

    * Every token found in ``fin_dict['Negative']`` is counted as negative.
    * Every token found in ``fin_dict['Positive']`` is counted as positive,
      unless one of the (up to) ``window`` tokens immediately preceding it is
      flagged by ``negated`` (a helper defined elsewhere in this module).  In
      that case it is counted as negative instead and recorded with a
      ``' (with negation)'`` suffix.

    Negation is only considered for positive words.  The two lookups are
    independent, so a token present in both word lists is tallied by both.

    Args:
        fin_dict: Mapping with ``'Positive'`` and ``'Negative'`` entries, each
            supporting ``in`` tests against lower-case words.
        transcript: Text to analyse.
        window: Number of preceding tokens inspected for a negation.  Defaults
            to 3, the value previously hard-coded.  A value of 0 or less
            disables the negation check.

    Returns:
        A list ``[word_count, pos_count, neg_count, pos_words, neg_words]``.
    """
    input_words = re.findall(
        r'\b([a-zA-Z]+n\'t|[a-zA-Z]+\'s|[a-zA-Z]+)\b', transcript.lower())

    pos_words = []
    neg_words = []

    for i, word in enumerate(input_words):
        if word in fin_dict['Negative']:
            neg_words.append(word)

        if word in fin_dict['Positive']:
            # Look back at most `window` tokens, clipped at the start of the
            # transcript.  Scan nearest-first so `negated` is consulted in the
            # same order as the original hand-unrolled checks.
            preceding = reversed(input_words[max(0, i - window):i])
            if any(negated(previous) for previous in preceding):
                neg_words.append(word + ' (with negation)')
            else:
                pos_words.append(word)

    # Each count always equals the length of its word list.
    return [len(input_words), len(pos_words), len(neg_words),
            pos_words, neg_words]
def no(strg):
    """Return the first numeric token found in ``strg``, as a string.

    A token is either a decimal such as ``12.5`` or ``.5``, optionally
    preceded by ``+`` or ``-``, or a plain run of digits.  Because of how the
    pattern's alternation binds, the sign is only kept for decimals:
    ``"-3.5"`` yields ``"-3.5"`` but ``"-7"`` yields ``"7"``.

    Raises:
        IndexError: If ``strg`` contains no numeric token.
    """
    numeric_tokens = re.findall(r"[-+]?\d*\.\d+|\d+", strg)
    return numeric_tokens[0]
#### Loop for finding the site and email of each resort.
# Relies on names defined elsewhere in the file: resorts, fetch_results,
# desired_link, email_pattern, driver, sleep, BeautifulSoup and re.
loop_counter = 1
for resort in resorts:
    # Looking for the website: first google for it, collect the links from
    # the result entries (div.r) and let desired_link choose one for this
    # resort.
    search_page = fetch_results(resort + ' website', 10, 'en')
    search_page = BeautifulSoup(search_page, 'lxml')
    sites_in_results = search_page.findAll('div', {'class': 'r'})
    sites_in_results = [tag.a['href'] for tag in sites_in_results]
    resort_site = desired_link(sites_in_results, resort)

    # Looking for the email: first google for it and scan the raw results.
    search_page = fetch_results(resort + ' email', 10, 'en')
    emails_in_results = re.findall(email_pattern, search_page)

    if not emails_in_results:
        print()
        # Fallback 1: load the resort's own site and scan its source.
        driver.get(resort_site)
        sleep(1)  # presumably gives the page time to finish rendering
        search_page = driver.page_source
        # page_source is a plain string and has no .findall() method, so the
        # pattern has to be applied through the re module.
        emails_in_results = re.findall(email_pattern, search_page)

        if not emails_in_results:
            # Fallback 2: follow the site's "contact us" link.
            search_page = BeautifulSoup(search_page, 'lxml')
            # href=True skips anchors without an href attribute, which would
            # otherwise raise KeyError on tag['href'].
            contact_us = search_page.findAll('a', href=True)
            contact_us = [tag['href'] for tag in contact_us]
            contact_us = desired_link(contact_us, 'contact us')
            driver.get(contact_us)
            # Scan the page that was just loaded; previously the stale
            # homepage object was searched again.
            search_page = driver.page_source
            emails_in_results = re.findall(email_pattern, search_page)

            if not emails_in_results:
                # NOTE(review): the visible source is truncated here; the
                # handling for "no email found anywhere" is not shown, so
                # this branch is left as a placeholder.
                pass
import re
import urllib.request

from bs4 import BeautifulSoup

# Download today's entrees page.  The response is closed as soon as it has
# been parsed.
# NOTE(review): no parser is named, so bs4 uses whichever one is installed;
# consider pinning one so results are the same on every machine.
with urllib.request.urlopen('http://services.housing.berkeley.edu/FoodPro/dining/static/todaysentrees.asp') as page:
    soup = BeautifulSoup(page)
print(soup)

# Dump every <font> element for inspection.
items = soup.find_all('font')
print('BREAK')
for item in items:
    print(item)

# Looks for items between brackets: each match is '>text<', where text is
# made up of word characters and whitespace only.
food = re.findall(r'>[\w\s]*<', str(items))

food_list = []
for item in food:
    item = item[1:-1]  # strip the surrounding '>' and '<'
    print(item)
    food_list.append(item)

# Drop the first 12 matches -- presumably page header text rather than food
# items; TODO confirm against the live page.
food_list = food_list[12:]
print(food_list)