def gatherSingleDateMultiProc(url, filename):
    """Resolve a date for `url` and append one 'url|date' line to `filename`."""
    date = dateparser.gatherDateMatch(url)  # first try the project's own parser
    if date == "na":
        try:
            date = find_date(url) or "na"   # fall back to htmldate
        except Exception as e:
            print(e)
            date = "ERROR:{}".format(e)
    elif date is None:
        date = "na"
    # the original repeated this write in every branch; one write suffices
    with open(filename, 'a') as f:
        f.write(url + '|' + date + "\n")
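A minimal driver sketch for the worker above, assuming a `urls` list (the URLs here are placeholders); note that concurrent appends to a single file can interleave lines, so a lock or a per-process output file is safer in practice:

from functools import partial
from multiprocessing import Pool

if __name__ == '__main__':
    urls = ['https://example.com/a', 'https://example.com/b']  # placeholder inputs
    with Pool(processes=4) as pool:
        # each worker appends its own 'url|date' line to dates.txt
        pool.map(partial(gatherSingleDateMultiProc, filename='dates.txt'), urls)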
def fetch_feed(self, source_urls, paper, collection_name):
    feeds = []
    count = 0
    l = len(paper.articles)
    print("Total Urls:" + str(l))
    if l == 0:
        return 0
    NewsUtil().printProgressBar(0, l, prefix='Progress:', suffix='Complete', length=50)
    for article in paper.articles:
        # skip articles whose URL does not match any of the requested sources
        if not any(source_url in article.url for source_url in source_urls):
            count += 1
            NewsUtil().printProgressBar(count, l, prefix='Progress:', suffix='Complete', length=50)
            continue
        try:
            article = NewsPlease.from_url(article.url)
            date = htmldate.find_date(article.url)
            title = article.title
            description = article.description
            text = article.text
            # the original test `text != "" or text != None or ...` was always
            # true; keep the feed only if it actually carries some content
            if text or description or title:
                feed = {"published_date": date,
                        "title": title,
                        "source": article.url,
                        "description": description,
                        "text": text}
                feeds.append(feed)
        except Exception as e:
            print(e)
        count += 1
        NewsUtil().printProgressBar(count, l, prefix='Progress:', suffix='Complete', length=50)
    total_feeds_inserted = Database().insert_data(collection_name, feeds)
    Database().delete_data(collection_name, {"text": ""})
    return total_feeds_inserted
def resultat(soup):
    text = []
    hyperlien = []
    date = []
    for a in soup.find_all('a', href=True):
        # if 'tunisair' in a['href']:
        for pattern in patterns:
            try:
                if re.search(pattern, a['href']):
                    text.append(get_text_from_url(a['href']))
                    hyperlien.append(a['href'])
                    d = find_date(a['href'])
                    if d:
                        date.append(d)
                    else:
                        date.append(None)
            except Exception:
                continue
    return hyperlien, text, date
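A possible driver for resultat(), assuming `patterns` and get_text_from_url() are defined elsewhere in the same script; the start URL is a placeholder:

import requests
from bs4 import BeautifulSoup

page = requests.get('https://example.com', timeout=10)  # placeholder URL
soup = BeautifulSoup(page.content, 'html.parser')
hyperlien, text, date = resultat(soup)
for lien, d in zip(hyperlien, date):
    print(lien, d)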
def print_date(column):
    date = find_date(column)
    print(date)
from htmldate import find_date
import pandas as pd
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from IPython.display import display  # explicit import so display() also works outside a bare notebook cell
import csv

df = pd.read_csv("dates.csv")
np_url = df['url'].values.tolist()
# print(df)
print(np_url)

dates = []
for url in np_url:
    date = find_date(url)
    dates.append(date)
    # print(date)
for date in dates:
    print(date)

column = widgets.Dropdown(options=list(df['url']), description='Web Link')
ui = widgets.HBox([column])

def print_date(column):
    date = find_date(column)
    print(date)

out = widgets.interactive_output(print_date, {'column': column})
display(ui, out)
# df.to_csv('dates.csv')
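If the looked-up dates should be written back (the commented-out df.to_csv above hints at this), a minimal follow-up, assuming the loop above has filled `dates` in the same order as df['url']:

df['date'] = dates  # one extracted date (or None) per URL, in row order
df.to_csv('dates.csv', index=False)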
def run_htmldate_extensive(htmlstring):
    '''run htmldate on content'''
    return find_date(htmlstring, original_date=True, extensive_search=True)
def extract_metadata(filecontent, default_url=None, date_config=None):
    '''Main process for metadata extraction'''
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # correction: author not a name
    if metadata['author'] is not None:
        if ' ' not in metadata['author'] or metadata['author'].startswith('http'):
            metadata['author'] = None
    # fix: try json-ld metadata and override
    metadata = extract_json(tree, metadata)
    # try with x-paths
    # title
    if metadata['title'] is None:
        metadata['title'] = extract_title(tree)
    # author
    if metadata['author'] is None:
        metadata['author'] = extract_author(tree)
    # url
    if metadata['url'] is None:
        metadata['url'] = extract_url(tree, default_url)
    # hostname
    if metadata['url'] is not None:
        metadata['hostname'] = extract_domain(metadata['url'])
    # extract date with external module htmldate
    if date_config is None:
        date_config = HTMLDATE_CONFIG
    date_config['url'] = metadata['url']
    try:
        metadata['date'] = find_date(tree, **date_config)
    # temporary fix for htmldate bug
    except UnicodeError:
        pass
    # sitename
    if metadata['sitename'] is None:
        metadata['sitename'] = extract_sitename(tree)
    if metadata['sitename'] is not None:
        if metadata['sitename'].startswith('@'):
            # scrap Twitter ID
            metadata['sitename'] = re.sub(r'^@', '', metadata['sitename'])
        # capitalize
        try:
            if '.' not in metadata['sitename'] and not metadata['sitename'][0].isupper():
                metadata['sitename'] = metadata['sitename'].title()
        # fix for empty name
        except IndexError:
            pass
    else:
        # use URL
        if metadata['url']:
            mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)', metadata['url'])
            if mymatch:
                metadata['sitename'] = mymatch.group(1)
    # categories
    if not metadata['categories']:
        metadata['categories'] = extract_catstags('category', tree)
    # tags
    if not metadata['tags']:
        metadata['tags'] = extract_catstags('tags', tree)
    # for safety: length check
    for key, value in metadata.items():
        if value is not None and len(value) > 10000:
            metadata[key] = value[:9999] + '…'
    # return
    return metadata
import requests
from htmldate import find_date
import time

results = requests.get('http://www.ibew.org/IBEW-COE')
results.raise_for_status()

oldDate = find_date('http://www.ibew.org/IBEW-COE', outputformat='%c')
print("Last time website was updated was:", oldDate)
print("Checking for updates")
# newDate = find_date('http://www.ibew.org/IBEW-COE', outputformat='%c')
# print(oldDate)

while True:
    newDate = find_date('http://www.ibew.org/IBEW-COE', outputformat='%c')
    if oldDate != newDate:
        # report the change, then remember the new date
        # (the original assigned newDate = oldDate, which discarded the update)
        print("Date was updated from ", oldDate, " to ", newDate)
        oldDate = newDate
    time.sleep(86400)
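For reference, outputformat is a strftime-style pattern, so '%c' above yields a locale-dependent timestamp string; a quick stdlib-only illustration:

from datetime import datetime

print(datetime(2020, 1, 24).strftime('%c'))  # e.g. 'Fri Jan 24 00:00:00 2020'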
def wrapper_find_date(url):
    print(url)
    p = find_date(url)
    print('Found date for url:', p)
    return p
def validate_date(test):
    document_y_or_n = test.split('/')[-1]
    pdf_y_or_n = test.split('.')[-1]
    if pdf_y_or_n == 'pdf' or document_y_or_n == 'document':
        return ''
    # call find_date once instead of twice as in the original
    date = find_date(test)
    return date if date else ""
# assumes requests, re, datetime, math's e, BeautifulSoup, TextBlob and
# language_check are imported at module level
def site_score(url):
    res = requests.get(url)
    html_page = res.content
    soup = BeautifulSoup(html_page, 'html.parser')
    text = soup.find_all(text=True)
    output = ''
    allowed = [
        "p", "span", "li",
        # "h1", "h2", "h3", "h4", "h5",
        "div", "a"
    ]
    for t in text:
        if t.parent.name in allowed:
            output += '{} '.format(re.sub('[^a-zA-Z0-9-_*. ?:()!]', '', t))
    # TODO: logo, submit - ezzat; deploy and easy-install extension - bill;
    # add question-answer system - me; scroll to answer

    # bias: less subjective text scores higher
    analysisSub = TextBlob(output).subjectivity
    bias_score = (1 - analysisSub) * 10

    # author backing: any "Firstname Lastname"-style match counts
    authors = re.findall(r"[A-Z][a-z]+,?\s+(?:[A-Z][a-z]*\.?\s*)?[A-Z][a-z]+", output)
    author_backing_score = 10 if len(authors) > 0 else 0

    # relevance: exponential decay with article age in days
    original = find_date(url, original_date=True)
    update = find_date(url)  # fetched but not used in the score
    relevance_score = 0
    if original is not None:
        today = datetime.datetime.today()
        date_format = "%Y-%m-%d"
        original_date = datetime.datetime.strptime(original, date_format)
        diff = (today - original_date).days
        relevance_score = 10 * pow(e, (-1 / 8000) * diff)

    # language quality: grammar mistakes per character
    tool = language_check.LanguageTool('en-US')
    mistakes = len(tool.check(output))
    mistakes_to_article = float(mistakes) / len(output)
    mistakes_score = 10 * pow(e, -20 * mistakes_to_article)

    # URL heuristics: TLD, length, personal pages
    domains = {".edu": 10, ".com": 7, ".gov": 10, ".org": 8, ".net": 8}
    url_score = 0
    for i in domains:
        if i in url:
            url_score = domains[i]
    if len(url) > 100:
        url_score -= (len(url) - 100) * 0.1
    if "~" in url:
        url_score *= 0.6

    scores = {
        "url_score": url_score,
        "mistakes_score": mistakes_score,
        "relevance_score": relevance_score,
        "author_score": author_backing_score,
        "bias_score": bias_score,
        "total": (url_score + mistakes_score + relevance_score +
                  author_backing_score + bias_score) / 5
    }
    # print("URL: " + str(url_score))
    # print("Mistakes: " + str(mistakes_score))
    # print("Relevance: " + str(relevance_score))
    # print("Author: " + str(author_backing_score))
    # print("Bias: " + str(bias_score))
    # print("Total Score: " + total)
    return scores

# site_score("https://www.pbs.org/crucible/tl5.html")
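A quick check of the decay used in relevance_score above: with the (-1/8000) exponent the score starts at 10 for a brand-new page and halves roughly every 8000·ln 2 ≈ 5545 days (about 15 years):

from math import e

for days in (0, 5545, 11090):
    print(days, round(10 * pow(e, (-1 / 8000) * days), 2))  # 10.0, ~5.0, ~2.5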
def extract_metadata(filecontent, default_url=None, date_config=None):
    '''Main process for metadata extraction'''
    # create named tuple
    Metadata = namedtuple('Metadata',
                          ['title', 'author', 'url', 'description',
                           'sitename', 'date', 'categories', 'tags'])
    # Metadata.__new__.__defaults__ = (None,) * len(Metadata._fields)
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # meta tags
    mymeta = Metadata._make(examine_meta(tree))
    # correction: author not a name
    if mymeta.author is not None:
        if ' ' not in mymeta.author or mymeta.author.startswith('http'):
            mymeta = mymeta._replace(author=None)
    # fix: try json-ld metadata and override
    mymeta = extract_json(tree, mymeta)
    # extract date with external module htmldate
    if date_config is None:
        date_config = HTMLDATE_CONFIG
    date_config['url'] = mymeta.url
    mymeta = mymeta._replace(date=find_date(tree, **date_config))
    # try with x-paths
    # title
    if mymeta.title is None:
        mymeta = mymeta._replace(title=extract_title(tree))
    # author
    if mymeta.author is None:
        mymeta = mymeta._replace(author=extract_author(tree))
    # url
    if mymeta.url is None:
        mymeta = mymeta._replace(url=extract_url(tree, default_url))
    # sitename
    if mymeta.sitename is None:
        mymeta = mymeta._replace(sitename=extract_sitename(tree))
    if mymeta.sitename is not None:
        if mymeta.sitename.startswith('@'):
            # scrap Twitter ID
            mymeta = mymeta._replace(sitename=re.sub(r'^@', '', mymeta.sitename))
        # capitalize
        if '.' not in mymeta.sitename and not mymeta.sitename[0].isupper():
            mymeta = mymeta._replace(sitename=mymeta.sitename.title())
    else:
        # use URL
        if mymeta.url:
            mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)', mymeta.url)
            if mymatch:
                mymeta = mymeta._replace(sitename=mymatch.group(1))
    # categories
    if not mymeta.categories:
        mymeta = mymeta._replace(categories=extract_catstags('category', tree))
    # tags
    if not mymeta.tags:
        mymeta = mymeta._replace(tags=extract_catstags('tags', tree))
    # return
    return mymeta
def cosine_sim(text_a, text_b):
    """Cosine similarity between two sentences over binary bag-of-words
    vectors with English stop words removed (refactored out of the four
    near-identical blocks in the original function)."""
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    sw = set(stopwords.words('english'))
    a_set = {w for w in word_tokenize(text_a) if w not in sw}
    b_set = {w for w in word_tokenize(text_b) if w not in sw}
    rvector = a_set.union(b_set)
    l1 = [1 if w in a_set else 0 for w in rvector]
    l2 = [1 if w in b_set else 0 for w in rvector]
    c = sum(x * y for x, y in zip(l1, l2))
    denom = (sum(l1) * sum(l2)) ** 0.5
    return c / denom if denom else 0.0


def script(text):
    # pre_process, get_stop_words, CountVectorizer, requests, BeautifulSoup,
    # base64, json and the tweet-preprocessor module `p` are assumed to be
    # imported/defined elsewhere in the original project
    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')

    links_to_add = []  # use for adding all links to CSV
    text = pre_process(text)
    # print(text)

    # load a set of stop words (renamed: the original bound this to the name
    # `stopwords`, shadowing nltk.corpus.stopwords)
    custom_stopwords = get_stop_words("resources/stopwords.txt")
    # get the text column
    docs = list(text.split(" "))
    # create a vocabulary of words; ignore words that appear in 85% of
    # documents; eliminate stop words
    cv = CountVectorizer(max_df=0.85, stop_words=custom_stopwords, max_features=10000)
    word_count_vector = cv.fit_transform(docs)
    warn_words = list(cv.vocabulary_.keys())[:10]
    # warn_words.append('covid')

    # Check with RSS feeds - similarity score obtained with each website -
    # added to total
    url1 = "https://blog.amnestyusa.org/feed/"  # DESCRIPTION
    url2 = "https://news.un.org/feed/subscribe/en/news/topic/human-rights/feed/rss.xml"  # DESCRIPTION
    url3 = "https://bhr.stern.nyu.edu/blogs?format=RSS"  # DESCRIPTION
    urls1 = [url1, url2, url3]
    total = 0
    links = []
    for everyUrl in urls1:
        resp = requests.get(everyUrl)
        soup = BeautifulSoup(resp.content, features="xml")
        items = soup.findAll('item')
        # measure the similarity between the query (text) and each item's
        # description using cosine similarity
        for item in items:
            Y = item.description.text
            linkForItem = item.link.text
            cosine = cosine_sim(text, Y)
            if cosine > 0:
                total += 1
                print("Yes! Found similar sentence ", cosine)
                links.append([linkForItem])
    print("All Links: ", links)

    # Check with RSS feeds for more urls - similarity score obtained with
    # each website - added to total
    url4 = "https://www.nchrd.org/category/news/feed/"  # description has img src etc.
    url5 = "https://www.theguardian.com/law/human-rights/rss"  # description has img src etc.
    url6 = "https://www.reddit.com/r/humanrights/.rss?format=xml"
    urls2 = [url4]
    for everyUrl in urls2:
        resp = requests.get(everyUrl)
        soup = BeautifulSoup(resp.content, features="xml")
        wholeItems = soup.findAll('item')
        for everyItem in wholeItems:
            linkForItem = everyItem.link.text
            p_tags = everyItem.description.text
            Y = p_tags.replace(']]>', '')
            cosine = cosine_sim(text, Y)
            if cosine > 0:
                total += 1
                print("Yes! Found similar sentence ", cosine)
                links.append([linkForItem])
    print("All Links: ", links)

    # Check with news sites - for news articles with similar keywords, check
    # text similarity and add to the similarity score
    from urllib.request import urlopen
    from htmldate import find_date
    import csv

    # (the original also opened NEWS.csv and Links.csv in append mode here and
    # never closed them; the with-blocks below handle all writing)
    filename = "NEWS.csv"
    headers = ["Statement", "Link", "Date"]
    Links = "Links.csv"
    linkhead = ['Link']
    news_url = "https://news.google.com/news/rss"
    Client = urlopen(news_url)
    xml_page = Client.read()
    Client.close()
    soup_page = BeautifulSoup(xml_page, "xml")
    news_list = soup_page.findAll("item")
    frame = []
    for news in news_list:
        texts = news.title.text
        for l in warn_words:
            if l in texts:
                print(news.title.text)
                print(news.link.text)
                print("\n")
                date = find_date(news.link.text)
                frame.append([news.title.text, news.link.text, date])
                links.append([news.link.text])
                break
    print(links)
    with open(filename, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(headers)
        csvwriter.writerows(frame)
    with open(Links, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(linkhead)
        csvwriter.writerows(links)
    for titleNews in frame:
        title = titleNews[0]
        # the original tokenized the stale variable Y here; compare the query
        # against the matched news title instead
        cosine = cosine_sim(text, title)
        if cosine > 0:
            total += 1
            print("Yes! Found similar sentence ", cosine)

    # twitter
    # twitter credentials (redacted here; substitute your own keys)
    consumer_key = 'YOUR_CONSUMER_KEY'
    consumer_secret_key = 'YOUR_CONSUMER_SECRET'
    # reformat the keys and encode them
    key_secret = '{}:{}'.format(consumer_key, consumer_secret_key).encode('ascii')
    # transform from bytes to bytes that can be printed
    b64_encoded_key = base64.b64encode(key_secret)
    # transform from bytes back into Unicode
    b64_encoded_key = b64_encoded_key.decode('ascii')
    # authentication
    base_url = 'https://api.twitter.com/'
    auth_url = '{}oauth2/token'.format(base_url)
    auth_headers = {
        'Authorization': 'Basic {}'.format(b64_encoded_key),
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
    }
    auth_data = {'grant_type': 'client_credentials'}
    auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data)
    print(auth_resp.status_code)  # 200 indicates successful authentication
    access_token = auth_resp.json()['access_token']  # auth response stored
    # print(warn_words)
    # warn_words = ['killed', 'police', 'brutality', 'systemic', 'racism', 'Covid']

    rows = []
    twt_total = 0
    for ip in warn_words:
        search_headers = {'Authorization': 'Bearer {}'.format(access_token)}
        search_params = {'q': ip, 'lang': 'eu', 'result_type': 'mixed'}
        search_url = 'https://api.twitter.com/1.1/search/tweets.json'
        search_resp = requests.get(search_url, headers=search_headers, params=search_params)
        # print(ip + " " + str(search_resp.status_code))
        search_data = search_resp.json()  # result of search in json format
        # file that has all tweets collected for each keyword
        with open('raw_tweets.json', 'a+', encoding='utf-8') as f:
            # Will be needed later:
            # data = json.load(f.decode('utf8'))
            # temp = data['statuses']
            # temp.append(data)
            json.dump(search_data, f, ensure_ascii=False, indent=4)
        # temp file that is overwritten for every keyword
        with open('raw_tweets_temp.json', 'w', encoding='utf-8') as f:
            json.dump(search_data, f, ensure_ascii=False, indent=4)
        # print(ip + " " + str(len(search_data['statuses'])))
        for i in range(0, len(search_data['statuses'])):
            status = search_data['statuses'][i]
            rows.append([status['id'], status['id_str'], ip, status['created_at'],
                         status['text'], status['favorite_count'], status['retweet_count']])
        # print(rows)

    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    cleaned_tweets = []
    for row in rows:
        # extract hashtags from the tweet text (row[4])
        h = [s for s in row[4].split() if s.startswith('#')]
        row.append(h)
        # add hashtag segmentation here later
        # cleaned tweets: no stop words, hashtags, URLs, emojis or mentions
        # (`p` is the tweet-preprocessor module)
        s = p.clean(row[4]).lower()
        row.append(s)
        cleaned_tweets.append(s)
        s = word_tokenize(s)
        s = [w for w in s if w not in stopwords.words('english')]
        row.append(s)

    fields = ["id", "id_str", "keyword", "created_at", "tweet_text", "likes",
              "retweeted", "hashtags", "clean_text_str", "clean_text"]
    # to collect tweets over time, change the mode to a+ and drop the header row
    with open('tweets.csv', 'w', encoding='utf-8') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(fields)
        csvwriter.writerows(rows)
    print(cleaned_tweets)

    for tweet in cleaned_tweets:
        # (the original reused the loop variable `i` inside the cosine block)
        cosine = cosine_sim(text, tweet)
        if cosine > 0:
            twt_total += 1
            print("Yes! Found similar sentence ", cosine)
            # links.append([linkForItem])
    print("Twitter Total ", twt_total)
    return links, twt_total
def extract_metadata(filecontent, default_url=None, date_config=None,
                     fastmode=False, author_blacklist=None):
    """Main process for metadata extraction.

    Args:
        filecontent: HTML code as string.
        default_url: Previously known URL of the downloaded document.
        date_config: Provide extraction parameters to htmldate as dict().
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.

    Returns:
        A dict() containing the extracted metadata information or None.
    """
    # init
    if author_blacklist is None:
        author_blacklist = set()
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # to check: remove it and replace with author_blacklist in test case
    if metadata.author is not None and ' ' not in metadata.author:
        metadata.author = None
    # fix: try json-ld metadata and override
    try:
        metadata = extract_meta_json(tree, metadata)
    # todo: fix bugs in json_metadata.py
    except TypeError as err:
        LOGGER.warning('error in JSON metadata extraction: %s', err)
    # try with x-paths
    # title
    if metadata.title is None:
        metadata.title = extract_title(tree)
    # check author in blacklist
    if metadata.author is not None and len(author_blacklist) > 0:
        metadata.author = check_authors(metadata.author, author_blacklist)
    # author
    if metadata.author is None:
        metadata.author = extract_author(tree)
    # recheck author in blacklist
    if metadata.author is not None and len(author_blacklist) > 0:
        metadata.author = check_authors(metadata.author, author_blacklist)
    # url
    if metadata.url is None:
        metadata.url = extract_url(tree, default_url)
    # hostname
    if metadata.url is not None:
        metadata.hostname = extract_domain(metadata.url)
    # extract date with external module htmldate
    if date_config is None:
        # decide on fast mode
        if fastmode is False:
            date_config = HTMLDATE_CONFIG_EXTENSIVE
        else:
            date_config = HTMLDATE_CONFIG_FAST
    date_config['url'] = metadata.url
    metadata.date = find_date(tree, **date_config)
    # sitename
    if metadata.sitename is None:
        metadata.sitename = extract_sitename(tree)
    if metadata.sitename is not None:
        if metadata.sitename.startswith('@'):
            # scrap Twitter ID
            metadata.sitename = re.sub(r'^@', '', metadata.sitename)
        # capitalize
        try:
            if ('.' not in metadata.sitename
                    and not metadata.sitename[0].isupper()):
                metadata.sitename = metadata.sitename.title()
        # fix for empty name
        except IndexError as err:
            LOGGER.warning('error in sitename extraction: %s', err)
    # use URL
    elif metadata.url:
        mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)', metadata.url)
        if mymatch:
            metadata.sitename = mymatch.group(1)
    # categories
    if not metadata.categories:
        metadata.categories = extract_catstags('category', tree)
    # tags
    if not metadata.tags:
        metadata.tags = extract_catstags('tag', tree)
    # license
    metadata.license = extract_license(tree)
    # safety checks
    metadata.clean_and_trim()
    # return result
    return metadata
def get_date(urlinput):
    return find_date(urlinput)
def run_htmldate_fast(htmlstring):
    '''run htmldate on content'''
    result = find_date(htmlstring, original_date=True, extensive_search=False)
    return result
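A small sketch comparing the fast and extensive wrappers above on the same fetched page; the URL is a placeholder and `requests` is assumed to be available:

import requests
from htmldate import find_date

htmlstring = requests.get('https://example.com/article', timeout=10).text  # placeholder URL
print(run_htmldate_fast(htmlstring))       # quick pass over the usual date markers
print(run_htmldate_extensive(htmlstring))  # slower, scans more candidates in the page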
def Get_Date_Created(url_input):
    try:
        date = find_date(url_input)
        return date
    except Exception:  # the original used a bare except clause
        return None
# schema reconstructed from the INSERT below; the original statement read
# "create TABEL scraped" with no column list
mycursor.execute("CREATE TABLE scraped (heading TEXT, description TEXT, link TEXT, date TEXT)")

my_url = 'https://innovaccer.com/news/'
context = ssl._create_unverified_context()
uClient = urlopen(my_url, context=context)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div", {"class": "col-lg-8"})
for container in containers:
    heading = container.h6.text
    description = container.p.text  # the original stored the <p> element itself
    link = container.a["href"]
    date = htmldate.find_date(container.text)
    # the original switched between `mycursor` and `cursor`; unified here
    mycursor.execute(
        "INSERT INTO scraped (heading, description, link, date) VALUES (?, ?, ?, ?)",
        (heading, description, link, date))
    print("heading :" + heading)
    print("link : " + str(my_url) + str(link))
    print("date : ", date)
def extract_metadata(filecontent, default_url=None, date_config=None): """Main process for metadata extraction. Args: filecontent: HTML code as string. default_url: Previously known URL of the downloaded document. date_config: Provide extraction parameters to htmldate as dict(). Returns: A dict() containing the extracted metadata information or None. """ # load contents tree = load_html(filecontent) if tree is None: return None # initialize dict and try to strip meta tags metadata = examine_meta(tree) # correction: author not a name if metadata['author'] is not None: if ' ' not in metadata['author'] or metadata['author'].startswith( 'http'): metadata['author'] = None # fix: try json-ld metadata and override metadata = extract_json(tree, metadata) # try with x-paths # title if metadata['title'] is None: metadata['title'] = extract_title(tree) # author if metadata['author'] is None: metadata['author'] = extract_author(tree) # url if metadata['url'] is None: metadata['url'] = extract_url(tree, default_url) # hostname if metadata['url'] is not None: metadata['hostname'] = extract_domain(metadata['url']) # extract date with external module htmldate if date_config is None: date_config = HTMLDATE_CONFIG date_config['url'] = metadata['url'] try: metadata['date'] = find_date(tree, **date_config) # temporary fixes for htmldate bugs # todo: remove later except (TypeError, UnicodeError): pass # sitename if metadata['sitename'] is None: metadata['sitename'] = extract_sitename(tree) if metadata['sitename'] is not None: if metadata['sitename'].startswith('@'): # scrap Twitter ID metadata['sitename'] = re.sub(r'^@', '', metadata['sitename']) # capitalize try: if not '.' in metadata['sitename'] and not metadata['sitename'][ 0].isupper(): metadata['sitename'] = metadata['sitename'].title() # fix for empty name except IndexError: pass else: # use URL if metadata['url']: mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)', metadata['url']) if mymatch: metadata['sitename'] = mymatch.group(1) # categories if not metadata['categories']: metadata['categories'] = extract_catstags('category', tree) # tags if not metadata['tags']: metadata['tags'] = extract_catstags('tags', tree) # license for element in tree.xpath('//a[@rel="license"]', ): if element.text is not None: metadata['license'] = trim(element.text) break # for safety: length check for key, value in metadata.items(): if value is not None and len(value) > 10000: metadata[key] = value[:9999] + '…' # remove spaces and control characters for item in metadata: if metadata[item] is not None and isinstance(metadata[item], str): metadata[item] = line_processing(metadata[item]) # return return metadata
def extract_date(tree, url):
    '''Extract the date using external module htmldate'''
    docdate = find_date(tree, extensive_search=False, url=url)
    return docdate
def get_date(data):
    # this function gets the last modified date of a website
    x = find_date(data)
    print(x)
def getDate(htmldoc):
    mytree = html.fromstring(htmldoc)
    return find_date(mytree, outputformat='%Y-%m-%d %H:%M')
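find_date also accepts raw HTML or a parsed tree, and outputformat takes any strftime pattern, which is what getDate relies on; a self-contained check with a made-up snippet (htmldate works at day granularity, so the time fields typically come out as zeros):

from htmldate import find_date

doc = ('<html><head><meta property="article:published_time" '
       'content="2018-02-06T06:00:00Z"/></head><body></body></html>')
print(find_date(doc, outputformat='%Y-%m-%d %H:%M'))  # likely '2018-02-06 00:00'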
# helpers factored out of the near-identical try/except blocks repeated
# throughout the original function

def _parse_result(element):
    """Pull [url, HON label, publication date] out of one Google result."""
    r_div = element.find_element_by_css_selector("div[class='r']")
    url = r_div.find_element_by_css_selector('a').get_attribute('href')
    hon = element.find_element_by_css_selector('div[target="_blank"]').get_attribute('title')
    try:
        date = element.find_element_by_css_selector("div[class='s']") \
                      .find_element_by_css_selector("span[class='f']").text
        date = datetime.strptime(date, '%b %d, %Y -').strftime('%Y-%m-%d')
    except Exception:
        try:
            date = element.find_element_by_css_selector("div[class='s']") \
                          .find_element_by_css_selector("div[class='dhIWPd f']").text
            date = re.search('- (.*) - C', date).group(1) + '-06-15'
        except Exception:
            date = None
    return [url, hon, date]


def _smog_from_response(response, url):
    """SMOG grade of the boilerplate-free text of a page (justext + textstat)."""
    paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
    full_text = ''
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            full_text += '\n' + paragraph.text
    if full_text == '' and '.'.join(tldextract.extract(url)) != 'www.youtube.com':
        for paragraph in paragraphs:
            if not paragraph.is_heading:
                full_text += '\n' + paragraph.text
    return textstat.smog_index(full_text)


def _click_scroll_return(browser, url, clicked_urls, ind, label, window_size, read_time):
    """Click a result (falling back to a JS click), record the click, read the
    page, then return to google.com."""
    element = browser.find_element_by_xpath('//a[@href="' + url + '"]')
    try:
        element.click()
    except Exception:
        browser.execute_script("arguments[0].click();", element)
        time.sleep(5)
    clicked_urls[ind] = '{0} @ {1}'.format(label, datetime.now().strftime("%H:%M:%S"))
    ScrollAndRead(browser, window_size, read_time)
    browser.get("https://google.com")


def _click_and_return(browser, url, clicked_urls, ind, wait_time):
    """Click a result, do not scroll, and go straight back to the SERP."""
    element = browser.find_element_by_xpath('//a[@href="' + url + '"]')
    try:
        element.click()
    except Exception:
        browser.execute_script("arguments[0].click();", element)
    clicked_urls[ind] = 'clicked{0}&returned @ {1}'.format(ind, datetime.now().strftime("%H:%M:%S"))
    time.sleep(wait_time)
    browser.back()


def SearchAndSave(
        browser,  # logged-in with specific user behavior
        user,     # first_hon_click, first_non_click, complex_hon_click, complex_non_click,
                  # time_click, readability_click, no_click, average_click
        data='../data/queries2019_400.csv',  # path to queries
        max_num_res=8,  # number of results
):
    # global variable so that all users search the same queries in the same
    # order; incremented every day
    global RANDOM_STATE

    # get all queries and choose which to search
    queries_all = pd.read_csv(data, sep=',')
    queries_today = np.asarray(queries_all.iloc[RANDOM_STATE:RANDOM_STATE + 2]).flatten()  # CHANGE + 2
    random.Random(RANDOM_STATE).shuffle(queries_today)

    # parameters and variables
    dict_results = {"QUERY": [], "RANK": [], "URL": [], "HON": [], "SMOG": [],
                    "PUBLICATION_DATE": [], "CLICKS&TIME": []}
    WINDOW_SIZE = 540   # size of scrolling window
    READ_TIME = 300     # this value should be 60-300  # CHANGE 300
    WAIT_TIME = 10      # this value should be < 60    # CHANGE 10
    CARRY_OVER_TIME = 660
    # average statistic on clicking google results, by rank
    URL_PROBAB = [0.32, 0.25, 0.18, 0.11, 0.07, 0.04, 0.02, 0.01]

    # search today's queries
    for query in queries_today:
        # start counting time for one search, which should be at least 11 min
        start_time = time.time()
        # run the search
        browser.get("https://google.com")
        browser.find_element_by_xpath("//input[@name='q']").send_keys(query + Keys.ENTER)
        # wait for the HON labels to load
        time.sleep(WAIT_TIME)
        browser.execute_script("window.scrollTo(0, window.scrollY + {0})".format(2 * WINDOW_SIZE))

        # get the number of google results (default of 0 so the report below
        # cannot hit an unbound variable, which the original risked)
        num_res_int = 0
        num_res_google = browser.find_element_by_xpath('//*[@id="result-stats"]').text
        try:
            num_res_int = int(re.search('About (.*) results', num_res_google).group(1).replace(',', ''))
        except Exception:
            try:
                num_res_int = int(re.search('(.*) results', num_res_google).group(1).replace(',', ''))
            except Exception:
                pass

        # get time of the start query search
        start_query_time = datetime.now().strftime("%H:%M:%S")

        # reset variables
        num_res = 0
        search_results = []

        # get the first max_num_res URLs, HON labels and dates from the SERP
        elements = browser.find_elements_by_class_name('rc')
        try:
            first_element = browser.find_element_by_css_selector("div[class='g mnr-c g-blk']")
            search_results.append(_parse_result(first_element))
            num_res += 1
        except Exception:
            try:
                first_element = browser.find_elements_by_class_name('g')[0]
                search_results.append(_parse_result(first_element))
                num_res += 1
            except Exception:
                pass
        temp = num_res
        try:
            related_questions = browser.find_element_by_css_selector("div[class='g kno-kp mnr-c g-blk']")
            temp += len(related_questions.find_elements_by_css_selector("div[class='related-question-pair']"))
        except Exception:
            pass
        for j in range(temp, len(elements)):
            search_results.append(_parse_result(elements[j]))
            num_res += 1
            if num_res == max_num_res:
                break

        # get the user-agent of the browser
        headers = requests.utils.default_headers()
        headers['User-Agent'] = browser.execute_script("return navigator.userAgent;")

        # get SMOG grades and publication dates for all URLs
        for i in range(len(search_results)):
            if search_results[i][0][-3:] == 'pdf':
                try:
                    if search_results[i][2] is None:
                        # the original wrote `==` here, a no-op comparison
                        search_results[i][2] = '1998-09-04'
                    # extract SMOG grade from the PDF text
                    urlretrieve(search_results[i][0], '../../ResultsISU/temp.pdf')
                    pdfFileObj = open('../../ResultsISU/temp.pdf', 'rb')
                    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                    full_text = ''
                    for count in range(pdfReader.numPages):
                        full_text += pdfReader.getPage(count).extractText()
                    smog_grade = textstat.smog_index(full_text)
                except Exception:
                    smog_grade = 0.0
            else:
                # extract the publication date using the htmldate library
                if search_results[i][2] is None:
                    search_results[i][2] = find_date(search_results[i][0])
                # extract SMOG grade, retrying without TLS verification
                try:
                    smog_grade = _smog_from_response(
                        requests.get(search_results[i][0], headers=headers),
                        search_results[i][0])
                except Exception:
                    try:
                        smog_grade = _smog_from_response(
                            requests.get(search_results[i][0], verify=False, headers=headers),
                            search_results[i][0])
                    except Exception:
                        smog_grade = 0.0
            # append SMOG result
            if smog_grade > 0.0:
                search_results[i].append(smog_grade)
            else:
                search_results[i].append(100)
            # edit PUBLICATION_DATE results
            if search_results[i][2] in (None, '2020-01-01', '2020-01-24'):
                search_results[i][2] = '1998-09-04'

        # record clicks
        clicked_urls = [''] * len(search_results)

        ######## ISU with first HON-certified result preferences
        if user == 'first_hon_click':
            try:
                ind = np.where(np.array(search_results) == 'HONcode certified')[0][0]
                _click_scroll_return(browser, search_results[ind][0], clicked_urls,
                                     ind, 'clicked', WINDOW_SIZE, READ_TIME)
            except Exception:
                # no HON-certified result (or the click failed) - do nothing
                browser.get("https://google.com")

        ######## ISU with first NON-certified result preferences
        if user == 'first_non_click':
            try:
                ind = np.where(np.array(search_results) == '')[0][0]
                _click_scroll_return(browser, search_results[ind][0], clicked_urls,
                                     ind, 'clicked', WINDOW_SIZE, READ_TIME)
            except Exception:
                browser.get("https://google.com")

        ######## ISU with first HON-certified result preferences and complex behavior
        if user == 'complex_hon_click':
            # inspect the first three results
            is_found = False
            for ind in range(3):
                if search_results[ind][1] == 'HONcode certified':
                    try:
                        _click_scroll_return(browser, search_results[ind][0], clicked_urls,
                                             ind, 'clicked{0}'.format(ind), WINDOW_SIZE, READ_TIME)
                    except Exception:
                        pass
                    is_found = True
                    break
                elif search_results[ind][1] == '':
                    try:
                        _click_and_return(browser, search_results[ind][0], clicked_urls, ind, WAIT_TIME)
                    except Exception:
                        pass
                    time.sleep(WAIT_TIME / 2)
            if not is_found:
                try:
                    ind = np.where(np.array(search_results) == 'HONcode certified')[0][0]
                    _click_scroll_return(browser, search_results[ind][0], clicked_urls,
                                         ind, 'clicked{0}'.format(ind + 1), WINDOW_SIZE, READ_TIME)
                except Exception:
                    # no HON-certified result - do nothing
                    browser.get("https://google.com")

        ######## ISU with first NON-certified result preferences and complex behavior
        if user == 'complex_non_click':
            is_found = False
            for ind in range(3):
                if search_results[ind][1] == '':
                    try:
                        _click_scroll_return(browser, search_results[ind][0], clicked_urls,
                                             ind, 'clicked{0}'.format(ind), WINDOW_SIZE, READ_TIME)
                    except Exception:
                        pass
                    is_found = True
                    break
                elif search_results[ind][1] == 'HONcode certified':
                    try:
                        _click_and_return(browser, search_results[ind][0], clicked_urls, ind, WAIT_TIME)
                    except Exception:
                        pass
                    time.sleep(WAIT_TIME / 2)
            if not is_found:
                try:
                    ind = np.where(np.array(search_results) == '')[0][0]
                    _click_scroll_return(browser, search_results[ind][0], clicked_urls,
                                         ind, 'clicked{0}'.format(ind + 1), WINDOW_SIZE, READ_TIME)
                except Exception:
                    # no non-HON-certified result - do nothing
                    browser.get("https://google.com")

        ######## ISU clicking according to the average statistic user behavior
        if user == 'average_click':
            try:
                ind = int(np.random.choice(len(search_results), 1, p=URL_PROBAB))
                # (the original's JS fallback referenced an undefined
                # `average_statistic_element`; the shared helper fixes that)
                _click_scroll_return(browser, search_results[ind][0], clicked_urls,
                                     ind, 'clicked', WINDOW_SIZE, READ_TIME)
            except Exception:
                # if there is an error - do nothing
                browser.get("https://google.com")

        ######## ISU without clicking/browsing, only search history
        if user == 'no_click':
            time.sleep(WAIT_TIME)
            browser.get("https://google.com")

        ######## ISU clicking the most recent result
        if user == 'time_click':
            try:
                # latest publication date; the original indexed x[3] (the SMOG
                # column) - the date lives at index 2
                best_time = max(map(lambda x: datetime.strptime(x[2], '%Y-%m-%d'), search_results))
                ind = np.where(np.array(search_results) == best_time.strftime('%Y-%m-%d'))[0][0]
                _click_scroll_return(browser, search_results[ind][0], clicked_urls,
                                     ind, 'clicked', WINDOW_SIZE, READ_TIME)
            except Exception:
                browser.get("https://google.com")

        ######## ISU with easy-to-read preferences
        if user == 'readability_click':
            try:
                # lowest SMOG grade; the original indexed x[2] (the date
                # column) - the grade lives at index 3
                best_smog_grade = min(map(lambda x: x[3], search_results))
                ind = np.where(np.array(search_results) == str(best_smog_grade))[0][0]
                _click_scroll_return(browser, search_results[ind][0], clicked_urls,
                                     ind, 'clicked', WINDOW_SIZE, READ_TIME)
            except Exception:
                browser.get("https://google.com")

        # get time of the end query search
        end_query_time = datetime.now().strftime("%H:%M:%S")

        # save results in a dict
        dict_results["QUERY"].extend([query,
                                      'start @ {}'.format(start_query_time),
                                      'end @ {}'.format(end_query_time),
                                      'results # {}'.format(num_res_int),
                                      '', '', '', ''])
        dict_results["RANK"].extend([1, 2, 3, 4, 5, 6, 7, 8])
        dict_results["URL"].extend(np.array(search_results)[:, 0])
        dict_results["HON"].extend(np.array(search_results)[:, 1])
        dict_results["SMOG"].extend(np.array(search_results)[:, 3])
        dict_results["PUBLICATION_DATE"].extend(np.array(search_results)[:, 2])
        dict_results["CLICKS&TIME"].extend(clicked_urls)

        # wait 11 minutes in total to avoid a carry-over effect
        session_time = time.time() - start_time
        CARRY_OVER_TIME = 660  # CHANGE 100
        if session_time < CARRY_OVER_TIME:
            time.sleep(CARRY_OVER_TIME - session_time)

    # create pandas DataFrame with all results
    df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in dict_results.items()]))
    # create a folder named after the date and save one CSV per user
    day_today = datetime.today().strftime("%b-%d-%Y")
    Path('../../ResultsISU/{0}'.format(day_today)).mkdir(parents=True, exist_ok=True)
    df.to_csv('../../ResultsISU/{0}/{1}.csv'.format(day_today, user), index=False)  # CHANGE
    # increment the global variable
    RANDOM_STATE += 1
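A hypothetical invocation of SearchAndSave, assuming a Selenium 3 driver (the snippet uses the legacy find_element_by_* API) that is already logged in with the persona's account, and RANDOM_STATE defined at module level:

from selenium import webdriver

RANDOM_STATE = 0
browser = webdriver.Chrome()
SearchAndSave(browser, user='no_click')  # search-history-only persona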