Example no. 1
def gatherSingleDateMultiProc(url, filename):
    # try the project's own date parser first, then fall back to htmldate
    date = dateparser.gatherDateMatch(url)
    if date == "na":
        try:
            date = find_date(url)
        except Exception as e:
            print(e)
            date = "ERROR:{}".format(e)
    if date is None:
        date = "na"
    # append the result as "url|date" to the output file
    with open(filename, 'a') as f:
        f.write(url + '|' + date + "\n")
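The name suggests this worker is meant to run under multiprocessing. A minimal driver sketch, assuming a placeholder URL list and output file name; functools.partial binds the shared filename so Pool.map can pass each URL on its own:

from functools import partial
from multiprocessing import Pool

if __name__ == '__main__':
    urls = ["https://example.org/a", "https://example.org/b"]  # placeholder URLs
    worker = partial(gatherSingleDateMultiProc, filename="dates_out.txt")  # hypothetical output file
    with Pool(processes=4) as pool:
        pool.map(worker, urls)

Note that several processes appending to the same file may interleave lines; a per-process file or a multiprocessing lock would avoid that.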
Example no. 2
    def fetch_feed(self, source_urls, paper, collection_name):
        feeds = []
        count = 0
        l = len(paper.articles)
        print("Total Urls:"+str(len(paper.articles)))
        if len(paper.articles)== 0:
            return 0
        NewsUtil().printProgressBar(0, l, prefix='Progress:', suffix='Complete', length=50)
        for article in paper.articles:
            # skip articles whose URL does not match any of the requested sources
            if not any(source_url in article.url for source_url in source_urls):
                count = count + 1
                NewsUtil().printProgressBar(count, l, prefix='Progress:', suffix='Complete', length=50)
                continue
            try:
                article = NewsPlease.from_url(article.url)
                date = htmldate.find_date(article.url)
                title = article.title
                description = article.description
                text = article.text
                if text != "" or text != None or description!=None or title!=None:
                    feed = {"published_date": date,
                                  "title": title,
                                  "source": article.url,
                                  "description": description,
                                  "text": text}
                    feeds.append(feed)

            except Exception as e:
                print(e)
            count = count + 1
            NewsUtil().printProgressBar(count, l, prefix='Progress:', suffix='Complete', length=50)
        total_feeds_inserted=Database().insert_data(collection_name, feeds)
        Database().delete_data(collection_name, {"text": ""})
        return total_feeds_inserted
Example no. 3
def resultat(soup):
    text = []
    hyperlien = []
    date = []
    for a in soup.find_all('a', href=True):
        #if 'tunisair' in a['href']:
        for pattern in patterns:
            try:
                if re.search(pattern, a['href']):
                    text.append(get_text_from_url(a['href']))
                    hyperlien.append(a['href'])
                    d = find_date(a['href'])
                    if d:
                        date.append(d)
                    else:
                        date.append(None)
            except Exception:
                continue

    return hyperlien, text, date
Example no. 4
def print_date(column):
    date = find_date(column)
    print(date)
Example no. 5
from htmldate import find_date
import pandas as pd
import numpy as np 
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import csv
df = pd.read_csv("dates.csv")
np_url = df['url'].values.tolist()
#print(df)
print(np_url)

dates = []
for url in np_url:
    date = find_date(url)
    dates.append(date)
    #print(date)

# print the collected dates once, after the loop has finished
for date in dates:
    print(date)

column = widgets.Dropdown(options=list(df['url']),description='Web Link')
ui = widgets.HBox([column])
def print_date(column):
    date = find_date(column)
    print(date)

out = widgets.interactive_output(print_date, {'column': column})

display(ui, out)

# df.to_csv('dates.csv')   
Example no. 6
def run_htmldate_extensive(htmlstring):
    '''run htmldate on content'''
    return find_date(htmlstring, original_date=True, extensive_search=True)
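A short usage sketch for the helper above; requests and the placeholder URL are assumptions:

import requests

# hypothetical usage: download a page and pass its raw HTML to the helper above
response = requests.get("https://example.org/some-article")  # placeholder URL
print(run_htmldate_extensive(response.text))  # a date string such as '2019-06-15', or None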
Example no. 7
def extract_metadata(filecontent, default_url=None, date_config=None):
    '''Main process for metadata extraction'''
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # correction: author not a name
    if metadata['author'] is not None:
        if ' ' not in metadata['author'] or metadata['author'].startswith(
                'http'):
            metadata['author'] = None
    # fix: try json-ld metadata and override
    metadata = extract_json(tree, metadata)
    # try with x-paths
    # title
    if metadata['title'] is None:
        metadata['title'] = extract_title(tree)
    # author
    if metadata['author'] is None:
        metadata['author'] = extract_author(tree)
    # url
    if metadata['url'] is None:
        metadata['url'] = extract_url(tree, default_url)
    # hostname
    if metadata['url'] is not None:
        metadata['hostname'] = extract_domain(metadata['url'])
    # extract date with external module htmldate
    if date_config is None:
        date_config = HTMLDATE_CONFIG
    date_config['url'] = metadata['url']
    try:
        metadata['date'] = find_date(tree, **date_config)
    # temporary fix for htmldate bug
    except UnicodeError:
        pass
    # sitename
    if metadata['sitename'] is None:
        metadata['sitename'] = extract_sitename(tree)
    if metadata['sitename'] is not None:
        if metadata['sitename'].startswith('@'):
            # scrap Twitter ID
            metadata['sitename'] = re.sub(r'^@', '', metadata['sitename'])
        # capitalize
        try:
            if not '.' in metadata['sitename'] and not metadata['sitename'][
                    0].isupper():
                metadata['sitename'] = metadata['sitename'].title()
        # fix for empty name
        except IndexError:
            pass
    else:
        # use URL
        if metadata['url']:
            mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)',
                               metadata['url'])
            if mymatch:
                metadata['sitename'] = mymatch.group(1)
    # categories
    if not metadata['categories']:
        metadata['categories'] = extract_catstags('category', tree)
    # tags
    if not metadata['tags']:
        metadata['tags'] = extract_catstags('tags', tree)
    # for safety: length check
    for key, value in metadata.items():
        if value is not None and len(value) > 10000:
            metadata[key] = value[:9999] + '…'
    # return
    return metadata
Example no. 8
import requests
from htmldate import find_date
import time

results = requests.get('http://www.ibew.org/IBEW-COE')
results.raise_for_status()
oldDate = find_date('http://www.ibew.org/IBEW-COE', outputformat='%c')
print("Last time webstie was updated was:", oldDate)
print("Checking for updates")
# newDate = find_date('http://www.ibew.org/IBEW-COE', outputformat='%c')
# print(oldDate)
while True:
    newDate = find_date('http://www.ibew.org/IBEW-COE', outputformat='%c')
    if oldDate != newDate:
        print("Date was updated from", oldDate, "to", newDate)
        oldDate = newDate
    time.sleep(86400)  # check again in 24 hours
Example no. 9
def wrapper_find_date(url):
    print(url)
    p = find_date(url)
    print('Found date for url', p)
    return p
Example no. 10
def validate_date(test):
    document_y_or_n = test.split('/')[-1]
    pdf_y_or_n = test.split('.')[-1]
    if pdf_y_or_n == 'pdf' or document_y_or_n == 'document':
        return ''
    # call find_date only once instead of twice
    result = find_date(test)
    return result if result else ""
Example no. 11
def site_score(url):
    res = requests.get(url)
    html_page = res.content
    soup = BeautifulSoup(html_page, 'html.parser')
    text = soup.find_all(text=True)
    output = ''

    allowed = [
        "p",
        "span",
        "li",
        #"h1",
        #"h2",
        #"h3",
        #"h4",
        #"h5",
        "div",
        "a"
    ]
    for t in text:
        if t.parent.name in allowed:
            output += '{} '.format(re.sub('[^a-zA-Z0-9-_*. ?:()!]', '', t))
    #logo, submit - ezzat
    #deploy and easy install extension -bill
    #add question answer system - me
    #scroll to answer
    analysisSub = TextBlob(output).subjectivity
    bias_score = (1 - analysisSub) * 10

    authors = re.findall(r"[A-Z][a-z]+,?\s+(?:[A-Z][a-z]*\.?\s*)?[A-Z][a-z]+",
                         output)
    author_backing_score = 10 if len(authors) > 0 else 0
    original = find_date(url, original_date=True)
    update = find_date(url)
    relevance_score = 0
    if (original is not None):
        today = datetime.datetime.today()
        date_format = "%Y-%m-%d"
        original_date = datetime.datetime.strptime(original, date_format)
        diff = (today - original_date).days

        relevance_score = 10 * pow(e, (-1 / 8000) * diff)

    tool = language_check.LanguageTool('en-US')
    mistakes = len(tool.check(output))

    mistakes_to_article = float(mistakes) / len(output)
    mistakes_score = 10 * pow(e, -20 * mistakes_to_article)

    domains = {".edu": 10, ".com": 7, ".gov": 10, ".org": 8, ".net": 8}
    url_score = 0
    for i in domains:
        if (i in url):
            url_score = domains[i]
    if (len(url) > 100):
        url_score -= (len(url) - 100) * 0.1
    if ("~" in url):
        url_score *= 0.6
    scores = {
        "url_score": url_score,
        "mistakes_score": mistakes_score,
        "relevance_score": relevance_score,
        "author_score": author_backing_score,
        "bias_score": bias_score,
        "total": (url_score + mistakes_score + relevance_score +
                  author_backing_score + bias_score) / 5
    }
    #print("URL: "+str(url_score))
    #print("Mistakes: "+str(mistakes_score))
    #print("Relevance: "+str(relevance_score))
    #print("Author: " + str(author_backing_score))
    #print("Bias: " + str(bias_score))
    #print("Total Score: " +total)
    return scores


#site_score("https://www.pbs.org/crucible/tl5.html")
Example no. 12
def extract_metadata(filecontent, default_url=None, date_config=None):
    '''Main process for metadata extraction'''
    # create named tuple
    Metadata = namedtuple('Metadata', [
        'title', 'author', 'url', 'description', 'sitename', 'date',
        'categories', 'tags'
    ])
    # Metadata.__new__.__defaults__ = (None,) * len(Metadata._fields)
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # meta tags
    mymeta = Metadata._make(examine_meta(tree))
    # correction: author not a name
    if mymeta.author is not None:
        if ' ' not in mymeta.author or mymeta.author.startswith('http'):
            mymeta = mymeta._replace(author=None)
    # fix: try json-ld metadata and override
    mymeta = extract_json(tree, mymeta)
    # extract date with external module htmldate
    if date_config is None:
        date_config = HTMLDATE_CONFIG
    date_config['url'] = mymeta.url
    mymeta = mymeta._replace(date=find_date(tree, **date_config))
    # try with x-paths
    # title
    if mymeta.title is None:
        mymeta = mymeta._replace(title=extract_title(tree))
    # author
    if mymeta.author is None:
        mymeta = mymeta._replace(author=extract_author(tree))
    # url
    if mymeta.url is None:
        mymeta = mymeta._replace(url=extract_url(tree, default_url))
    # sitename
    if mymeta.sitename is None:
        mymeta = mymeta._replace(sitename=extract_sitename(tree))
    if mymeta.sitename is not None:
        if mymeta.sitename.startswith('@'):
            # scrap Twitter ID
            mymeta = mymeta._replace(
                sitename=re.sub(r'^@', '', mymeta.sitename))
        # capitalize
        if not '.' in mymeta.sitename and not mymeta.sitename[0].isupper():
            mymeta = mymeta._replace(sitename=mymeta.sitename.title())
    else:
        # use URL
        if mymeta.url:
            mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)',
                               mymeta.url)
            if mymatch:
                mymeta = mymeta._replace(sitename=mymatch.group(1))
    # categories
    if not mymeta.categories:
        mymeta = mymeta._replace(categories=extract_catstags('category', tree))
    # tags
    if not mymeta.tags:
        mymeta = mymeta._replace(tags=extract_catstags('tags', tree))
    # return
    return mymeta
Example no. 13
def script(text):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')
    links_to_add = []  # Use for adding all links to CSV

    text = pre_process(text)
    #print(text)

    #load a set of stop words
    stopwords = get_stop_words("resources/stopwords.txt")

    #get the text column
    docs = list(text.split(" "))

    #create a vocabulary of words,
    #ignore words that appear in 85% of documents,
    #eliminate stop words
    cv = CountVectorizer(max_df=0.85, stop_words=stopwords, max_features=10000)
    word_count_vector = cv.fit_transform(docs)
    warn_words = []
    warn_words = list(cv.vocabulary_.keys())[:10]
    #warn_words.append('covid')
    '''
    Check with RSS feeds
    - similarity score obtained with each website
    - added to total
    '''
    from nltk.corpus import stopwords
    urls = []
    url1 = "https://blog.amnestyusa.org/feed/"  #DESCRIPTION
    url2 = "https://news.un.org/feed/subscribe/en/news/topic/human-rights/feed/rss.xml"  #DESCRIPTION
    url3 = "https://bhr.stern.nyu.edu/blogs?format=RSS"  #DESCRIPTION
    urls1 = [url1, url2, url3]
    total = 0
    links = []
    for everyUrl in urls1:
        resp = requests.get(everyUrl)
        soup = BeautifulSoup(resp.content, features="xml")
        items = soup.findAll('item')
        # Program to measure the similarity between
        # two sentences using cosine similarity. first sentence
        # is from query (text) and second is description
        for item in items:
            Y = item.description.text
            linkForItem = item.link.text

            # tokenization
            X_list = word_tokenize(text)
            Y_list = word_tokenize(Y)

            # sw contains the list of stopwords
            sw = set(stopwords.words('english'))
            l1 = []
            l2 = []

            # remove stop words from the string
            X_set = {w for w in X_list if not w in sw}
            Y_set = {w for w in Y_list if not w in sw}

            # form a set containing keywords of both strings
            rvector = X_set.union(Y_set)
            for w in rvector:
                if w in X_set: l1.append(1)  # create a vector
                else: l1.append(0)
                if w in Y_set: l2.append(1)
                else: l2.append(0)
            c = 0

            # cosine formula
            for i in range(len(rvector)):
                c += l1[i] * l2[i]
            cosine = 0  # guard against an undefined value when the denominator is zero
            if ((sum(l1) * sum(l2)) ** 0.5) != 0:
                cosine = c / float((sum(l1) * sum(l2)) ** 0.5)
            if cosine > 0:
                total += 1
                print("Yes! Found similar sentence ", cosine)
                links.append([linkForItem])
    print("All Links: ", links)
    '''
    Check with RSS feeds for 2 more urls
    - similarity score obtained with each website
    - added to total
    '''
    from nltk.corpus import stopwords
    url4 = "https://www.nchrd.org/category/news/feed/"  #Description has img src etc. unneeded stuff
    url5 = "https://www.theguardian.com/law/human-rights/rss"  #Description has img src etc. unneeded stuff
    url6 = "https://www.reddit.com/r/humanrights/.rss?format=xml"
    urls2 = [url4]

    for everyUrl in urls2:
        resp = requests.get(everyUrl)
        soup = BeautifulSoup(resp.content, features="xml")
        wholeItems = soup.findAll('item')
        for everyItem in wholeItems:
            linkForItem = everyItem.link.text
            p_tags = everyItem.description.text
            Y = p_tags.replace(']]>', '')

            # Program to measure the similarity between
            # two sentences using cosine similarity. first sentence
            # is from query (text) and second is description

            # tokenization
            X_list = word_tokenize(text)
            Y_list = word_tokenize(Y)

            # sw contains the list of stopwords
            sw = set(stopwords.words('english'))
            l1 = []
            l2 = []

            # remove stop words from the string
            X_set = {w for w in X_list if not w in sw}
            Y_set = {w for w in Y_list if not w in sw}

            # form a set containing keywords of both strings
            rvector = X_set.union(Y_set)
            for w in rvector:
                if w in X_set: l1.append(1)  # create a vector
                else: l1.append(0)
                if w in Y_set: l2.append(1)
                else: l2.append(0)
            c = 0

            # cosine formula
            for i in range(len(rvector)):
                c += l1[i] * l2[i]
            cosine = 0  # guard against an undefined value when the denominator is zero
            if ((sum(l1) * sum(l2)) ** 0.5) != 0:
                cosine = c / float((sum(l1) * sum(l2)) ** 0.5)
            if cosine > 0:
                total += 1
                print("Yes! Found similar sentence ", cosine)
                links.append([linkForItem])
    print("All Links: ", links)
    '''
    Check with news sites
    - for news articles with similar keywords, check text similarity and add to the similarity score
    '''
    from nltk.corpus import stopwords
    import bs4
    from bs4 import BeautifulSoup as soup
    from urllib.request import urlopen
    import pandas as pd
    from htmldate import find_date
    import csv
    from csv import writer

    filename = "NEWS.csv"
    f = open(filename, "a", encoding='utf-8')
    headers = ["Statement", "Link", "Date"]

    upperframe = []
    news_url = "https://news.google.com/news/rss"
    Client = urlopen(news_url)
    xml_page = Client.read()
    Client.close()
    soup_page = soup(xml_page, "xml")
    news_list = soup_page.findAll("item")

    frame = []
    Links = "Links.csv"
    f1 = open(Links, "a", encoding='utf-8')
    linkhead = ['Link']
    for news in news_list:
        texts = news.title.text
        lsts = warn_words
        for l in lsts:
            if l in texts:
                print(news.title.text)
                print(news.link.text)
                print("\n")

                date = find_date(news.link.text)
                upperframe = [news.title.text, news.link.text, date]
                frame.append(upperframe)
                links.append([news.link.text])
                break
    print(links)
    with open(filename, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(headers)
        csvwriter.writerows(frame)

    with open(Links, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(linkhead)
        csvwriter.writerows(links)

    for titleNews in frame:
        title = titleNews[0]
        # tokenization: compare the query text against each collected news title
        X_list = word_tokenize(text)
        Y_list = word_tokenize(title)

        # sw contains the list of stopwords
        sw = stopwords.words('english')
        l1 = []
        l2 = []
        # remove stop words from the string
        X_set = {w for w in X_list if not w in sw}
        Y_set = {w for w in Y_list if not w in sw}
        # form a set containing keywords of both strings
        rvector = X_set.union(Y_set)
        for w in rvector:
            if w in X_set: l1.append(1)  # create a vector
            else: l1.append(0)
            if w in Y_set: l2.append(1)
            else: l2.append(0)
        c = 0
        # cosine formula
        for i in range(len(rvector)):
            c += l1[i] * l2[i]
        cosine = 0  # guard against an undefined value when the denominator is zero
        if ((sum(l1) * sum(l2)) ** 0.5) != 0:
            cosine = c / float((sum(l1) * sum(l2)) ** 0.5)
        if cosine > 0:
            total += 1
            print("Yes! Found similar sentence ", cosine)

    #twitter
    from nltk.corpus import stopwords

    #twitter credentials
    consumer_key = 'H9y4X6T2VD9X1yYC57JGuPDGF'
    consumer_secret_key = '2MH4iJ8bQ7awBUrtWbg8EF6fvmrMHrmTVYESME8zp3wl2vtyHb'
    #Reformat the keys and encode them
    key_secret = '{}:{}'.format(consumer_key,
                                consumer_secret_key).encode('ascii')
    #Transform from bytes to bytes that can be printed
    b64_encoded_key = base64.b64encode(key_secret)
    #Transform from bytes back into Unicode
    b64_encoded_key = b64_encoded_key.decode('ascii')

    #authentication
    base_url = 'https://api.twitter.com/'
    auth_url = '{}oauth2/token'.format(base_url)
    auth_headers = {
        'Authorization': 'Basic {}'.format(b64_encoded_key),
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
    }
    auth_data = {'grant_type': 'client_credentials'}
    auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data)
    print(auth_resp.status_code)  #200 indicates successful authentication
    access_token = auth_resp.json()['access_token']  #auth response stored
    #print(warn_words)

    #warn_words = ['killed', 'police', 'brutality', 'systemic', 'racism', 'Covid']
    rows = []
    twt_total = 0
    for ip in warn_words:
        search_headers = {'Authorization': 'Bearer {}'.format(access_token)}

        search_params = {'q': ip, 'lang': 'eu', 'result_type': 'mixed'}

        search_url = 'https://api.twitter.com/1.1/search/tweets.json'
        search_resp = requests.get(search_url,
                                   headers=search_headers,
                                   params=search_params)
        # print(ip+" "+str(search_resp.status_code))
        search_data = search_resp.json()  # result of search in json format
        # print(search_data)
        with open('raw_tweets.json', 'a+',
                  encoding='utf-8') as f:  # Will be needed later
            # data = json.load(f.decode('utf8'))
            # temp=data['statuses']
            # temp.append(data)
            json.dump(search_data, f, ensure_ascii=False, indent=4
                      )  # file that has all tweets collected for each keyword
        with open('raw_tweets_temp.json', 'w', encoding='utf-8') as f:
            json.dump(
                search_data, f, ensure_ascii=False,
                indent=4)  # temp file that is overwritten for every keyword

        # print(ip+" "+str(len(search_data['statuses'])))
        for i in range(0, len(search_data['statuses'])):
            # print('tweet number',i+1,'=',search_data['statuses'][i])
            row = []
            row.append(search_data['statuses'][i]['id'])
            row.append(search_data['statuses'][i]['id_str'])
            row.append(ip)
            row.append(search_data['statuses'][i]['created_at'])
            row.append(search_data['statuses'][i]['text'])
            row.append(search_data['statuses'][i]['favorite_count'])
            row.append(search_data['statuses'][i]['retweet_count'])
            # print(row)
            rows.append(row)
    # print(rows)

    # fields = ["id","id_str","keyword","created_at","text","likes","retweeted","hashtags"]
    # with open('tweets.csv', 'w', encoding='utf-8') as f: #to collect tweets over time change permission to a+ and remove writerow fields
    #     csvwriter = csv.writer(f)
    #     csvwriter.writerow(fields)
    #     csvwriter.writerows(rows)

    cleaned_tweets = []
    for row in rows:
        # print(row[4])#is the text of tweet
        # extracting hashtags
        h = [s for s in row[4].split() if s.startswith('#')]
        # print(h)
        row.append(h)
        # add hashtag segmentation here later
        # forming a separate feature for cleaned tweets
        # cleaned tweets: don't have stop words, don't have hashtags URLS, Emojis, mentions
        s = p.clean(row[4]).lower()
        row.append(s)
        cleaned_tweets.append(s)
        s = word_tokenize(s)
        s = [i for i in s if i not in stopwords.words('english')]
        row.append(s)

    fields = [
        "id", "id_str", "keyword", "created_at", "tweet_text", "likes",
        "retweeted", "hashtags", "clean_text_str", "clean_text"
    ]
    with open(
            'tweets.csv', 'w', encoding='utf-8'
    ) as f:  # to collect tweets over time change permission to a+ and remove writerow fields
        csvwriter = csv.writer(f)
        csvwriter.writerow(fields)
        csvwriter.writerows(rows)

    print(cleaned_tweets)

    for i in cleaned_tweets:
        # Program to measure the similarity between
        # two sentences using cosine similarity. first sentence
        # is from query (text) and second is description

        # tokenization
        X_list = word_tokenize(text)
        Y_list = word_tokenize(i)

        # sw contains the list of stopwords
        sw = set(stopwords.words('english'))
        l1 = []
        l2 = []

        # remove stop words from the string
        X_set = {w for w in X_list if not w in sw}
        Y_set = {w for w in Y_list if not w in sw}

        # form a set containing keywords of both strings
        rvector = X_set.union(Y_set)
        for w in rvector:
            if w in X_set: l1.append(1)  # create a vector
            else: l1.append(0)
            if w in Y_set: l2.append(1)
            else: l2.append(0)
        c = 0

        # cosine formula
        for k in range(len(rvector)):  # new index name so the outer loop variable i is not shadowed
            c += l1[k] * l2[k]
        cosine = 0  # guard against an undefined value when the denominator is zero
        if ((sum(l1) * sum(l2)) ** 0.5) != 0:
            cosine = c / float((sum(l1) * sum(l2)) ** 0.5)
        if cosine > 0:
            twt_total += 1
            print("Yes! Found similar sentence ", cosine)
            #links.append([linkForItem])
    print("Twitter Total ", twt_total)

    return links, twt_total
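The token-set cosine similarity above is computed four times with the same inline block (RSS descriptions, news titles, tweets). A minimal sketch of that calculation factored into a standalone helper; the name token_cosine_similarity is hypothetical and assumes the NLTK resources downloaded at the top of script():

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def token_cosine_similarity(a, b):
    # binary bag-of-words cosine similarity between two sentences
    sw = set(stopwords.words('english'))
    a_set = {w for w in word_tokenize(a) if w not in sw}
    b_set = {w for w in word_tokenize(b) if w not in sw}
    vocabulary = a_set.union(b_set)
    l1 = [1 if w in a_set else 0 for w in vocabulary]
    l2 = [1 if w in b_set else 0 for w in vocabulary]
    dot = sum(x * y for x, y in zip(l1, l2))
    norm = (sum(l1) * sum(l2)) ** 0.5
    return dot / norm if norm else 0.0

Each inline block could then be replaced by a call such as token_cosine_similarity(text, Y).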
Example no. 14
def extract_metadata(filecontent, default_url=None, date_config=None, fastmode=False, author_blacklist=None):
    """Main process for metadata extraction.

    Args:
        filecontent: HTML code as string.
        default_url: Previously known URL of the downloaded document.
        date_config: Provide extraction parameters to htmldate as dict().
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.

    Returns:
        A dict() containing the extracted metadata information or None.

    """
    # init
    if author_blacklist is None:
        author_blacklist = set()
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # to check: remove it and replace with author_blacklist in test case
    if metadata.author is not None and ' ' not in metadata.author:
        metadata.author = None
    # fix: try json-ld metadata and override
    try:
        metadata = extract_meta_json(tree, metadata)
    # todo: fix bugs in json_metadata.py
    except TypeError as err:
        LOGGER.warning('error in JSON metadata extraction: %s', err)
    # try with x-paths
    # title
    if metadata.title is None:
        metadata.title = extract_title(tree)
    # check author in blacklist
    if metadata.author is not None and len(author_blacklist) > 0:
        metadata.author = check_authors(metadata.author, author_blacklist)
    # author
    if metadata.author is None:
        metadata.author = extract_author(tree)
    # recheck author in blacklist
    if metadata.author is not None and len(author_blacklist) > 0:
        metadata.author = check_authors(metadata.author, author_blacklist)
    # url
    if metadata.url is None:
        metadata.url = extract_url(tree, default_url)
    # hostname
    if metadata.url is not None:
        metadata.hostname = extract_domain(metadata.url)
    # extract date with external module htmldate
    if date_config is None:
        # decide on fast mode
        if fastmode is False:
            date_config = HTMLDATE_CONFIG_EXTENSIVE
        else:
            date_config = HTMLDATE_CONFIG_FAST
    date_config['url'] = metadata.url
    metadata.date = find_date(tree, **date_config)
    # sitename
    if metadata.sitename is None:
        metadata.sitename = extract_sitename(tree)
    if metadata.sitename is not None:
        if metadata.sitename.startswith('@'):
            # scrap Twitter ID
            metadata.sitename = re.sub(r'^@', '', metadata.sitename)
        # capitalize
        try:
            if (
                '.' not in metadata.sitename
                and not metadata.sitename[0].isupper()
            ):
                metadata.sitename = metadata.sitename.title()
        # fix for empty name
        except IndexError as err:
            LOGGER.warning('error in sitename extraction: %s', err)
    # use URL
    elif metadata.url:
        mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)', metadata.url)
        if mymatch:
            metadata.sitename = mymatch.group(1)
    # categories
    if not metadata.categories:
        metadata.categories = extract_catstags('category', tree)
    # tags
    if not metadata.tags:
        metadata.tags = extract_catstags('tag', tree)
    # license
    metadata.license = extract_license(tree)
    # safety checks
    metadata.clean_and_trim()
    # return result
    return metadata
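A minimal usage sketch for the function above, assuming the surrounding module provides the helpers it references (load_html, examine_meta, find_date, …) and that the page is fetched with requests; the URL is a placeholder:

import requests

# hypothetical usage on a downloaded page
html_string = requests.get("https://example.org/article").text  # placeholder URL
metadata = extract_metadata(html_string)
if metadata is not None:
    print(metadata.title, metadata.author, metadata.date)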
Example no. 15
def get_date(urlinput):
    return find_date(urlinput)
Example no. 16
def run_htmldate_fast(htmlstring):
    '''run htmldate on content'''
    result = find_date(htmlstring, original_date=True, extensive_search=False)
    return result
Example no. 17
def Get_Date_Created(url_input):
    try:
        date = find_date(url_input)
        return date
    except:
        return None
Example no. 18
	mycursor.execute("create TABEL scraped")


my_url='https://innovaccer.com/news/'


context = ssl._create_unverified_context()
uClient = urlopen(my_url, context=context)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html,"html.parser")

containers = page_soup.findAll("div",{"class":"col-lg-8"})

for container in containers:
    heading = container.h6.text

    description = container.p.text

    link = container.a["href"]

    date = htmldate.find_date(container.text)

    mycursor.execute("INSERT INTO scraped(heading, description, link, date) VALUES (?, ?, ?, ?)", (heading, description, link, date))

    print("heading : " + heading)
    print("link : " + str(my_url) + str(link))
    print("date : ", date)
Example no. 19
def extract_metadata(filecontent, default_url=None, date_config=None):
    """Main process for metadata extraction.

    Args:
        filecontent: HTML code as string.
        default_url: Previously known URL of the downloaded document.
        date_config: Provide extraction parameters to htmldate as dict().

    Returns:
        A dict() containing the extracted metadata information or None.

    """
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # correction: author not a name
    if metadata['author'] is not None:
        if ' ' not in metadata['author'] or metadata['author'].startswith(
                'http'):
            metadata['author'] = None
    # fix: try json-ld metadata and override
    metadata = extract_json(tree, metadata)
    # try with x-paths
    # title
    if metadata['title'] is None:
        metadata['title'] = extract_title(tree)
    # author
    if metadata['author'] is None:
        metadata['author'] = extract_author(tree)
    # url
    if metadata['url'] is None:
        metadata['url'] = extract_url(tree, default_url)
    # hostname
    if metadata['url'] is not None:
        metadata['hostname'] = extract_domain(metadata['url'])
    # extract date with external module htmldate
    if date_config is None:
        date_config = HTMLDATE_CONFIG
    date_config['url'] = metadata['url']
    try:
        metadata['date'] = find_date(tree, **date_config)
    # temporary fixes for htmldate bugs # todo: remove later
    except (TypeError, UnicodeError):
        pass
    # sitename
    if metadata['sitename'] is None:
        metadata['sitename'] = extract_sitename(tree)
    if metadata['sitename'] is not None:
        if metadata['sitename'].startswith('@'):
            # scrap Twitter ID
            metadata['sitename'] = re.sub(r'^@', '', metadata['sitename'])
        # capitalize
        try:
            if not '.' in metadata['sitename'] and not metadata['sitename'][
                    0].isupper():
                metadata['sitename'] = metadata['sitename'].title()
        # fix for empty name
        except IndexError:
            pass
    else:
        # use URL
        if metadata['url']:
            mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)',
                               metadata['url'])
            if mymatch:
                metadata['sitename'] = mymatch.group(1)
    # categories
    if not metadata['categories']:
        metadata['categories'] = extract_catstags('category', tree)
    # tags
    if not metadata['tags']:
        metadata['tags'] = extract_catstags('tags', tree)
    # license
    for element in tree.xpath('//a[@rel="license"]', ):
        if element.text is not None:
            metadata['license'] = trim(element.text)
            break
    # for safety: length check
    for key, value in metadata.items():
        if value is not None and len(value) > 10000:
            metadata[key] = value[:9999] + '…'
    # remove spaces and control characters
    for item in metadata:
        if metadata[item] is not None and isinstance(metadata[item], str):
            metadata[item] = line_processing(metadata[item])
    # return
    return metadata
Example no. 20
def extract_date(tree, url):
    '''Extract the date using external module htmldate'''
    docdate = find_date(tree, extensive_search=False, url=url)
    return docdate
Example no. 21
def get_date(data):
    # this function gets the last modified date of a website
    x = find_date(data)
    print(x)
Example no. 22
def getDate(htmldoc):
    mytree = html.fromstring(htmldoc)

    return find_date(mytree, outputformat='%Y-%m-%d %H:%M')
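A small usage sketch, assuming the markup has already been saved to a local file (hypothetical name) and that lxml's html module is imported as the function requires:

from lxml import html  # getDate relies on html.fromstring

with open("page.html", "r", encoding="utf-8") as fh:  # hypothetical local file
    print(getDate(fh.read()))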
Example no. 23
def SearchAndSave(
           browser, # logged-in with specific user behavior
           user, #first_hon_click, first_non_click, time_click, readability_click, no_click, average_click
           data='../data/queries2019_400.csv', # path to queries
           max_num_res=8, # number of results
           ):
    
    # global variable so that all users search the same queries in the same order; incremented every day
    global RANDOM_STATE    
    # get all queries and choose which to search
    queries_all = pd.read_csv(data, sep=',')  
    queries_today = np.asarray(queries_all.iloc[RANDOM_STATE:RANDOM_STATE + 2]).flatten() # CHANGE + 2
    random.Random(RANDOM_STATE).shuffle(queries_today)
    #parameters and variables
    dict_results = {  "QUERY":[],
                      "RANK": [],
                      "URL": [],
                      'HON': [],
                      'SMOG': [],
                      'PUBLICATION_DATE': [],
                      'CLICKS&TIME': [],
                             } 
    WINDOW_SIZE = 540 # size of scrolling window
    READ_TIME = 300 # this value should be 60-300 # CHANGE 300
    WAIT_TIME = 10 # this value should be < 60 # CHANGE 10
    CARRY_OVER_TIME= 660
    URL_PROBAB = [0.32, 0.25, 0.18, 0.11, 0.07, 0.04, 0.02, 0.01]  # average statistics on clicking Google results
    
    # search for today queries
    for query in queries_today:  
        # start counting time for one search, which should be at least 11 min 
        start_time = time.time()
        # find all elements provided on google search results 
        browser.get("https://google.com") 
        browser.find_element_by_xpath("//input[@name='q']").send_keys(query + Keys.ENTER)
        # wait to upload HON-labels
        time.sleep(WAIT_TIME) 
        browser.execute_script("window.scrollTo(0, window.scrollY + {0})".format(2 * WINDOW_SIZE)) 
        # get number of google results 
        num_res_google = browser.find_element_by_xpath('//*[@id="result-stats"]').text
        try:
            num_res_int = int(re.search('About (.*) results', num_res_google).group(1).replace(',', ''))
        except:
            try:
                num_res_int = int(re.search('(.*) results', num_res_google).group(1).replace(',', ''))
            except:
                pass
        # get time of the start query search
        start_query_time = datetime.now().strftime("%H:%M:%S")   
        # reset variables
        num_res=0
        search_results = []
        # get first 8 URLs and HON-labels from the SERP
        elements = browser.find_elements_by_class_name('rc')
        try:
            first_element = browser.find_element_by_css_selector("div[class='g mnr-c g-blk']")
            temp1 = first_element.find_element_by_css_selector("div[class='r']").find_element_by_css_selector('a').get_attribute('href')
            temp2 = first_element.find_element_by_css_selector('div[target="_blank"]').get_attribute('title')
            try:
                temp3 = first_element.find_element_by_css_selector("div[class='s']").find_element_by_css_selector("span[class='f']").text
                temp3 = datetime.strptime(temp3, '%b %d, %Y -').strftime('%Y-%m-%d')
            except:
                try:
                    temp3 = first_element.find_element_by_css_selector("div[class='s']").find_element_by_css_selector("div[class='dhIWPd f']").text
                    temp3 = re.search('- (.*) - C', temp3).group(1)+'-06-15'
                except:
                    temp3 = None   
            search_results.append([temp1, temp2, temp3])
            num_res+=1
        except:
            try:
                first_element = browser.find_elements_by_class_name('g')[0]
                temp1 = first_element.find_element_by_css_selector("div[class='r']").find_element_by_css_selector('a').get_attribute('href')
                temp2 = first_element.find_element_by_css_selector('div[target="_blank"]').get_attribute('title')
                try:
                    temp3 = first_element.find_element_by_css_selector("div[class='s']").find_element_by_css_selector("span[class='f']").text
                    temp3 = datetime.strptime(temp3, '%b %d, %Y -').strftime('%Y-%m-%d')
                except:
                    try:
                        temp3 = first_element.find_element_by_css_selector("div[class='s']").find_element_by_css_selector("div[class='dhIWPd f']").text
                        temp3 = re.search('- (.*) - C', temp3).group(1)+'-06-15'
                    except:
                        temp3 = None   
                search_results.append([temp1, temp2, temp3])
                num_res+=1
            except:
                pass
        temp = num_res
        try:
            related_questions = browser.find_element_by_css_selector("div[class='g kno-kp mnr-c g-blk']")
            temp += len(related_questions.find_elements_by_css_selector("div[class='related-question-pair']"))
        except:
            pass
        for j in range(temp, len(elements)):
            temp1 = elements[j].find_element_by_css_selector("div[class='r']").find_element_by_css_selector('a').get_attribute('href')
            temp2 = elements[j].find_element_by_css_selector("div[class='r']").find_element_by_css_selector('div[target="_blank"]').get_attribute('title')
            try:
                temp3 = elements[j].find_element_by_css_selector("div[class='s']").find_element_by_css_selector("span[class='f']").text
                temp3 = datetime.strptime(temp3, '%b %d, %Y -').strftime('%Y-%m-%d')
            except:
                try:
                    temp3 = elements[j].find_element_by_css_selector("div[class='s']").find_element_by_css_selector("div[class='dhIWPd f']").text
                    temp3 = re.search('- (.*) - C', temp3).group(1)+'-06-15'
                except:
                    temp3 = None  
            search_results.append([temp1, temp2, temp3])
            num_res+=1       
            if num_res==max_num_res:
                break     
        # get user-agent of the browser 
        headers = requests.utils.default_headers()
        headers['User-Agent'] = browser.execute_script("return navigator.userAgent;")
        # get SMOG grades for all URLs and  publication date       
        for i in range(len(search_results)):
            if search_results[i][0][-3:]=='pdf':
                # extract publication date using htmldate library 
                try:
                    if search_results[i][2] is None:
                        search_results[i][2] = '1998-09-04'
                    # extract SMOG-grade
                    urlretrieve(search_results[i][0], '../../ResultsISU/temp.pdf')
                    #Write a for-loop to open many files (leave a comment if you'd like to learn how).
                    filename = '../../ResultsISU/temp.pdf'
                    #open allows you to read the file.
                    pdfFileObj = open(filename,'rb')
                    #The pdfReader variable is a readable object that will be parsed.
                    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                    #Discerning the number of pages will allow us to parse through all the pages.
                    num_pages = pdfReader.numPages
                    count = 0
                    full_text = ''
                    #The while loop will read each page.
                    while count < num_pages:
                        pageObj = pdfReader.getPage(count)
                        count +=1
                        full_text += pageObj.extractText() 
                    smog_grade = textstat.smog_index(full_text) 
                except:
                    smog_grade = 0.0
            else:
                # extract publication date using htmldate library 
                if search_results[i][2] == None:
                    search_results[i][2] = find_date(search_results[i][0]) 
                # extract SMOG-grade
                try:
                    response = requests.get(search_results[i][0], headers=headers)
                    paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
                    full_text = ''
                    for paragraph in paragraphs:
                        if not paragraph.is_boilerplate:
                            full_text += '\n' + paragraph.text
                    if full_text=='' and '.'.join(tldextract.extract(search_results[i][0])) != 'www.youtube.com':
                        for paragraph in paragraphs:
                            if not paragraph.is_heading:
                                full_text += '\n' + paragraph.text
                    smog_grade = textstat.smog_index(full_text) 
                except:
                    try:
                        response = requests.get(search_results[i][0], verify=False, headers=headers)
                        paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
                        full_text = ''
                        for paragraph in paragraphs:
                            if not paragraph.is_boilerplate:
                                full_text += '\n' + paragraph.text
                        if full_text=='' and '.'.join(tldextract.extract(search_results[i][0])) != 'www.youtube.com':
                            for paragraph in paragraphs:
                                if not paragraph.is_heading:
                                    full_text += '\n' + paragraph.text
                        smog_grade = textstat.smog_index(full_text) 
                    except:
                        smog_grade = 0.0
            # append SMOG and results
            if smog_grade > 0.0:
                search_results[i].append(smog_grade)
            else:
                search_results[i].append(100)
            # edit PUBLICATION DATE results
            if search_results[i][2] is None or search_results[i][2] == '2020-01-01' or search_results[i][2] == '2020-01-24':
                search_results[i][2] = '1998-09-04'
        # get clicks
        clicked_urls =  [''] * len(search_results)
        
##########################################################################################################################
        
######## ISU with first HON-certified result preferences  
        if user == 'first_hon_click':
            try:
                # get index and then url of wanted result
                ind = np.where(np.array(search_results)=='HONcode certified')[0][0]
                first_hon_url = search_results[ind][0]                
                first_hon_element = browser.find_element_by_xpath('//a[@href="'+first_hon_url+'"]')
                # click on first HON-certified URL
                first_hon_element.click()
                clicked_urls[ind] = 'clicked @ {0}'.format(datetime.now().strftime("%H:%M:%S") )
                # scrolling the page
                ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                # after spending READ_TIME on website go to google.com
                browser.get("https://google.com")          
            except:
                try:
                    # get index and then url of wanted result
                    ind = np.where(np.array(search_results)=='HONcode certified')[0][0]
                    first_hon_url = search_results[ind][0]                    
                    first_hon_element = browser.find_element_by_xpath('//a[@href="'+first_hon_url+'"]')
                    # click on first HON-certified URL
                    browser.execute_script("arguments[0].click();", first_hon_element)
                    clicked_urls[ind] = 'clicked @ {0}'.format(datetime.now().strftime("%H:%M:%S") )
                    time.sleep(5)
                    # scrolling the page
                    ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                    # after spending READ_TIME on website go to google.com
                    browser.get("https://google.com")         
                except:
                    # if there is no HON-certified result - DO NOTHING
                    browser.get("https://google.com")  

######## ISU with first NON-certified result preferences      
        if user == 'first_non_click':
            try:
                # get index and then url of wanted result
                ind = np.where(np.array(search_results)=='')[0][0]
                first_non_url = search_results[ind][0]   
                first_non_element = browser.find_element_by_xpath('//a[@href="'+first_non_url+'"]')
                # click on first NON HON-certified URL
                first_non_element.click()
                clicked_urls[ind] = 'clicked @ {0}'.format(datetime.now().strftime("%H:%M:%S") )
                # scrolling the page
                ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                # after spending READ_TIME on website go to google.com
                browser.get("https://google.com")          
            except:
                try:
                    # get index and then url of wanted result
                    ind = np.where(np.array(search_results)=='')[0][0]
                    first_non_url = search_results[ind][0]   
                    first_non_element = browser.find_element_by_xpath('//a[@href="'+first_non_url+'"]')
                    # click on first NON HON-certified URL
                    browser.execute_script("arguments[0].click();", first_non_element)
                    clicked_urls[ind] = 'clicked @ {0}'.format(datetime.now().strftime("%H:%M:%S") )
                    time.sleep(5)
                    # scrolling the page
                    ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                    # after spending READ_TIME on website go to google.com
                    browser.get("https://google.com")         
                except:
                    # if there is no HON-certified result - DO NOTHING
                    browser.get("https://google.com")  
         
######## ISU with first HON-certified result preferences and complex behavior 
        if user == 'complex_hon_click':
            # search for the first three URLs
            is_found = False
            for ind in range(3): 
                if search_results[ind][1]  =='HONcode certified':
                         
                    try:
                        click_url = search_results[ind][0]
                        click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                        click_element.click()
                        clicked_urls[ind] = 'clicked{0} @ {1}'.format(ind, datetime.now().strftime("%H:%M:%S"))
                        # scrolling the page
                        ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                        # after spending READ_TIME on website go to google.com
                        browser.get("https://google.com")        
                    except:
                        try:
                            click_url = search_results[ind][0]
                            click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                            browser.execute_script("arguments[0].click();", click_element)
                            clicked_urls[ind] = 'clicked{0} @ {1}'.format(ind, datetime.now().strftime("%H:%M:%S"))
                            # scrolling the page
                            ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                            # after spending READ_TIME on website go to google.com
                            browser.get("https://google.com")            
                        except:
                            pass  
                    is_found = True
                    break  
                    
                elif search_results[ind][1]  =='':
                    
                    try:
                        click_url = search_results[ind][0]
                        click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                        click_element.click()
                        clicked_urls[ind] = 'clicked{0}&returned @ {1}'.format(ind, datetime.now().strftime("%H:%M:%S"))
                        #do not scroll the page
                        time.sleep(WAIT_TIME) 
                        browser.back() 
                    except:
                        try:
                            click_url = search_results[ind][0]
                            click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                            browser.execute_script("arguments[0].click();", click_element)
                            clicked_urls[ind] = 'clicked{0}&returned @ {1}'.format(ind, datetime.now().strftime("%H:%M:%S"))
                            #do not scroll the page
                            time.sleep(WAIT_TIME) 
                            browser.back() 
                        except:
                            pass

                    
                time.sleep(WAIT_TIME / 2) 
                    
            if not is_found:

                try:
                    # get index and then url of wanted result
                    ind = np.where(np.array(search_results)=='HONcode certified')[0][0]
                    click_url = search_results[ind][0]
                    click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                    click_element.click()
                    clicked_urls[ind] = 'clicked{0} @ {1}'.format(ind+1, datetime.now().strftime("%H:%M:%S"))
                    # scrolling the page
                    ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                    # after spending READ_TIME on website go to google.com
                    browser.get("https://google.com") 
                except:
                    try:
                        # get index and then url of wanted result
                        ind = np.where(np.array(search_results)=='HONcode certified')[0][0]
                        click_url = search_results[ind][0]
                        click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                        browser.execute_script("arguments[0].click();", click_element)
                        clicked_urls[ind] = 'clicked{0} @ {1}'.format(ind+1, datetime.now().strftime("%H:%M:%S"))
                        # scrolling the page
                        ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                        # after spending READ_TIME on website go to google.com
                        browser.get("https://google.com") 
                    except:
                        # if there is no HON-certified result - DO NOTHING
                        browser.get("https://google.com")        

            
######## ISU with first NON-certified result preferences and complex behavior   
        if user == 'complex_non_click':
            # search for the first three URLs
            is_found = False
            for ind in range(3):
                if search_results[ind][1]  =='':
                    
                    try:
                        click_url = search_results[ind][0]
                        click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                        click_element.click()
                        clicked_urls[ind] = 'clicked{0} @ {1}'.format(ind, datetime.now().strftime("%H:%M:%S"))
                        # scrolling the page
                        ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                        # after spending READ_TIME on website go to google.com
                        browser.get("https://google.com")   
                    except:
                        try:
                            click_url = search_results[ind][0]
                            click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                            browser.execute_script("arguments[0].click();", click_element)
                            clicked_urls[ind] = 'clicked{0} @ {1}'.format(ind, datetime.now().strftime("%H:%M:%S"))
                            # scrolling the page
                            ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                            # after spending READ_TIME on website go to google.com
                            browser.get("https://google.com")   
                        except:
                            pass       
                    is_found = True
                    break  
                elif search_results[ind][1]  =='HONcode certified':
                    
                    try:
                        click_url = search_results[ind][0]
                        click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                        click_element.click()
                        clicked_urls[ind] = 'clicked{0}&returned @ {1}'.format(ind, datetime.now().strftime("%H:%M:%S"))
                        #do not scroll the page
                        time.sleep(WAIT_TIME) 
                        browser.back()  
                    except:
                        try:
                            click_url = search_results[ind][0]
                            click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                            browser.execute_script("arguments[0].click();", click_element)
                            clicked_urls[ind] = 'clicked{0}&returned @ {1}'.format(ind, datetime.now().strftime("%H:%M:%S"))
                            #do not scroll the page
                            time.sleep(WAIT_TIME) 
                            browser.back()  
                        except:
                            pass
                    
                time.sleep(WAIT_TIME / 2) 
                    
            if not is_found:
            
                try:     
                    # get index and then url of wanted result
                    ind = np.where(np.array(search_results)=='')[0][0]
                    click_url = search_results[ind][0]               
                    click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                    click_element.click()
                    clicked_urls[ind] = 'clicked{0} @ {1}'.format(ind+1, datetime.now().strftime("%H:%M:%S"))
                    # scrolling the page
                    ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                    # after spending READ_TIME on website go to google.com
                    browser.get("https://google.com") 
                except:
                    try:
                        # get index and then url of wanted result
                        ind = np.where(np.array(search_results)=='')[0][0]
                        click_url = search_results[ind][0]
                        click_element = browser.find_element_by_xpath('//a[@href="' + click_url + '"]')
                        browser.execute_script("arguments[0].click();", click_element)
                        clicked_urls[ind] = 'clicked{0} @ {1}'.format(ind+1, datetime.now().strftime("%H:%M:%S"))
                        # scrolling the page
                        ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                        # after spending READ_TIME on website go to google.com
                        browser.get("https://google.com") 
                    except:
                        # if no non-HON-certified result can be clicked, just return to google.com
                        browser.get("https://google.com")
  
                 
######## ISU with clicking based on the average statistic user behavior   
        if user == 'average_click':    
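            # URL_PROBAB (defined elsewhere in the script) is expected to be a probability
            # vector with one entry per search result, summing to 1; np.random.choice
            # draws the rank of the result to click according to these weights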
            
            try:
                # get index and then url of wanted result      
                ind = int(np.random.choice(len(search_results), 1, p=URL_PROBAB))
                average_url = search_results[ind][0]
                average_element = browser.find_element_by_xpath('//a[@href="' + average_url + '"]')
                # click on the URL drawn according to the given click-probability statistic
                average_element.click()
                clicked_urls[ind] = 'clicked @ {0}'.format(datetime.now().strftime("%H:%M:%S") )
                # then scrolling the page
                ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                # after spending READ_TIME on website go to google.com
                browser.get("https://google.com")
            except:
                try:
                    # get index and then url of wanted result      
                    ind = int(np.random.choice(len(search_results), 1, p=URL_PROBAB))
                    average_url = search_results[ind][0]
                    average_element = browser.find_element_by_xpath('//a[@href="' + average_url + '"]')
                    # click on the URL drawn according to the given click-probability statistic
                    browser.execute_script("arguments[0].click();", average_element)
                    clicked_urls[ind] = 'clicked @ {0}'.format(datetime.now().strftime("%H:%M:%S") )
                    # then scrolling the page
                    ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                    # after spending READ_TIME on website go to google.com
                    browser.get("https://google.com")
                except:
                    # if the click still fails, just return to google.com
                    browser.get("https://google.com")
 
        
######## ISU without clicking/browsing, only search history    
        if user == 'no_click':
            # just my favourite DO NOTHING 
            time.sleep(WAIT_TIME) 
            browser.get("https://google.com")
                       
            
######## ISU with clicking based on the most recent publication date
        if user == 'time_click':
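            # publication dates are assumed to be stored as 'YYYY-MM-DD' strings, which is
            # what the '%Y-%m-%d' format string below parses; max() then picks the latest date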
            
            try:
                # get index and then url of wanted result 
                best_time = max(map(lambda x: datetime.strptime(x[3], '%Y-%m-%d'), search_results))
                ind = np.where(np.array(search_results)==str(best_time.strftime('%Y-%m-%d')))[0][0]
                best_time_url = search_results[ind][0]
                best_time_element = browser.find_element_by_xpath('//a[@href="'+best_time_url+'"]')
                # click on the most recently published URL (max() picks the latest date)
                best_time_element.click()
                clicked_urls[ind] = 'clicked @ {0}'.format(datetime.now().strftime("%H:%M:%S") )
                # scrolling the page
                ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                # after spending READ_TIME on website go to google.com
                browser.get("https://google.com")          
            except:
                try:
                    # get index and then url of wanted result 
                    best_time = max(map(lambda x: datetime.strptime(x[3], '%Y-%m-%d'), search_results))
                    ind = np.where(np.array(search_results)==str(best_time.strftime('%Y-%m-%d')))[0][0]
                    best_time_url = search_results[ind][0]
                    best_time_element = browser.find_element_by_xpath('//a[@href="'+best_time_url+'"]')
                    # click on the most recently published URL (max() picks the latest date)
                    browser.execute_script("arguments[0].click();", best_time_element)
                    clicked_urls[ind] = 'clicked @ {0}'.format(datetime.now().strftime("%H:%M:%S") )
                    # scrolling the page
                    ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                    # after spending READ_TIME on website go to google.com
                    browser.get("https://google.com")         
                except:
                    # if the click still fails, just return to google.com
                    browser.get("https://google.com")

######## ISU with easy-to-read preferences     
        if user == 'readability_click':
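            # the SMOG grade is a readability score where a lower grade means an
            # easier-to-read page, so the result with the minimum grade is clicked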
            
            try:
                # get index and then url of wanted result
                best_smog_grade = min(map(lambda x: x[2], search_results))
                ind = np.where(np.array(search_results)==str(best_smog_grade))[0][0]
                best_smog_url = search_results[ind][0]
                best_smog_element = browser.find_element_by_xpath('//a[@href="'+best_smog_url+'"]')
                # click on the lowest SMOG-grade URL
                best_smog_element.click()
                clicked_urls[ind] = 'clicked @ {0}'.format(datetime.now().strftime("%H:%M:%S") )
                # then scrolling the page
                ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                # after spending READ_TIME on website go to google.com
                browser.get("https://google.com")          
            except:
                try:
                    # get index and then url of wanted result
                    best_smog_grade = min(map(lambda x: x[2], search_results))
                    ind = np.where(np.array(search_results)==str(best_smog_grade))[0][0]
                    best_smog_url = search_results[ind][0]
                    best_smog_element = browser.find_element_by_xpath('//a[@href="'+best_smog_url+'"]')
                    # click on the lowest SMOG-grade URL
                    browser.execute_script("arguments[0].click();", best_smog_element)
                    clicked_urls[ind] = 'clicked @ {0}'.format(datetime.now().strftime("%H:%M:%S") )
                    # scrolling the page
                    ScrollAndRead(browser, WINDOW_SIZE, READ_TIME)
                    # after spending READ_TIME on website go to google.com
                    browser.get("https://google.com")         
                except:
                    # if the click still fails, just return to google.com
                    browser.get("https://google.com")
            
########################################################################################################################## 
        # get time of the end query search
        end_query_time = datetime.now().strftime("%H:%M:%S") 
        # save results in a dict
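        # per processed query this stores: the query itself with its start/end time and
        # result count, then rank, URL, HON label, SMOG grade and publication date of the
        # top 8 results, plus which of them were clicked and when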
        dict_results["QUERY"].extend([query, 'start @ {}'.format(start_query_time), 'end @ {}'.format(end_query_time), 'results # {}'.format(num_res_int), '', '', '', '' ] ) 
        dict_results["RANK"].extend([1,2,3,4,5,6,7,8]) 
        dict_results["URL"].extend(np.array(search_results)[:,0]) 
        dict_results["HON"].extend(np.array(search_results)[:,1]) 
        dict_results["SMOG"].extend(np.array(search_results)[:,3]) 
        dict_results["PUBLICATION_DATE"].extend(np.array(search_results)[:,2])    
        dict_results["CLICKS&TIME"].extend(clicked_urls) 
        # top the session up to 11 minutes (CARRY_OVER_TIME) to avoid a carry-over effect between queries
        end_time = time.time()
        session_time = end_time - start_time 
        CARRY_OVER_TIME = 660 # CHANGE 100
        if session_time < CARRY_OVER_TIME:
            time.sleep(CARRY_OVER_TIME - session_time)        

########################################################################################################################## 
    
    # create pandas DataFrame with all results
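    # the lists are wrapped in pd.Series so that columns of unequal length are
    # aligned and padded with NaN instead of raising an error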
    df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in dict_results.items() ]))
    # create folder with the date
    day_today = datetime.today().strftime("%b-%d-%Y")
    Path('../../ResultsISU/{0}'.format(day_today)).mkdir(parents=True, exist_ok=True)
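    # one CSV file per simulated user profile is written into the folder for the current day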
    df.to_csv('../../ResultsISU/{0}/{1}.csv'.format(day_today,user), index=False) #CHANGE
    
    # incrementing global variable
    RANDOM_STATE += 1
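    # note: if this block runs inside a function, RANDOM_STATE must have been declared
    # with `global RANDOM_STATE` in that function for the increment to take effect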