Example #1
def get_news_tags():
    text_data = []
    r = read_hdf5(files["news_store"], "reddit")
    t = read_hdf5(files["news_store"], "twitter_conversations")
    n = read_hdf5(files["news_store"], "news_articles")
    text_data.extend(r.text.values)
    text_data.extend(r.title.values)
    text_data.extend(t.text.values)
    text_data.extend(n.text.values)
    text_data.extend(n.title.values)

    st = time.process_time()
    all_ = {}
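    # Count how often each tag occurs across all of the collected texts.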
    for text in text_data:
        tags = tag_str(text)
        for tag in tags:
            all_[tag] = all_.get(tag, 0) + 1

    pprint(all_)
    print(f"Took {time.process_time() - st} sek to complete")
    df_tags = pd.DataFrame.from_dict(orient="index",
                                     columns=["mentions"],
                                     data=all_)
    df_tags.sort_values(by="mentions", ascending=False, inplace=True)
    print(df_tags)  # sort in place first; sort_values(..., inplace=True) returns None
    df_tags.to_csv("facts/tags_gotten2.csv")
Example #2
def check_words():

    #twitter = read_hdf5(files["news_store"], "twitter_conversations")
    reddit = read_hdf5(files["news_store"], "reddit")
    words = []
    for index, row in reddit.iterrows():
        score = row["score"]
        title = row["title"]
        text = row["text"]
        all_text = title + " " + text
        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', all_text)

        for char in special_chars_str:
            all_text = all_text.replace(char, "")
        all_text_tokenized = tokenize.word_tokenize(all_text)
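        # NOTE: the English vocabulary set below is rebuilt on every row;
        # building it once before the loop would be considerably faster.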
        corpus.wordnet.words()
        corp = set(corpus.words.words() + corpus.brown.words())
        english_vocab = set(w.lower() for w in corp)
        sample_lower = set(w.lower() for w in all_text_tokenized
                           if w.lower().isalpha())
        unusual = sample_lower.difference(english_vocab)
        words += unusual
        votes, percent = score.split(":")
        print(votes, all_text, "______", urls)
        print("______________________________")
    print(words)
    counted = collections.Counter(words)
    print(counted)

    with open(files["reddit_comments"], "br") as f:
        d = pkl.load(f)
    pprint(d)
    articles = read_hdf5(files["news_store"], "news_articles")
    print(reddit, articles)
Example #3
    def __init__(self):
        #del_hdf5_key(files["news_store"], "reddit")

        try:
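            # Load previously stored reddit submissions; fall back to an empty
            # frame if the table is missing or too short.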
            if len(read_hdf5(files["news_store"], "reddit")) <= 1:
                error("Too short new")
                self.df_subs = pd.DataFrame(columns=[
                    "id", "text", "title", "created", "score", "comments",
                    "comment_num"
                ])
            else:
                self.old_save = True
                self.df_subs = read_hdf5(files["news_store"], "reddit")
        except KeyError as e:
            error(e)
            self.df_subs = pd.DataFrame(columns=[
                "id", "text", "title", "created", "score", "comments",
                "comment_num"
            ])
        try:
            self.ids = list(read_hdf5(files["news_store"], "reddit").id.values)
        except KeyError as e:
            error(e)
            self.ids = list()
        user_agent = "Windows: News Analyser :v1.0 (by /u/ludvig127)"
        self.red = praw.Reddit(
            user_agent=user_agent,
            client_id=read_yaml(files["auth"])["reddit_client_id"],
            client_secret=read_yaml(files["auth"])["reddit_client_secret"])
Example #4
    def __init__(self):
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " \
                     "(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36 OPR/66.0.3515.44"
        self.config = Config()
        self.config.browser_user_agent = user_agent
        self.config.memoize_articles = True
        self.config.verbose = True
        self.config.language = "en"

        ignore_already_gotten = True
        #nono_host_words = ["arabic", "espanol", "latino", "latina"]
        self.df_art_base = pd.DataFrame(
            columns=["link", "text", "title", "created", "keywords", "author"])
        try:
            self.newssites = ([
                "https://www.cnbc.com", 'http://cnn.com',
                "http://www.huffingtonpost.com", "http://www.nytimes.com",
                "http://news.bbc.co.uk/", "http://www.theguardian.com/"
            ])

        except Exception as e:
            error(str(e))
        self.urls = {}
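        # Load previously scraped articles from the store; start with an empty
        # frame if the key is missing or the stored table is too short.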
        try:
            if len(read_hdf5(files["news_store"], "news_articles")) <= 1:
                error("Too short new")
                del_hdf5_key(["news_store"], "news_articles")
                self.df_art = pd.DataFrame(columns=[
                    "link", "text", "title", "created", "keywords", "author"
                ])
            else:
                self.df_art = read_hdf5(files["news_store"], "news_articles")
        except KeyError as e:
            error(e)
            self.df_art = pd.DataFrame(columns=[
                "link", "text", "title", "created", "keywords", "author"
            ])
        except TypeError:
            error("Too short new")
            del_hdf5_key(files["news_store"], "news_articles")
            self.df_art = pd.DataFrame(columns=[
                "link", "text", "title", "created", "keywords", "author"
            ])
        try:
            self.urls["gotten"] = read_hdf5(files["news_store"],
                                            "news_articles").link.values
        except (KeyError, AttributeError) as e:
            error(e)
            self.urls["gotten"] = []
Example #5
    def __init__(self):
        auth_conf = read_yaml(files["auth"])  # read the credentials file once
        twitter_consumer_key = auth_conf["twitter_consumer_key"]
        twitter_secret_consumer_key = auth_conf["twitter_secret_consumer_key"]
        twitter_access_token_key = auth_conf["twitter_access_token_key"]
        twitter_secret_access_token_key = auth_conf["twitter_secret_access_token_key"]

        auth = tweepy.OAuthHandler(twitter_consumer_key,
                                   twitter_secret_consumer_key)
        auth.set_access_token(twitter_access_token_key,
                              twitter_secret_access_token_key)
        self.ids = []
        self.store_dir = files["news_store"]

        self.api = tweepy.API(auth)

        try:
            self.api.verify_credentials()
            log("Authentication OK")

        except Exception:
            error("Error during authentication")
            log("Error during authentication")
        try:
            data = read_hdf5(files["news_store"], key="twitter_conversations")
            ids = [int(i) for i in data.id.values]
            since_id = max(ids)
        except Exception as e:
            since_id = 1252311068950638593
            error(e)
        print(since_id)
        self.get_conversations(since_id=since_id)
Example #6
def tags_history():
    """Create dataframe of tags with their dates formated with the tag as index and a dict with date(hour) and number"""
    dold = read_hdf5(files["news_store"], "news_articles")
    d = read_hdf5(files["news_store"],
                  "news_articles").set_index(keys="created")
    news_created = (d.drop("None").dropna().sort_values(by="created"))
    dold["created"] = dold["created"].apply(date_to_posix)
    print(dold.dropna().set_index(keys="created").sort_values(by="created"))
    sleep(10000)
    dates = news_created.index.values
    print(dates, type(dates))
    stamps = [
        time.mktime(ciso8601.parse_datetime(t).timetuple()) for t in dates
        if isinstance(t, str)
    ]
    print(stamps)
    for date, stamp in zip(dates, stamps):
        print(f"{date} ; {stamp}")
    sleep(10000)
    twe = read_hdf5(files["news_store"], "twitter_conversations")
    red = read_hdf5(files["news_store"], "reddit").set_index(keys="created")
    print("articles", d)
    print("twitter", twe)
    print("reddit", red)
    texts = 0
    all_ = {}  # renamed to avoid shadowing the built-in all()
    text_list = []

    text_list.extend(d.text.values)
    text_list.extend(twe.text.values)
    text_list.extend(red.selftext.values)

    for p in text_list:
        texts += 1
        print(p)
        tags = find_tags_from_str(p, as_str=False)
        print("TAGS: ", tags)
        for tag in tags:
            all_[tag] = all_.get(tag, 0) + 1
        # from finnhub import Stream
        # Stream()
    pprint(all_)
Example #7
def make_all_timestamp():
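    # Re-sort the stored twitter conversations by created_date and rewrite the
    # table (append=False replaces the existing key).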
    dold = read_hdf5(files["news_store"],
                     "twitter_conversations").reset_index()
    #dold["created_date"] = dold["created_date"].apply(date_to_posix)
    dold = dold.dropna().sort_values(by="created_date")
    print(dold)
    print(dold.loc[dold.index.values[0]])
    update_hdf5(files["news_store"],
                "twitter_conversations",
                dataframe=dold.reset_index(),
                append=False)
Example #8
def to_backup(file, key, append=True):
    backup_df = read_hdf5(file, key)
    print(backup_df)
    update_hdf5(files["backup"], key, append=append, dataframe=backup_df)
Example #9
def dicts_to_df(dicts):
    columns_list = ["tags_combined", "time_start", "time_stop"]
    for t in return_tags():
        columns_list.extend(t)
    df = pd.DataFrame(columns=columns_list)
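    # Note: DataFrame.append is deprecated and was removed in pandas 2.0;
    # pd.concat is the replacement on newer versions.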
    for d in dicts:  # renamed from "dict" to avoid shadowing the built-in
        print(d)
        df = df.append(d, ignore_index=True).fillna(0)
    print(df)
    return df


if __name__ == "__main__":
    sleep(1000)
    get_news_tags()
    tag = read_hdf5(files["tags_df"], "twitter").AMD.values
    comb = read_hdf5(files["tags_df"], "twitter").tags_combined.values

    for t, c in zip(tag, comb):
        print(t / c)

    sleep(400)
    pprint(return_tags())
    dfs = split_hourly(read_hdf5(files["news_store"], "twitter_conversations"))
    tags_dicts = []
    for df in dfs:
        d = tags_from_df(df, title=False)
        tags_dicts.append(d)
    df_interval = dicts_to_df(tags_dicts)
    update_hdf5(files["tags_df"],
                "twitter",
Example #10
    def get_conversations(self, since_id=0, tags=(), items_batch=25):

        if not tags:
            tags = [t for t in twitter_tags() if len(t) > 1]
        log(f"Getting raw data for {tags}, since_id={since_id}")
        search_str = ""
        try:
            data = read_hdf5(files["news_store"], key="twitter_conversations")
            ids = [int(i) for i in data.id.values]
            self.ids.extend(ids)
            since_id = max(ids)
        except Exception as e:
            since_id = 1252311068950638593
            error(e)
        if not since_id:
            try:
                ids = read_hdf5(self.store_dir,
                                "twitter_conversations")["id"].values
                ids_ = True
            except (AttributeError, KeyError, TypeError) as e:
                error(str(e))
                ids_ = False
            if not ids_:
                since_id = 0
            else:
                since_id = max(ids)
        df = pd.DataFrame(columns=[
            "created", "id", "retweets", "text", "user_id", "favorits",
            "user_followers"
        ])
        tags = [tag for tag in tags if isinstance(tag, str)]

        search_str = " OR ".join(tags[:40])
        search_str += " -filter:retweets"
        print(search_str)

        for i in range(5):
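            # Each batch is fetched, filtered, and written to the HDF5 store
            # before the next request.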
            try:
                since_id = max(self.ids)
                if since_id:
                    cursor = tweepy.Cursor(self.api.search,
                                           q=search_str,
                                           lang="en",
                                           full_text=True,
                                           since_id=since_id,
                                           wait_on_rate_limit=True,
                                           tweet_mode="extended",
                                           wait_on_rate_limit_notify=True,
                                           retry_delay=5).items(items_batch)
                else:
                    cursor = tweepy.Cursor(self.api.search,
                                           q=search_str,
                                           lang="en",
                                           full_text=True,
                                           wait_on_rate_limit=True,
                                           tweet_mode="extended",
                                           wait_on_rate_limit_notify=True,
                                           retry_delay=5).items(items_batch)
                for tweet in cursor:
                    j = tweet._json
                    if j["id"] in self.ids:
                        print(str(j["id"]) + " already in wtf")
                    else:
                        created = date_to_posix(str(j["created_at"]),
                                                list=False)

                        if created is not None:
                            #print(j["created_at"])
                            #date = datetime.datetime.strptime(j["created_at"], "%a %b %H %M %S %z %-y").replace(tzinfo=timezone.utc).timestamp()
                            #print(date)

                            data = {
                                "created": float(created),
                                "id": str(j["id"]),
                                "retweets": str(j["retweet_count"]),
                                "text": str(j["full_text"]),
                                "user_id": str(j["user"]["id"]),
                                "favorits": str(j["favorite_count"]),
                                "user_followers":
                                str(j["user"]["followers_count"])
                            }
                            #tag_str(j["full_text"], as_str=True)

                            self.ids.append(int(j["id"]))
                            if len(data["text"]) >= 333:
                                print(data["text"], "left out")
                            else:
                                if len(data["text"]) > 25:
                                    df = df.append(data, ignore_index=True)
                        else:
                            print(j)

                df.set_index("created", inplace=True)
                print(df)

                self.ids.extend([int(v) for v in df.id.values])
                update_hdf5(files["news_store"],
                            key="twitter_conversations",
                            dataframe=df,
                            append=True)
                df = pd.DataFrame(columns=[
                    "created", "id", "retweets", "text", "user_id", "favorits",
                    "user_followers"
                ])

            except FileNotFoundError as e:
                error(str(e))
Example #11
#print(d)
#update_hdf5("news_data_new.h5", "news_articles", dataframe=d, append=False)

#print(read_hdf5(["news_store"], "twitter_conversations"))

if __name__ == "__main__":
    loop = True
    runs = 0
    while loop:

        #print(read_hdf5(files["news_store"], "twitter_conversations").columns)

        #update_hdf5(files["news_store"], "twitter_conversations", read_hdf5(files["news_store"], "twitter_conversations").drop("tag", inplace=False, axis=1), append=False)
        #update_hdf5(files["news_store"], "twitter_conversations", read_hdf5(files["news_store"], "twitter_conversations").set_index("created", inplace=False), append=False)

        print(read_hdf5(files["news_store"], "twitter_conversations"))
        runs += 1
        TwitterNews()
        #print(read_hdf5(files["news_store"], "twitter_conversations").tag)

        try:
            Reddit().scrape_subreddit()
        except Exception as e:
            error(e)
        #del_hdf5_key(files["news_store"], "news_articles")
        NewsArticles().gather_different()
        print(read_hdf5(files["news_store"], "twitter_conversations"))
        print(read_hdf5(files["news_store"], "news_articles"))
        print(read_hdf5(files["news_store"], "reddit"))
Example #12
from __init__ import read_hdf5, files, find_tags_from_str
from pprint import pprint
import pandas as pd
from time import sleep

reddit_df = read_hdf5(files["news_store"], "reddit")
print(reddit_df)
print(reddit_df.columns)

print(reddit_df.selftext.values)
for t in reddit_df.title.values:
    if len(t) > 15:
        print(t)
        print("tags; ", find_tags_from_str(t, as_str=True))