def on_message(self, message):
    """Handle a websocket message: append trades to the per-symbol
    dataframe and flush it to the HDF5 store every 357 updates."""
    mess = ast.literal_eval(message)
    if mess["type"] == "trade":
        print(message)
        data = mess["data"][0]
        rel_sym = data["s"]   # symbol
        price = data["p"]     # last price
        time = data["t"]      # timestamp
        volume = data["v"]    # trade volume
        info_dict = {
            "time": float(time),
            "price": float(price),
            "volume": float(volume)
        }
        # DataFrame.append was removed in pandas 2.0; this code assumes pandas < 2.0.
        self.dataframe_dict[rel_sym] = self.dataframe_dict[rel_sym].append(
            info_dict, ignore_index=True)
        if len(self.dataframe_dict[rel_sym].index.values) % 357 == 0:
            print(self.dataframe_dict[rel_sym])
            # Time is correct but does not show:
            # print(f"first_time: {self.dataframe_dict[rel_sym]['time'].values[0]}, "
            #       f"last: {self.dataframe_dict[rel_sym]['time'].values[-1]}")
            file = files["streamed_data_finnhub"]
            update_hdf5(file, key=rel_sym,
                        dataframe=self.dataframe_dict[rel_sym])
            # Reset the in-memory dataframe for this symbol after flushing.
            self.dataframe_dict[rel_sym] = self.dataframe_dict[rel_sym].iloc[0:0]
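
# A minimal sketch of how a handler like on_message is typically driven by
# Finnhub's trade websocket, using the websocket-client package. The
# TradeCollector class, the symbol list and the FINNHUB_TOKEN placeholder are
# assumptions for illustration, not part of the original code.
import json

import pandas as pd
import websocket


class TradeCollector:
    def __init__(self, symbols):
        self.symbols = symbols
        self.dataframe_dict = {
            s: pd.DataFrame(columns=["time", "price", "volume"]) for s in symbols
        }

    def on_open(self, ws):
        # Subscribe to trade updates for every tracked symbol.
        for s in self.symbols:
            ws.send(json.dumps({"type": "subscribe", "symbol": s}))

    def on_message(self, message):
        ...  # the handler shown above


# Example wiring; the lambdas adapt websocket-client's (ws, message) callbacks.
collector = TradeCollector(["AAPL", "AMD"])
ws = websocket.WebSocketApp(
    "wss://ws.finnhub.io?token=FINNHUB_TOKEN",
    on_open=lambda ws: collector.on_open(ws),
    on_message=lambda ws, msg: collector.on_message(msg),
)
ws.run_forever()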
def make_all_timestamp():
    dold = read_hdf5(files["news_store"], "twitter_conversations").reset_index()
    # dold["created_date"] = dold["created_date"].apply(date_to_posix)
    dold = dold.dropna().sort_values(by="created_date")
    print(dold)
    print(dold.loc[dold.index.values[0]])
    update_hdf5(files["news_store"], "twitter_conversations",
                dataframe=dold.reset_index(), append=False)
def parse_article(self, url, save_to_self=True):
    """Download and parse a single article; persist it if its URL is new."""
    art = Article(url)
    art.download()
    art.parse()
    dic_temp = {
        "link": str(art.url),
        "text": str(art.text),
        "title": str(art.title),
        "created": str(art.publish_date),
        # Note: newspaper only fills art.keywords after art.nlp() has been run.
        "keywords": str(art.keywords),
        "author": str(art.authors)
    }
    df = self.df_art_base.append(dic_temp, ignore_index=True)
    if art.url not in self.urls["gotten"]:
        update_hdf5(files["news_store"], "news_articles", dataframe=df)
    if save_to_self:
        self.df_art = self.df_art.append(dic_temp, ignore_index=True)
    else:
        return dic_temp
def scrape_subreddit(self, added_subreddits=(), items=10,
                     return_items=False, only_added=False):
    """Scrape hot posts from the configured subreddits (plus any added ones)
    and persist the comment dict (pickle) and the submission dataframe (HDF5)."""
    if only_added:
        subreddits_ = added_subreddits
    else:
        subreddits_ = subreddits(added=added_subreddits)
    if items is None:
        items = 50
    subreddits_gotten = []
    for subreddit in subreddits_:
        hot_posts = self.red.subreddit(subreddit).hot(limit=items)
        subreddits_gotten.append((hot_posts, subreddit))
    comments = {}
    # Key by subreddit name so the dict stays picklable
    # (the listing generator itself is not a usable key).
    for hot_posts, subreddit in subreddits_gotten:
        comments[subreddit] = self.parse_submission_obj((hot_posts, subreddit))
    try:
        with open(files["reddit_comments"], "rb") as f:
            old = pickle.load(f)
    except (FileNotFoundError, EOFError) as e:
        old = {}
        error(e)
    with open(files["reddit_comments"], "wb") as f:
        # dict.update() returns None, so merge first and dump the dict itself.
        comments.update(old)
        pickle.dump(comments, f)
    print(self.df_subs)
    update_hdf5(files["news_store"], "reddit", dataframe=self.df_subs,
                mode="a", append=False)
    if return_items:
        return self.df_subs, comments
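
# For context, self.red above is presumably a praw Reddit client. A hedged
# sketch of constructing one (the credential strings are placeholders):
import praw

red = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="news-scraper/0.1",
)

# e.g. iterate the ten hottest posts of one subreddit, as scrape_subreddit does:
for submission in red.subreddit("wallstreetbets").hot(limit=10):
    print(submission.title, submission.score)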
def to_backup(file, key, append=True):
    backup_df = read_hdf5(file, key)
    print(backup_df)
    update_hdf5(files["backup"], key, append=append, dataframe=backup_df)
def dicts_to_df(dicts):
    # Build one row per interval dict, with a column for every known tag.
    columns_list = []
    for t in return_tags():
        columns_list.extend(t)
    df = pd.DataFrame(columns=columns_list)
    for dic in dicts:
        print(dic)
        df = df.append(dic, ignore_index=True).fillna(0)
    print(df)
    return df


if __name__ == "__main__":
    sleep(1000)
    get_news_tags()
    tag = read_hdf5(files["tags_df"], "twitter").AMD.values
    comb = read_hdf5(files["tags_df"], "twitter").tags_combined.values
    for t, c in zip(tag, comb):
        print(t / c)
    sleep(400)
    pprint(return_tags())
    dfs = split_hourly(read_hdf5(files["news_store"], "twitter_conversations"))
    tags_dicts = []
    for df in dfs:
        d = tags_from_df(df, title=False)
        tags_dicts.append(d)
    df_interval = dicts_to_df(tags_dicts)
    update_hdf5(files["tags_df"], "twitter", dataframe=df_interval, append=False)
def get_conversations(self, since_id=0, tags=(), items_batch=25):
    """Search recent tweets for the tracked tags and append new ones to the HDF5 store."""
    if not tags:
        tags = [t for t in twitter_tags() if len(t) > 1]
    log(f"Getting raw data for {tags}, since_id={since_id}")
    search_str = ""
    # Prefer the newest tweet id already in the store as since_id.
    try:
        data = read_hdf5(files["news_store"], key="twitter_conversations")
        ids = [int(i) for i in data.id.values]
        self.ids.extend(ids)
        since_id = max(ids)
    except Exception as e:
        since_id = 1252311068950638593
        error(e)
    if not since_id:
        try:
            ids = read_hdf5(self.store_dir, "twitter_conversations")["id"].values
            ids_ = True
        except (AttributeError, KeyError, TypeError) as e:
            error(str(e))
            ids_ = False
        if not ids_:
            since_id = 0
        else:
            since_id = max(ids)
    df = pd.DataFrame(columns=[
        "created", "id", "retweets", "text", "user_id", "favorits",
        "user_followers"
    ])
    # Build an OR query from the first 40 string tags and exclude retweets.
    tags = [tag for tag in tags if isinstance(tag, str)]
    for t in tags[:40]:
        search_str += f"{t} OR "
    if search_str.endswith("OR "):
        search_str = search_str[:-4]
    search_str += " -filter:retweets"
    print(search_str)
    for i in range(5):
        try:
            if self.ids:  # guard: max() on an empty list raises ValueError
                since_id = max(self.ids)
            if since_id:
                cursor = tweepy.Cursor(self.api.search,
                                       q=search_str,
                                       lang="en",
                                       full_text=True,
                                       since_id=since_id,
                                       wait_on_rate_limit=True,
                                       tweet_mode="extended",
                                       wait_on_rate_limit_notify=True,
                                       retry_delay=5).items(items_batch)
            else:
                cursor = tweepy.Cursor(self.api.search,
                                       q=search_str,
                                       lang="en",
                                       full_text=True,
                                       wait_on_rate_limit=True,
                                       tweet_mode="extended",
                                       wait_on_rate_limit_notify=True,
                                       retry_delay=5).items(items_batch)
            for tweet in cursor:
                j = tweet._json
                if j["id"] in self.ids:
                    print(str(j["id"]) + " already stored")
                else:
                    created = date_to_posix(str(j["created_at"]), list=False)
                    if created is not None:
                        data = {
                            "created": float(created),
                            "id": str(j["id"]),
                            "retweets": str(j["retweet_count"]),
                            "text": str(j["full_text"]),
                            "user_id": str(j["user"]["id"]),
                            "favorits": str(j["favorite_count"]),
                            "user_followers": str(j["user"]["followers_count"])
                        }
                        self.ids.append(int(j["id"]))
                        # Skip very long tweets and ignore very short ones.
                        if len(data["text"]) >= 333:
                            print(data["text"], "left out")
                        elif len(data["text"]) > 25:
                            df = df.append(data, ignore_index=True)
                        else:
                            print(j)
            df.set_index("created", inplace=True)
            print(df)
            self.ids.extend([int(v) for v in df.id.values])
            update_hdf5(files["news_store"], key="twitter_conversations",
                        dataframe=df, append=True)
            df = pd.DataFrame(columns=[
                "created", "id", "retweets", "text", "user_id", "favorits",
                "user_followers"
            ])
        except FileNotFoundError as e:
            error(str(e))
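
# Hedged sketch of the tweepy v3 client assumed to sit behind self.api; the
# credential strings are placeholders. In tweepy 3.x the rate-limit and retry
# options are constructor arguments of tweepy.API rather than search() kwargs.
import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth,
                 wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True,
                 retry_delay=5)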
def gather_different(self, extra_urls=None, only_extra=False,
                     ignore_gotten=False, save=True):
    """Build all configured news sites, parse every new article and store the results."""
    checklang = False
    if extra_urls:
        self.urls["extras"] = set(extra_urls)
        for url_ext in extra_urls:
            mine_article(url_ext)
    if not only_extra:
        print(self.newssites)
        if len(self.newssites) > 1 and isinstance(self.newssites, list):
            papers = [
                build(paper, config=self.config) for paper in self.newssites
            ]
        else:
            papers = build(self.newssites, config=self.config)
        log(f"Getting Data from {len(self.newssites)} newssites...")
        # Download all sources in parallel with newspaper's thread pool.
        news_pool.set(papers, threads_per_source=2)
        news_pool.join()
        for art_pool, url in zip(papers, self.newssites):
            print(f"Handling newssite "
                  f"{int(self.newssites.index(url)) + 1}/{len(self.newssites)}")
            for art in art_pool.articles:
                art.parse()
                if (str(art.url) not in self.urls["gotten"]) or ignore_gotten:
                    created = date_to_posix(dates=art.publish_date, list=False)
                    if created is not None and created != "None":
                        dic_temp = {
                            "link": str(art.url),
                            "text": str(art.text.replace(" ", "").replace("\n", "")),
                            "title": str(art.title),
                            "created": float(created),
                            "keywords": str(art.keywords),
                            "author": str(art.authors)
                        }
                        self.urls["gotten"] = np.append(self.urls["gotten"], art.url)
                        if checklang:
                            # Keep only articles detected as English; fall back to
                            # checking the title if the text cannot be checked.
                            try:
                                if check_lang_is_en(str(art.text)):
                                    self.df_art = self.df_art.append(
                                        dic_temp, ignore_index=True)
                                else:
                                    print(f"Blocked: {dic_temp['text']}")
                            except json.decoder.JSONDecodeError as e:
                                error(e)
                                if check_lang_is_en(str(art.title)):
                                    self.df_art = self.df_art.append(
                                        dic_temp, ignore_index=True)
                                else:
                                    print(f"Blocked: {dic_temp['text']}")
                                print("fixed?")
                        else:
                            self.df_art = self.df_art.append(
                                dic_temp, ignore_index=True)
    if save:
        print(self.df_art)
        # print(self.df_art.to_string())
        update_hdf5(files["news_store"], "news_articles",
                    dataframe=self.df_art, mode="a", append=False)
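
# Hedged sketch of the newspaper configuration assumed to sit behind
# self.config; the attribute values are placeholders.
from newspaper import Config

config = Config()
config.browser_user_agent = "Mozilla/5.0"  # placeholder user agent
config.request_timeout = 10                # seconds per request
config.memoize_articles = False            # re-scrape sites on every run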