Example #1
    def __init__(self):
        # Read the Twitter credentials from the auth YAML once instead of
        # re-parsing the file for every key.
        auth_config = read_yaml(files["auth"])

        auth = tweepy.OAuthHandler(auth_config["twitter_consumer_key"],
                                   auth_config["twitter_secret_consumer_key"])
        auth.set_access_token(auth_config["twitter_access_token_key"],
                              auth_config["twitter_secret_access_token_key"])
        self.ids = []
        self.store_dir = files["news_store"]

        self.api = tweepy.API(auth)

        try:
            self.api.verify_credentials()
            log("Authentication OK")

        except Exception as e:
            error(f"Error during authentication: {e}")
            log("Error during authentication")
        try:
            data = read_hdf5(files["news_store"], key="twitter_conversations")
            ids = [int(i) for i in data.id.values]
            since_id = max(ids)
        except Exception as e:
            since_id = 1252311068950638593
            error(e)
        print(since_id)
        self.get_conversations(since_id=since_id)
Example #2
def read_args(args):
    # Expect exactly two positional arguments: the template file and the
    # hyperparameter file.
    if len(args) > 2:
        print("Too many arguments given:", len(args))
        error()
    elif len(args) < 2:
        print("Missing file arguments")
        error()
    else:
        return args[0], args[1]
Example #3
    def __init__(self):
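        # newspaper Config: spoof a desktop browser user agent and let the
        # library cache (memoize) articles it has already seen.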
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " \
                     "(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36 OPR/66.0.3515.44"
        self.config = Config()
        self.config.browser_user_agent = user_agent
        self.config.memoize_articles = True
        self.config.verbose = True
        self.config.language = "en"

        ignore_already_gotten = True
        #nono_host_words = ["arabic", "espanol", "latino", "latina"]
        self.df_art_base = pd.DataFrame(
            columns=["link", "text", "title", "created", "keywords", "author"])
        self.newssites = [
            "https://www.cnbc.com", "http://cnn.com",
            "http://www.huffingtonpost.com", "http://www.nytimes.com",
            "http://news.bbc.co.uk/", "http://www.theguardian.com/"
        ]
        self.urls = {}
        try:
            if len(read_hdf5(files["news_store"], "news_articles")) <= 1:
                error("Too short new")
                del_hdf5_key(["news_store"], "news_articles")
                self.df_art = pd.DataFrame(columns=[
                    "link", "text", "title", "created", "keywords", "author"
                ])
            else:
                self.df_art = read_hdf5(files["news_store"], "news_articles")
        except KeyError as e:
            error(e)
            self.df_art = pd.DataFrame(columns=[
                "link", "text", "title", "created", "keywords", "author"
            ])
        except TypeError:
            error("Too short new")
            del_hdf5_key(files["news_store"], "news_articles")
            self.df_art = pd.DataFrame(columns=[
                "link", "text", "title", "created", "keywords", "author"
            ])
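        # Remember which article links are already stored so they are not
        # scraped twice.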
        try:
            self.urls["gotten"] = read_hdf5(files["news_store"],
                                            "news_articles").link.values
        except (KeyError, AttributeError) as e:
            error(e)
            self.urls["gotten"] = []
Example #4
    def __init__(self):
        #del_hdf5_key(files["news_store"], "reddit")

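        # Restore previously saved submissions from the HDF5 store, or start
        # with an empty dataframe if the key is missing or nearly empty.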
        try:
            if len(read_hdf5(files["news_store"], "reddit")) <= 1:
                error("Too short new")
                self.df_subs = pd.DataFrame(columns=[
                    "id", "text", "title", "created", "score", "comments",
                    "comment_num"
                ])
            else:
                self.old_save = True
                self.df_subs = read_hdf5(files["news_store"], "reddit")
        except KeyError as e:
            error(e)
            self.df_subs = pd.DataFrame(columns=[
                "id", "text", "title", "created", "score", "comments",
                "comment_num"
            ])
        try:
            self.ids = list(read_hdf5(files["news_store"], "reddit").id.values)
        except KeyError as e:
            error(e)
            self.ids = list()
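        # Read-only Reddit client; the praw credentials come from the auth YAML.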
        user_agent = "Windows: News Analyser :v1.0 (by /u/ludvig127)"
        self.red = praw.Reddit(
            user_agent=user_agent,
            client_id=read_yaml(files["auth"])["reddit_client_id"],
            client_secret=read_yaml(files["auth"])["reddit_client_secret"])
Example #5
    def scrape_subreddit(self,
                         added_subreddits=(),
                         items=10,
                         return_items=False,
                         only_added=False):
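        # Fetch hot posts from each selected subreddit, parse their comment
        # trees, merge them with previously pickled comments and persist the
        # submissions dataframe to the HDF5 store.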
        if only_added:
            subreddits_ = added_subreddits
        else:
            subreddits_ = subreddits(added=added_subreddits)
        if items is None:
            items = 50
        subreddits_gotten = []
        for subreddit in subreddits_:
            hot_posts = self.red.subreddit(subreddit).hot(limit=items)
            subreddits_gotten.append((hot_posts, subreddit))
        comments = {}
        for sub in subreddits_gotten:
            # key by subreddit name; praw listing objects do not pickle cleanly
            comments[sub[1]] = self.parse_submission_obj(sub)
        try:
            with open(files["reddit_comments"], "rb") as f:
                old = pickle.load(f)
        except (FileNotFoundError, EOFError) as e:
            old = {}
            error(e)
        with open(files["reddit_comments"], "wb") as f:
            if old is not None:
                pickle.dump(comments.update(old), f)
            else:
                pickle.dump(comments, f)
        print(self.df_subs)
        update_hdf5(files["news_store"],
                    "reddit",
                    dataframe=self.df_subs,
                    mode="a",
                    append=False)

        if return_items:
            return self.df_subs, comments
Example #6
def main(argv):
    save = ""
    force = False
    generate = "log-uniform"
    search_mode = "fix-grid-search"

    try:
        opts, args = getopt.getopt(argv,"vhfo:s:g:",
                        ["verbose","help","force","out=","search=","generate="])
    except getopt.GetoptError as getopt_error:
        print(getopt_error.msg, getopt_error.opt)
        error()
    else:
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                show_help()
                sys.exit()
            elif opt in ("-v","--verbose"):
                global _verbose
                _verbose = True
            elif opt in ("-f","--force"):
                force = True
            elif opt in ("-o","--out"):
                save = re.sub('.yaml$','',arg)
            elif opt in ("-g","--generate"):
                if arg not in generation_modes.keys():
                    print "generate MODE is invalid: " +arg
                    error()
                generate = arg
            elif opt in ("-s","--search"):
                if arg not in search_modes.keys():
                    print "search MODE is invalid: " +arg
                    error()
                search_mode = arg

    template, hparams = read_args(args)

    if not save:
        save = re.sub(r'\.yaml$', '', args[0])

    hpnames, hpvalues = generate_params(hparams,generate,search_mode)

    # fill the template with the generated hyperparameter values
    with open(template, 'r') as template_file:
        template_text = template_file.read()

    write_files(template_text, hpnames, hpvalues, save, force=force)

    if _verbose:
        print('\n'.join(files) + '\n')
Example #7
    def get_conversations(self, since_id=0, tags=(), items_batch=25):

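        # Build an OR query from the configured twitter tags and page through
        # the standard search API, skipping ids that are already stored.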
        if not tags:
            tags = [t for t in twitter_tags() if len(t) > 1]
        log(f"Getting raw data for {tags}, since_id={since_id}")
        search_str = ""
        try:
            data = read_hdf5(files["news_store"], key="twitter_conversations")
            ids = [int(i) for i in data.id.values]
            self.ids.extend(ids)
            since_id = max(ids)
        except Exception as e:
            since_id = 1252311068950638593
            error(e)
        if not since_id:
            try:
                ids = read_hdf5(self.store_dir,
                                "twitter_conversations")["id"].values
                ids_ = True
            except (AttributeError, KeyError, TypeError) as e:
                error(str(e))
                ids_ = False
            if not ids_:
                since_id = 0
            else:
                since_id = max(ids)
        df = pd.DataFrame(columns=[
            "created", "id", "retweets", "text", "user_id", "favorits",
            "user_followers"
        ])
        tags = [tag for tag in tags if isinstance(tag, str)]

        # OR the first 40 tags into one standard-search query, excluding retweets
        search_str = " OR ".join(tags[:40]) + " -filter:retweets"
        print(search_str)

        for _ in range(5):
            try:
                if self.ids:
                    since_id = max(self.ids)
                # Shared search options; since_id is only included when there
                # are already stored tweets to resume from.
                cursor_kwargs = dict(q=search_str,
                                     lang="en",
                                     full_text=True,
                                     wait_on_rate_limit=True,
                                     tweet_mode="extended",
                                     wait_on_rate_limit_notify=True,
                                     retry_delay=5)
                if since_id:
                    cursor_kwargs["since_id"] = since_id
                cursor = tweepy.Cursor(self.api.search,
                                       **cursor_kwargs).items(items_batch)
                for tweet in cursor:
                    j = tweet._json
                    if j["id"] in self.ids:
                        print(str(j["id"]) + " already in wtf")
                    else:
                        created = date_to_posix(str(j["created_at"]),
                                                list=False)

                        if created is not None:
                            #print(j["created_at"])
                            #date = datetime.datetime.strptime(j["created_at"], "%a %b %H %M %S %z %-y").replace(tzinfo=timezone.utc).timestamp()
                            #print(date)

                            data = {
                                "created": float(created),
                                "id": str(j["id"]),
                                "retweets": str(j["retweet_count"]),
                                "text": str(j["full_text"]),
                                "user_id": str(j["user"]["id"]),
                                "favorits": str(j["favorite_count"]),
                                "user_followers":
                                str(j["user"]["followers_count"])
                            }
                            #tag_str(j["full_text"], as_str=True)

                            self.ids.append(int(j["id"]))
                            if len(data["text"]) >= 333:
                                print(data["text"], "left out")
                            else:
                                if len(data["text"]) > 25:
                                    df = df.append(data, ignore_index=True)
                        else:
                            print(j)

                df.set_index("created", inplace=True)
                print(df)

                self.ids.extend([int(v) for v in df.id.values])
                update_hdf5(files["news_store"],
                            key="twitter_conversations",
                            dataframe=df,
                            append=True)
                df = pd.DataFrame(columns=[
                    "created", "id", "retweets", "text", "user_id", "favorits",
                    "user_followers"
                ])

            except FileNotFoundError as e:
                error(str(e))
Example #8
    def gather_different(self,
                         extra_urls=None,
                         only_extra=False,
                         ignore_gotten=False,
                         save=True):
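        # Build every configured news site with newspaper's thread pool, parse
        # the downloaded articles and append the ones not already stored.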
        checklang = False
        if extra_urls:
            self.urls["extras"] = set(extra_urls)
            for url_ext in extra_urls:
                mine_article(url_ext)
        if not only_extra:
            print(self.newssites)
            if len(self.newssites) > 1 and isinstance(self.newssites, list):
                papers = [
                    build(paper, config=self.config)
                    for paper in self.newssites
                ]
            else:
                papers = build(self.newssites, config=self.config)
            log(f"Getting Data from {len(self.newssites)} newssites...")
            news_pool.set(papers, threads_per_source=2)
            news_pool.join()
            for art_pool, url in zip(papers, self.newssites):
                print(
                    f"Handling newssite {int(self.newssites.index(url)) + 1}/{len(self.newssites)}"
                )
                for art in art_pool.articles:
                    art.parse()
                    if (str(art.url)
                            not in self.urls["gotten"]) or ignore_gotten:
                        created = date_to_posix(dates=art.publish_date,
                                                list=False)
                        if created is not None and created != "None":
                            dic_temp = {
                                "link": str(art.url),
                                "text": str(art.text.replace("  ", "").replace("\n", "")),
                                "title": str(art.title),
                                "created": float(created),
                                "keywords": str(art.keywords),
                                "author": str(art.authors)
                            }
                            self.urls["gotten"] = np.append(
                                self.urls["gotten"], art.url)
                            if checklang:
                                try:
                                    if check_lang_is_en(str(art.text)):
                                        self.df_art = self.df_art.append(
                                            dic_temp, ignore_index=True)
                                    else:
                                        print(f"Blocked: {dic_temp['text']}")
                                except json.decoder.JSONDecodeError as e:
                                    error(e)
                                    if check_lang_is_en(str(art.title)):
                                        self.df_art = self.df_art.append(
                                            dic_temp, ignore_index=True)
                                    else:
                                        print(f"Blocked: {dic_temp['text']}")
                                    print("fixed?")
                            else:
                                self.df_art = self.df_art.append(
                                    dic_temp, ignore_index=True)

        if save:
            print(self.df_art)
            # print(self.df_art.to_string())
            update_hdf5(files["news_store"],
                        "news_articles",
                        dataframe=self.df_art,
                        mode="a",
                        append=False)
Example #9
    def parse_submission_obj(self, obj):
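        # Flatten each English submission into a dataframe row and collect its
        # top-level comments and first-level replies.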
        c = 0
        comment_dict = {}
        for item in obj[0]:
            c += 1
            try:
                eng = check_lang_is_en(item.title)
            except ValueError as e:
                print(str(e))
                eng = check_lang_is_en(item.title, safe=True, lenght_req=False)

            if eng:
                title = item.title
                id = item.id
                print(title)
                total_comment_score = [0, 0]
                comment_list = []
                comment = []
                for com in item.comments:
                    try:
                        dict_top = {
                            "created": com.created_utc,
                            "score": com.score,
                            "body": com.body,
                            "id": com.id,
                            "parent_id": com.parent_id,
                            "top_comment": True
                        }
                    except AttributeError as e:
                        # praw MoreComments placeholders lack these attributes
                        error(e)
                        continue
                    total_comment_score[0] += abs(com.score)
                    total_comment_score[1] += com.score
                    comment.append(dict_top)

                    for repl in com.replies:
                        try:
                            dict_top = {
                                "created": repl.created_utc,
                                "score": repl.score,
                                "body": repl.body,
                                "id": repl.id,
                                "parent_id": repl.parent_id,
                                "top_comment": False
                            }
                        except AttributeError as e:
                            # "in e" raises on an exception object; match on
                            # str(e) and skip MoreComments placeholders entirely
                            if "MoreComments" not in str(e):
                                error(str(e))
                            continue
                        comment.append(dict_top)
                        total_comment_score[0] += abs(repl.score)
                        total_comment_score[1] += repl.score
                    comment_list.append(comment)
                data = {
                    "id": str(id),
                    "text": str(item.selftext.replace("\n", "")),
                    "title": str(title),
                    "created": float(item.created_utc),
                    "comment_num": str(item.num_comments),
                    "sub": str(item.subreddit.name),
                    "score": f"{str(item.score)}:{str(item.upvote_ratio)}"
                }
                if id in self.ids:
                    # drop the previously stored row before re-appending
                    self.df_subs = self.df_subs[self.df_subs.id != id]
                self.df_subs = self.df_subs.append(data, ignore_index=True)
                self.ids.append(id)
                comment_dict[item.id] = comment_list
        return comment_dict
Example #10
        #print(read_hdf5(files["news_store"], "twitter_conversations").columns)

        #update_hdf5(files["news_store"], "twitter_conversations", read_hdf5(files["news_store"], "twitter_conversations").drop("tag", inplace=False, axis=1), append=False)
        #update_hdf5(files["news_store"], "twitter_conversations", read_hdf5(files["news_store"], "twitter_conversations").set_index("created", inplace=False), append=False)

        print(read_hdf5(files["news_store"], "twitter_conversations"))
        runs += 1
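        # TwitterNews.__init__ authenticates and immediately pulls new conversations.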
        TwitterNews()
        #print(read_hdf5(files["news_store"], "twitter_conversations").tag)

        try:
            Reddit().scrape_subreddit()
        except Exception as e:
            error(e)
        #del_hdf5_key(files["news_store"], "news_articles")
        NewsArticles().gather_different()
        print(read_hdf5(files["news_store"], "twitter_conversations"))
        print(read_hdf5(files["news_store"], "news_articles"))
        print(read_hdf5(files["news_store"], "reddit"))
        if runs == 200:
            log("Doing backup")
            to_backup(files["news_store"],
                      "twitter_conversations",
                      append=False)
            to_backup(files["news_store"], "news_articles", append=False)
            to_backup(files["news_store"], "reddit", append=False)
            runs = 0
        sleep_log(30)