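# The script fragment below calls a module-level utc_to_pacific(); a minimal
# sketch of that helper, assuming the same pytz conversion used by
# Connect.utc_to_pacific:
def utc_to_pacific(utc_dt):
    local_tz = pytz.timezone('America/Los_Angeles')
    local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
    return local_tz.normalize(local_dt)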
list_of_terms.append(term_dict)

mongo = Connect()
client = mongo.client
collection = client.Politiprocess.terms
start_count = collection.count()
for entry in list_of_terms:
    client.Politiprocess.terms.update_one(
        {'timestamp': entry['timestamp']},
        {'$set': entry},
        upsert=True,
        bypass_document_validation=True)
end_count = collection.count()
added_count = end_count - start_count

if set_local:
    time_now = utc_to_pacific(datetime.now()).ctime()
else:
    time_now = datetime.now().ctime()

log = Parameters()
log.loader('log/JSON.log', default=True)
log.loaded.JSONLOG.Date = time_now
log.loaded.JSONLOG.Added = added_count
log.loaded.JSONLOG.Total = end_count
log.writer('log/scraper.log', log.loaded, append=True)
class Connect:
    def __init__(self, settings=None, mongo_cfg=None):
        self.settings = settings
        self.mongo_cfg = mongo_cfg
        self.mongo = Parameters()
        self.mongo.loader('dat/mongo.secret', 'server')
        self._mongo = self.mongo.server.Mongo_DB_Server_Params
        self.client = MongoClient(host=self._mongo.host,
                                  port=self._mongo.port,
                                  username=self._mongo.user,
                                  password=self._mongo.password)
        self.db = getattr(self.client, self._mongo.db)
        self.collection = getattr(self.db, self._mongo.collection)
        self.query_df = None
        self.query_dict = {}
        self.added_count = None

    def utc_to_pacific(self, utc_dt):
        """Convert a naive UTC datetime to US Pacific local time."""
        local_tz = pytz.timezone('America/Los_Angeles')
        os.environ['TZ'] = 'America/Los_Angeles'
        local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
        return local_tz.normalize(local_dt)

    def load_all(self):
        """Pull the entire collection into query_df."""
        self.query_df = pd.DataFrame(list(self.collection.find()))

    def query(self,
              red_or_blue=None,
              articles=None,
              n_hours=None,
              count=None,
              custom_query=None,
              append_dfs=False,
              verbose=False):
        """Query the collection into red_df/blue_df (and query_df if appended)."""
        self.query_dict = {}
        # Any argument not passed explicitly falls back to the settings file.
        if not red_or_blue:
            red_or_blue = self.settings.Query.Red_Blue_or_All
        if not articles:
            articles = self.settings.Query.Articles_Only
        if not n_hours:
            n_hours = self.settings.Query.Time_Frame_in_Hours
        if not append_dfs:
            append_dfs = self.settings.Query.Append_DFs
        if not count:
            count = self.settings.Query.Count
        else:
            n_hours = 0  # an explicit count overrides the time window
        if articles:
            self.query_dict['is article'] = articles
        post = 'articles' if articles else 'documents'
        if not n_hours:
            if verbose:
                print(f"Pulling {count} {post} from {red_or_blue} targets.")
        else:
            if verbose:
                print(f"Pulling {red_or_blue} articles from last {n_hours} hours.")
            dt = datetime.utcnow() - timedelta(hours=n_hours)
            self.query_dict['date'] = {'$gt': dt}
        if red_or_blue == 'Red':
            self.query_dict['target'] = True
        elif red_or_blue == 'Blue':
            self.query_dict['target'] = False
        elif red_or_blue == 'All':
            self.query_dict['target'] = [True, False]
        if custom_query:
            self.query_dict = {**self.query_dict, **custom_query}
        # Pull the red set first, then flip the target to pull the blue set.
        if self.query_dict.get('target') == [True, False]:
            self.query_dict['target'] = True
        self.red_df = pd.DataFrame(
            list(
                self.collection.find(self.query_dict,
                                     sort=[('_id', -1)],
                                     limit=count)))
        self.red_df.name = 'Red'
        self.query_dict['target'] = False
        self.blue_df = pd.DataFrame(
            list(
                self.collection.find(self.query_dict,
                                     sort=[('_id', -1)],
                                     limit=count)))
        self.blue_df.name = 'Blue'
        if append_dfs:
            self.query_df = self.red_df.append(self.blue_df)
            self.query_df.name = 'All'
        if verbose:
            print(f'''Completed pulling {len(self.query_df)} {post}.
Latest article is from {self.collection.find_one(sort=[('date', -1)])['date']} UTC''')

    def update_from_df(self,
                       df=None,
                       drop_id=None,
                       upsert=None,
                       set_local=None,
                       verbose=False):
        """Upsert a DataFrame into the collection, keyed on link + subreddit."""
        if df is None:  # truth-testing a DataFrame raises, so compare to None
            df = self.query_df
        if drop_id is None:
            # FIXME: falls back to Options.Set_Local, which looks like a
            # copy-paste slip; no Drop_ID option appears elsewhere in this code.
            drop_id = self.mongo_cfg.Options.Set_Local
        if upsert is None:
            upsert = self.mongo_cfg.Options.Upsert
        if set_local is None:
            set_local = self.mongo_cfg.Options.Set_Local
        if drop_id:
            df = df.drop(['_id'], axis=1)
        data = df.to_dict(orient='records')
        old_count = self.collection.count()
        for post in data:
            self.collection.update_one(
                {
                    'link': post['link'],
                    'subreddit': post['subreddit']
                }, {'$set': post},
                upsert=upsert)
        new_count = self.collection.count()
        self.added_count = new_count - old_count
        if verbose:
            print(f'Added {self.added_count} Entries to Database')
        if set_local:
            time_stamp = self.utc_to_pacific(datetime.now()).ctime()
        else:
            time_stamp = datetime.now().ctime()
        log = Parameters()
        log.loader('log/mongo.log', default=True)
        log.loaded.MONGOLOG.Date = time_stamp
        log.loaded.MONGOLOG.Added = self.added_count
        log.loaded.MONGOLOG.Total = new_count
        log.writer('log/scraper.log', log.loaded, append=True)

    def count(self, query=None, red_or_blue=None):
        """Count documents, optionally filtered by side or a custom query."""
        count_query = dict(query) if query else {}  # original ignored `query`
        if red_or_blue == 'Red':
            count_query['target'] = True  # targets are stored as booleans, not 1/0
        elif red_or_blue == 'Blue':
            count_query['target'] = False
        return self.collection.count(count_query)
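# A minimal usage sketch for Connect, assuming the same Parameters files the
# scraper script below loads ('save/params/default.params', 'dat/scraper.cfg'):
if __name__ == '__main__':
    p = Parameters()
    p.loader('save/params/default.params', 'params')
    p.loader('dat/scraper.cfg', 'scraper')
    connection = Connect(settings=p.params, mongo_cfg=p.scraper)
    connection.query(append_dfs=True, verbose=True)  # fills red_df/blue_df/query_df
    print(connection.count(red_or_blue='Red'), 'red documents in collection')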
def run(self, set_local=None, pickle=False, verbose=False):
    if set_local is None:
        set_local = self.settings.Options.Set_Local
    if verbose:
        print('Starting Scraper')
    start_time = datetime.now()
    reddit = self.settings.Reddit_Params
    art_ignore = self.settings.Article.None_Article_Links
    API = Parameters()
    API.loader('dat/praw.secret')
    API = API.loaded.API_Script_Keys
    api = praw.Reddit(client_id=API.client_id,
                      client_secret=API.client_secret,
                      password=API.password,
                      user_agent=API.user_agent,
                      username=API.username)
    posts_dict = {
        "post title": [],
        "subreddit": [],
        "score": [],
        "is article": [],
        "article title": [],
        "title polarity": [],
        "title objectivity": [],
        "keywords": [],
        "domain": [],
        "link": [],
        "author": [],
        "text": [],
        "comments": [],
        "date": [],
        "target": [],
    }
    article_count = 0
    invalid_links = 0
    failed_links_c = 0
    failed_links = []
    red_sub = 0
    blue_sub = 0
    if verbose:
        print("Pulling Articles")
    for sub in reddit.Red_List + reddit.Blue_List:
        submissions = (x for x in api.subreddit(sub).hot(
            limit=reddit.Scraper_Depth_Limit) if not x.stickied)
        for post in submissions:
            if sub in reddit.Red_List:
                posts_dict["target"].append(True)
                red_sub += 1
            if sub in reddit.Blue_List:
                blue_sub += 1
                posts_dict["target"].append(False)
            posts_dict["post title"].append(post.title)  # praw reddit scraping to dict
            posts_dict["link"].append(post.url)
            posts_dict["score"].append(int(post.score))
            posts_dict["subreddit"].append(sub)
            posts_dict["date"].append(
                datetime.fromtimestamp(post.created_utc))
            comments = []  # Comments parsing and scoring
            for comment in post.comments:
                try:
                    if comment.author != 'AutoModerator':
                        comments.append(
                            (round(comment.score / post.num_comments, 2),
                             comment.body))
                except Exception:
                    pass  # skip MoreComments stubs and deleted comments
            posts_dict["comments"].append(comments)
            parsed_url = urlparse(post.url)  # Parse URL for domain
            posts_dict['domain'].append(parsed_url.netloc)
            post_blob = TextBlob(post.title)  # TextBlob NLP - VERY SIMPLE
            posts_dict["title polarity"].append(post_blob.sentiment[0])
            posts_dict["title objectivity"].append(post_blob.sentiment[1])
            posts_dict["keywords"].append(post_blob.noun_phrases)
            article = Article(post.url)  # Instantiate newspaper3k library
            if article.is_valid_url() and parsed_url.netloc not in art_ignore:
                try:  # Try to download and parse article
                    article.download()
                    article.parse()
                    article_count += 1
                    posts_dict["is article"].append(True)
                    # newspaper returns strings, so test truthiness rather
                    # than comparing against [] as the original did
                    if article.title:  # Title parsed?
                        posts_dict["article title"].append(article.title)
                    else:
                        posts_dict["article title"].append(np.nan)
                    if article.authors:  # Author parsed?
                        posts_dict["author"].append(article.authors)
                    else:
                        posts_dict["author"].append(np.nan)
                    if article.text:  # Text parsed?
                        posts_dict['text'].append(article.text)
                    else:
                        posts_dict["text"].append(np.nan)
                except Exception:
                    posts_dict["is article"].append(False)
                    posts_dict["article title"].append(np.nan)
                    posts_dict["author"].append(np.nan)
                    posts_dict["text"].append(np.nan)
                    failed_links_c += 1
                    failed_links.append(post.url)
            else:
                invalid_links += 1
                posts_dict["is article"].append(False)
                posts_dict["article title"].append(np.nan)
                posts_dict["author"].append(np.nan)
                posts_dict["text"].append(np.nan)
    if set_local:
        time_now = self.utc_to_pacific(datetime.now())
    else:
        time_now = datetime.now()  # Set local Time
    log_date = time_now.strftime('%m%d%y_%H%M')
    if verbose:
        print("Generating DataFrame")
    posts_df = pd.DataFrame(posts_dict)  # Make it a dataframe
    posts_df = posts_df[[
        "subreddit", "post title", "title polarity", "title objectivity",
        "score", "keywords", "comments", "domain", "link", "is article",
        "article title", "author", "text", "date", "target"
    ]]
    if pickle:
        posts_df.to_pickle(f'log/{log_date}.pickle')
    z = datetime.now() - start_time
    self.scrape_time = f"{(z.seconds//60)%60}min, {z.seconds%60}sec"
    log = Parameters()
    log.loader('log/scraper.log', 'loaded', default=True)
    log.loaded.SCRAPERLOG.Date = time_now.ctime()
    log.loaded.SCRAPERLOG.Scraper_Timer = self.scrape_time
    log.loaded.SCRAPERLOG.Article_Count = article_count
    log.loaded.SCRAPERLOG.Invalid_Links = invalid_links
    log.loaded.SCRAPERLOG.Failed_Links = failed_links
    log.loaded.SCRAPERLOG.Failed_Links_Count = failed_links_c
    log.loaded.SCRAPERLOG.Red_Sub_Count = red_sub
    log.loaded.SCRAPERLOG.Blue_Sub_Count = blue_sub
    log.writer('log/scraper.log', log.loaded, append=True)
    log.writer('log/scraper.log', self.settings, append=True)
    self.scraper_df = posts_df
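# A minimal sketch for reloading a pickled scrape written by run(pickle=True);
# the timestamped filename is hypothetical:
if __name__ == '__main__':
    df = pd.read_pickle('log/051518_1200.pickle')
    print(df[['subreddit', 'post title', 'is article', 'target']].head())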
parser = argparse.ArgumentParser(description='Settings for scraper_script')
parser.add_argument('-v',
                    '--verbose',
                    help='Use for verbose output to console.',
                    action='store_true')
parser.add_argument('-sl',
                    '--set_local',
                    help='Use for setting local time.',
                    action='store_true')
args = parser.parse_args()
verbose = args.verbose
set_local = args.set_local

p = Parameters()
p.loader('save/params/default.params', 'params')
p.loader('dat/scraper.cfg', 'scraper')

scraper = Scraper(p.scraper)
scraper.run(verbose=verbose, set_local=set_local)

processor = Processing(p.scraper)
processor.pre_processor(scraper.scraper_df)
processor.spacy_processor(scraper.scraper_df, verbose=verbose)

connection = Connect(settings=p.params, mongo_cfg=p.scraper)
class Control_Panel(base, ui):
    def __init__(self):
        super(Control_Panel, self).__init__()
        self.p = Parameters()
        self.connection = None
        self.setupUi(self)
        self.ParamFileModel = QtWidgets.QFileSystemModel(self.ParamList)
        self.ParamFileModel.setReadOnly(True)
        self.ParamFileModel.removeColumns(1, 2)
        root = self.ParamFileModel.setRootPath('save/params')
        self.ParamList.setModel(self.ParamFileModel)
        self.ParamList.setRootIndex(root)
        self.ParamList.clicked.connect(
            lambda: self.ParamLoadButton.setEnabled(True))

        # Connections
        self.ParamLoadButton.clicked.connect(lambda: self.load_params())
        self.ParamTree.itemChanged.connect(lambda: self.update_value())
        self.ConnectButton.clicked.connect(lambda: self.mongo_update())
        self.ScraperLoad.clicked.connect(lambda: self.scraper_config_load())
        self.ScraperDepthSlider.sliderReleased.connect(
            lambda: self.depth_slider())
        self.UseLocalCheckBox.toggled.connect(
            lambda: self.use_local_check_box())
        self.UpsertCheckBox.toggled.connect(lambda: self.upsert_check_box())
        self.ScraperButton.clicked.connect(lambda: self.run_scraper())
        self.NewlineCheckBox.toggled.connect(lambda: self.newline_check_box())
        self.PunctuationCheckBox.toggled.connect(
            lambda: self.punctuation_check_box())
        self.EmailsCheckBox.toggled.connect(lambda: self.emails_check_box())
        self.ContradictionsCheckBox.toggled.connect(
            lambda: self.contradictions_check_box())
        self.AccentsCheckBox.toggled.connect(lambda: self.accents_check_box())
        self.CurrencyCheckBox.toggled.connect(
            lambda: self.currency_check_box())
        self.FixUnicodeCheckBox.toggled.connect(
            lambda: self.unicode_check_box())
        self.LowercaseCheckBox.toggled.connect(
            lambda: self.lowercase_check_box())
        self.VisualizeButton.clicked.connect(lambda: self.visualizer_start())

    # Functions
    def load_params(self):
        self.ParamTree.clear()
        file = self.ParamFileModel.data(self.ParamList.selectedIndexes()[0])
        self.p.loader(f"save/params/{file}", 'params')
        self.ParamTree.setEnabled(True)
        self.ParamLoadedLabel.setText(f"{file}")
        self.ParamTree.setHeaderLabels(['Section', 'Value'])
        for section, value in self.p.params_dict.items():
            root = QtWidgets.QTreeWidgetItem(self.ParamTree, [section])
            root.setExpanded(True)
            for key, val in value.items():
                if isinstance(val, list):
                    item = QtWidgets.QTreeWidgetItem([key])
                    for thing in val:
                        item2 = QtWidgets.QTreeWidgetItem()
                        item2.setData(1, 2, str(thing))
                        item2.setFlags(item.flags() | QtCore.Qt.ItemIsEditable)
                        item.addChild(item2)
                    root.addChild(item)
                    continue
                item = QtWidgets.QTreeWidgetItem([key])
                item.setData(1, 2, val)
                item.setFlags(item.flags() | QtCore.Qt.ItemIsEditable)
                root.addChild(item)

    def update_value(self):
        value = self.ParamTree.currentItem().data(1, 2)
        name = self.ParamTree.currentItem().text(0)
        parent = self.ParamTree.currentItem().parent().text(0)
        self.ParamTree.currentItem().setForeground(
            1, QtGui.QBrush(QtGui.QColor("red")))
        self.p.params_dict[parent][name] = value

    def mongo_update(self):
        if not self.connection:
            self.connection = Connect()
        total = self.connection.count()
        red_count = self.connection.collection.count(query={'target': True})
        blue_count = self.connection.collection.count(query={'target': False})
        article_count = self.connection.collection.count(
            query={'is article': True})
        latest_article = self.connection.collection.find_one(
            sort=[('date', -1)])['date']
        if self.connection.added_count:
            self.AddedCount.display(self.connection.added_count)
        self.LatestArticleDate.setText(latest_article.ctime())
        self.ConnectButton.setStyleSheet('background-color: green')
        self.ConnectButton.setText('CONNECTED')
        self.TotalCount.display(total)
        self.RedCount.display(red_count)
        self.BlueCount.display(blue_count)
        self.ArticleCount.display(article_count)

    def scraper_config_load(self):
        self.p.loader("dat/scraper.cfg", 'scraper')
        # Enable the scraper controls once a config is loaded.
        for widget in (self.ScraperDepthSlider, self.ScraperDepthNumber,
                       self.UseLocalCheckBox, self.UpsertCheckBox,
                       self.ScraperButton, self.ScraperLists,
                       self.NewlineCheckBox, self.PunctuationCheckBox,
                       self.EmailsCheckBox, self.ContradictionsCheckBox,
                       self.AccentsCheckBox, self.CurrencyCheckBox,
                       self.FixUnicodeCheckBox, self.LowercaseCheckBox):
            widget.setEnabled(True)
        self.ScraperDepthSlider.setValue(
            self.p.scraper_dict['Reddit_Params']['Scraper_Depth_Limit'])
        self.UseLocalCheckBox.setChecked(
            self.p.scraper_dict['Options']['Set_Local'])
        self.UpsertCheckBox.setChecked(
            self.p.scraper_dict['Options']['Upsert'])
        self.NewlineCheckBox.setChecked(
            self.p.scraper_dict['Pre_Processing']['Remove_Newline'])
        self.PunctuationCheckBox.setChecked(
            self.p.scraper_dict['Pre_Processing']['Remove_Punctuation'])
        self.EmailsCheckBox.setChecked(
            self.p.scraper_dict['Pre_Processing']['Remove_Emails'])
        self.ContradictionsCheckBox.setChecked(
            self.p.scraper_dict['Pre_Processing']['Remove_Contradictions'])
        self.AccentsCheckBox.setChecked(
            self.p.scraper_dict['Pre_Processing']['Remove_Accents'])
        self.CurrencyCheckBox.setChecked(
            self.p.scraper_dict['Pre_Processing']['Replace_Currency'])
        self.FixUnicodeCheckBox.setChecked(
            self.p.scraper_dict['Pre_Processing']['Fix_Unicode'])
        self.LowercaseCheckBox.setChecked(
            self.p.scraper_dict['Pre_Processing']['All_Lowercase'])
        self.scraper_lists()

    def depth_slider(self):
        self.p.scraper_dict['Reddit_Params'][
            'Scraper_Depth_Limit'] = self.ScraperDepthSlider.value()

    # The handlers below write the checkbox state straight into the config
    # dict; isChecked() replaces the checkState() == 2 comparisons, and plain
    # assignment replaces the no-op '==' comparisons.
    def use_local_check_box(self):
        self.p.scraper_dict['Options'][
            'Set_Local'] = self.UseLocalCheckBox.isChecked()

    def upsert_check_box(self):
        self.p.scraper_dict['Options'][
            'Upsert'] = self.UpsertCheckBox.isChecked()

    def newline_check_box(self):
        self.p.scraper_dict['Pre_Processing'][
            'Remove_Newline'] = self.NewlineCheckBox.isChecked()

    def punctuation_check_box(self):
        self.p.scraper_dict['Pre_Processing'][
            'Remove_Punctuation'] = self.PunctuationCheckBox.isChecked()

    def emails_check_box(self):
        self.p.scraper_dict['Pre_Processing'][
            'Remove_Emails'] = self.EmailsCheckBox.isChecked()

    def contradictions_check_box(self):
        self.p.scraper_dict['Pre_Processing'][
            'Remove_Contradictions'] = self.ContradictionsCheckBox.isChecked()

    def accents_check_box(self):
        self.p.scraper_dict['Pre_Processing'][
            'Remove_Accents'] = self.AccentsCheckBox.isChecked()

    def currency_check_box(self):
        self.p.scraper_dict['Pre_Processing'][
            'Replace_Currency'] = self.CurrencyCheckBox.isChecked()

    def unicode_check_box(self):
        self.p.scraper_dict['Pre_Processing'][
            'Fix_Unicode'] = self.FixUnicodeCheckBox.isChecked()

    def lowercase_check_box(self):
        self.p.scraper_dict['Pre_Processing'][
            'All_Lowercase'] = self.LowercaseCheckBox.isChecked()

    def scraper_lists(self):
        self.ScraperLists.clear()
        for x in self.p.scraper_dict:
            for section, value in self.p.scraper_dict[x].items():
                if isinstance(value, list):
                    root = QtWidgets.QTreeWidgetItem(self.ScraperLists,
                                                     [section])
                    root.setExpanded(False)
                    for thing in value:
                        item = QtWidgets.QTreeWidgetItem()
                        item.setData(1, 2, str(thing))
                        root.addChild(item)

    def run_scraper(self):
        self.p.linker(self.p.scraper_dict, 'scraper')
        scraper = Scraper(self.p.scraper)
        processing = Processing(self.p.scraper)
        scraper.run()
        self.ProgressBar.setValue(25)
        processing.pre_processor(scraper.scraper_df)
        self.ProgressBar.setValue(50)
        processing.spacy_processor(scraper.scraper_df)
        self.ProgressBar.setValue(75)
        self.connection = Connect(settings=None, mongo_cfg=self.p.scraper)
        self.connection.update_from_df(scraper.scraper_df)
        self.mongo_update()
        self.ProgressBar.setValue(100)

    def visualizer_start(self):
        self.p.linker(self.p.params_dict, 'params')
        self.connection.settings = self.p.params
        self.connection.query()
        if self.connection.settings.Query.Red_Blue_or_All == 'All':
            red_topics = Topic_Modeler(self.connection.red_df, self.p.params)
            red_topics.topic_modeler()
            red_topics.visualizer()
            image1 = QtGui.QPixmap(red_topics.save)
            image1 = image1.scaledToWidth(600, QtCore.Qt.SmoothTransformation)
            self.RedPlotView.resize(600, image1.height())
            self.RedPlotView.setPixmap(image1)
            blue_topics = Topic_Modeler(self.connection.blue_df, self.p.params)
            blue_topics.topic_modeler()
            blue_topics.visualizer()
            image2 = QtGui.QPixmap(blue_topics.save)
            image2 = image2.scaledToWidth(600, QtCore.Qt.SmoothTransformation)
            self.BluePlotView.resize(600, image2.height())
            self.BluePlotView.setPixmap(image2)
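# A minimal launch sketch (hypothetical entry point; assumes 'base' and 'ui'
# were produced by uic.loadUiType on the Designer .ui file):
if __name__ == '__main__':
    import sys
    app = QtWidgets.QApplication(sys.argv)
    panel = Control_Panel()
    panel.show()
    sys.exit(app.exec_())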