def getMigrationScript(collection):
    dbcon = DBConnection()
    db = MongoConsts.DB
    uri = dbcon.getRemoteConnectionString()
    # Export the collection from the local database, then import the dump into
    # the remote instance identified by the connection URI.
    script = f"""
    mongoexport --db={db} --collection={collection} --out={collection}.json
    mongoimport --collection={collection} --file={collection}.json --uri={uri}
    """
    return script
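# Usage sketch (an assumption, not part of the original module): write the
# generated mongoexport/mongoimport commands to a shell script. The collection
# name and output path are hypothetical placeholders.
def _example_write_migration_script(collection="videos", out="migrate.sh"):
    with open(out, "w") as fh:
        fh.write(getMigrationScript(collection))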
def getDF():
    connection = DBConnection()
    collection = connection.indianMediaVideoCollection
    rows = []
    for vid in collection.find():
        try:
            # List-style documents wrap the video under "items"; plain video
            # documents carry the snippet at the top level.
            typeList = vid["kind"] == Channels.VID_TYPE_LIST
            info = vid["items"][0] if typeList else vid
            url = f"www.youtube.com/watch?v={info['id']}"
            info = info["snippet"]
            channelId = Channels.reverseLookup(info["channelId"])
            ptitle = vid["playlist"]["snippet"]["title"] if "snippet" in vid["playlist"] else ""
            title = info["title"]
            desc = info["description"]
            date = info["publishedAt"]
        except Exception as e:
            print(vid.keys())
            print(vid)
            print(e)
            raise e

        rows.append([channelId, ptitle, date, title, desc, url])

    header = ["Channel Id", "Playlist Title", "Date", "Title", "Description", "Url"]
    df = pd.DataFrame(rows, columns=header)
    return df
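# Usage sketch (an assumption, not part of the original module): a quick sanity
# check on the video DataFrame, counting rows per channel.
def _example_videos_per_channel():
    df = getDF()
    return df.groupby("Channel Id")["Url"].count().sort_values(ascending=False)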
def test_save_article_to_db(self):
    self.sut = GoodNewsNetwork(DBConnection(), 1, 1)
    self.sut.save_article_to_db({"_id": "abc", "content": "asd"})

    q = {"_id": "abc"}
    coll = self.sut.dbconn.getCollection(MongoConsts.GOOD_NEWS_COLLECTION)
    a = coll.find_one(q)
    self.assertIsNotNone(a)

    coll.delete_one(q)
    a = coll.find_one(q)
    self.assertIsNone(a)
def getWordDatesDF(limit=None):
    connection = DBConnection()
    collection = connection.getCollection(MongoConsts.WORD_DATE_COLLECTION)

    dfs = []
    r = collection.find() if limit is None else collection.find().limit(limit)
    for vid in r:
        # Each document stores a per-word time series whose keys are
        # stringified tuples (the first element is a %m_%d_%y date).
        df = pd.read_json(json.dumps(vid["ts"]), orient="index")
        df.columns = [vid["word_id"]]
        dfs.append(df)

    merged = pd.concat(dfs, axis=1)
    merged.index = pd.MultiIndex.from_tuples(
        [literal_eval(i) for i in merged.index])

    # Sort chronologically by parsing the date level, then restore the original
    # string format and the two-level index.
    merged = merged.reset_index()
    merged["level_0"] = pd.to_datetime(merged["level_0"], format="%m_%d_%y")
    merged = merged.sort_values("level_0")
    merged["level_0"] = merged["level_0"].dt.strftime("%m_%d_%y")
    merged = merged.set_index(["level_0", "level_1"])
    return merged
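# Usage sketch (an assumption, not part of the original module): load a small
# sample and inspect the series for one word. "election" is a hypothetical
# word_id used purely for illustration.
def _example_inspect_word_series(word="election", limit=50):
    wd = getWordDatesDF(limit=limit)
    if word in wd.columns:
        return wd[word].dropna().head()
    return None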
def __init__(self):
    # Column dtypes used when loading each flat file; categorical and narrow
    # numeric types keep the in-memory footprint small.
    self.dtype_lookup = {
        FlatFiles.WORD_BY_DATE: {
            "channel_id": "category",
            "date": "str",
            "variable": "category",
            "value": "float16",
            "date_month": "int8",
            "date_week": "int8"
        }
    }
    self.cache = {}
    self.db = DBConnection()
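# Usage sketch (an assumption, not part of the original class): a dtype map of
# this shape is the kind of argument pandas accepts directly when reading the
# WORD_BY_DATE flat file. The CSV path here is a placeholder.
def _example_read_word_by_date(path="word_by_date.csv"):
    import pandas as pd
    dtypes = {
        "channel_id": "category",
        "date": "str",
        "variable": "category",
        "value": "float16",
        "date_month": "int8",
        "date_week": "int8",
    }
    return pd.read_csv(path, dtype=dtypes)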
def setUp(self):
    dbconn = DBConnection()
    self.sut = TrendRank(dbconn, TrendRankTest.TermRankCollection)
    def get_article_content(self, url):
        try:
            resp = requests.get(url)
            soup = BeautifulSoup(resp.text)
            content = soup.select_one(
                GoodNewsNetwork.Selectors.POST_CONTENT).text
        except Exception as e:
            logging.error(f"Failed to get article - {url} - {e}")
            content = None
        return content


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--start-page",
                        default=1,
                        type=int,
                        help="Page to start scraping from")
    parser.add_argument("--max-pages",
                        default=1,
                        type=int,
                        help="Maximum number of pages to extract from the site")
    args = parser.parse_args()

    dbconn = DBConnection()
    GoodNewsNetwork(dbconn, args.start_page, args.max_pages).scrape_articles()
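# Usage sketch (an assumption, not part of the original module): fetch a single
# article body directly, bypassing the CLI entry point above. The URL is a
# placeholder, not a real article.
def _example_fetch_single_article(url="https://www.goodnewsnetwork.org/example-article/"):
    scraper = GoodNewsNetwork(DBConnection(), 1, 1)
    return scraper.get_article_content(url)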
        return self.getDBConnection().articleCollection.delete_one({"url": url})

    def getArticleFromDBByUrl(self, url):
        return self.getDBConnection().articleCollection.find_one({"url": url})

    def getAllArticlesFromDB(self):
        # Skip articles whose stored text is an HTTP error page rather than
        # real content.
        retrieval_query = {
            "text": {"$not": {"$regex": ".*(403|Forbidden|FAILED_TO_LOAD_PAGE).*"}}
        }
        return self.getDBConnection().articleCollection.find(retrieval_query)

    def getAllArticlesFromDBAsDf(self):
        articles = self.getAllArticlesFromDB()
        df = pd.DataFrame(articles)
        return df

    def writeAllArticlesToCSV(self, fpath):
        return self.writeArticlesToCSV(fpath, self.getAllArticlesFromDB())

    def writeArticlesToCSV(self, fpath, articles):
        df = pd.DataFrame(articles)
        print(df.shape)
        df.to_csv(fpath, index=False)

    def runJob(self, fpath="./test.csv", limit=-1):
        df = self.getDF()
        self.saveAllUrlContentInDB(df, limit)
        self.writeAllArticlesToCSV(fpath)


if __name__ == "__main__":
    dbC = DBConnection()
    ArticleScraper(dbC).writeAllArticlesToCSV("./test.csv")
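# Usage sketch (an assumption, not part of the original module): load the
# filtered articles into a DataFrame for ad-hoc analysis instead of writing
# them straight to CSV.
def _example_articles_overview():
    scraper = ArticleScraper(DBConnection())
    df = scraper.getAllArticlesFromDBAsDf()
    print(df.shape)
    return df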
def migrate(collection):
    dbcon = DBConnection()
    localCstr = dbcon.getLocalConnectionString()
    remoteClient = dbcon.getRemoteClient()
    db = remoteClient[MongoConsts.DB]
    # pymongo's Database has no cloneCollection() helper, so issue the server
    # command directly. Note: the cloneCollection command only exists on
    # MongoDB servers older than 4.2, and "from" expects the source host.
    db.command({
        "cloneCollection": f"{MongoConsts.DB}.{collection}",
        "from": localCstr,
    })
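# Usage sketch (an assumption, not part of the original module): clone a
# hypothetical collection from the local instance to the remote one.
def _example_migrate_videos():
    migrate("indian_media_video")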
def __init__(self):
    super().__init__(DBConnection(), MongoConsts.TERM_RANK_COLLECTION)
            d.index = pd.MultiIndex.from_tuples([literal_eval(i) for i in d.index])
            d.columns = [name]
            jsis.append(d)

        if len(jsis) < 1:
            return None

        df = pd.concat(jsis, axis=1)
        # df = self._get_all_series_ranked(df).reset_index().rename(
        #     columns={"level_0": TrendRank.COLS.DATE, "level_1": TrendRank.COLS.CHNL})
        df = df.reset_index().rename(
            columns={"level_0": TrendRank.COLS.DATE, "level_1": TrendRank.COLS.CHNL})
        df[TrendRank.COLS.DATE] = pd.to_datetime(df[TrendRank.COLS.DATE], format="%m_%d_%y")
        df = df.sort_values(TrendRank.COLS.DATE)
        return df

    def run_job(self):
        logging.info("Starting Job")
        df = self.get_df()
        rm = self.get_rank_matrix(df)
        # print(rm["coronavirus"][rm["coronavirus"] != -1])
        self.save_rank_matrix(rm)


@singleton
class TrendRankDataFrameService(TrendRank):

    def __init__(self):
        super().__init__(DBConnection(), MongoConsts.TERM_RANK_COLLECTION)


if __name__ == "__main__":
    TrendRank(DBConnection(), MongoConsts.TERM_RANK_COLLECTION).run_job()
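# Usage sketch (an assumption, not part of the original module): use the
# singleton DataFrame service to pull the term frame and peek at one term's
# trend. "coronavirus" mirrors the commented-out debug line above and is only
# illustrative.
def _example_term_trend(term="coronavirus"):
    service = TrendRankDataFrameService()
    df = service.get_df()
    if df is None or term not in df.columns:
        return None
    return df[[TrendRank.COLS.DATE, TrendRank.COLS.CHNL, term]].tail()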
def __init__(self):
    self.df = getDF()
    print(self.df.head())
    self.dbcon = DBConnection()
    self.max_terms_to_save = 1000
def __init__(self):
    dbconn = DBConnection()
    super().__init__(dbconn, MongoConsts.TD_SVD_COMP_COLLECTION,
                     MongoConsts.TD_SVD_DF_COLLECTION,
                     MongoConsts.TD_TOPICS_COLLECTION,
                     MongoConsts.TD_DOC_TOPICS_COLLECTION)
def setUp(self):
    self.sut = ArticleScraper(DBConnection())
def __init__(self):
    dbconn = DBConnection()
    super().__init__(dbconn, MongoConsts.QD_TRBD_TERM_DIST_COLLECTION,
                     MongoConsts.QD_TRBD_GRP_DIST_COLLECTION)
import math
import numpy as np

logging.basicConfig(level=logging.INFO)

from IndianMedia.constants import Channels, MongoConsts, Creds
from IndianMedia.mongointf.pymongoconn import DBConnection

# __file__ is undefined when this module is run interactively (e.g. in a
# notebook); fall back to a path under Analytics/IndianMedia so the key file
# can still be located.
try:
    __file__
except NameError:
    __file__ = os.path.abspath(
        os.path.join(".", "..", "Analytics", "IndianMedia", Creds.KEY_FILE))

connection = DBConnection()

f = os.path.abspath(os.path.join(os.path.dirname(__file__), Creds.KEY_FILE))
key = open(f, "r").read().strip("\n")
api = pyyoutube.Api(api_key=key)

# p = pl.items[0]


def GetChannelVideoInfo(channelId, daysSince, limit):
    # Only search for videos published within the last `daysSince` days.
    afterDate = (datetime.datetime.now() +
                 datetime.timedelta(days=-daysSince)).isoformat() + "Z"
    videos = api.search(parts="snippet",
                        channel_id=channelId,
                        count=limit,