def transport_osdatetime_to_null_infotime_values_in_subscribers():
    global g_nsubs_is_none, gcount
    session = con.Session()
    subs = session.query(sam.YTDailySubscribersSA).\
        filter(sam.YTDailySubscribersSA.infotime == None).\
        order_by(sam.YTDailySubscribersSA.infodate).\
        all()
    for i, sub in enumerate(subs):
        sname = sub.ytchannel.sname
        filename = autof.form_datedpage_filename_with_triple(
            sub.infodate, sname, sub.ytchannelid)
        filepath = autof.form_datedpage_filepath_with_triple(
            sub.infodate, sname, sub.ytchannelid)
        print(sname, sub, 'infotime', sub.infotime)
        osstat_nt = os.stat(filepath)
        mtime = osstat_nt.st_mtime
        dt = datetime.datetime.fromtimestamp(mtime)
        print(i + 1, dt, filepath)
        pdate = dtfs.convert_datetime_to_date(dt)
        ptime = dtfs.extract_time_from_datetime(dt)
        if sub.infodate != pdate:
            g_nsubs_is_none += 1
            line = '============= sub.infodate %s != pdate %s =============' % (
                sub.infodate, pdate)
            print(line)
            continue
        gcount += 1
        sub.infotime = ptime
    print('g_nsubs_is_none', g_nsubs_is_none)
    print('gcount', gcount)
    session.commit()
    session.close()
def look_up_dates_data():
    session = con.Session()
    inidate, findate = autof.find_dateini_n_datefin_thru_yyyymmdd_level3_folders()
    for refdate in dtfs.generate_daterange_with_dateini_n_datefin(inidate, findate):
        look_up_modifiedtime_of_htmlvideopagefiles(refdate, session)
    session.close()
    print(inidate, findate)
def insert(self):
    shouldbe_abspath = os.path.join(self.canon_filefolder_abspath, self.filename)
    if not os.path.isfile(shouldbe_abspath):
        print('filename', self.filename, 'is not in folder', self.canon_filefolder_abspath)
        return False
    digest = self.get_n_set_sha1_for_file_on_datedfolder_or_None()
    if digest is None:
        return False
    session = saconn.Session()
    newsarticle = session.query(NewsArticlesSA).filter(
        NewsArticlesSA.sha1 == digest).first()
    if newsarticle:
        print('newsarticle with sha1 %s already in db' % str(digest))
        session.close()
        return False
    newsarticle = NewsArticlesSA()
    newsarticle.title = self.filename
    newsarticle.filename = self.filename
    newsarticle.sha1 = self.sha1
    newsarticle.publishdate = self.refdate
    print("Adding", newsarticle)
    session.add(newsarticle)
    session.commit()
    session.close()
    return True
def show_dates_n_times_of_videoitems_table():
    session = con.Session()
    videitems = session.query(sam.YTVideoItemInfoSA).all()
    seq = 0
    field_names = [
        'seq', 'ytvideoid', 'nname', 'titletrunc', 'infodate', 'infotime',
        'publishdatetime', 'calendarStr', 'created_at'
    ]
    ptab = remake_ptab(field_names)
    for vitem in videitems:
        seq += 1
        ptab.add_row([
            seq,
            vitem.ytvideoid,
            vitem.ytchannel.nname,
            vitem.title[:30] if vitem.title else '',  # 'titletrunc' column; the 30-char cap is an assumed value
            vitem.infodate,
            vitem.infotime,
            vitem.publishdatetime,
            vitem.published_time_ago,
            vitem.created_at,
        ])
        if seq % 40 == 0:
            print(ptab)
            ptab = remake_ptab(field_names)
        # if seq > 400:
        #     break
    print(ptab)
    session.close()
def scrape_ytchannel():
    session = con.Session()
    ytchannel = session.query(sam.YTChannelSA).filter(
        sam.YTChannelSA.nname.like('%plantão br%')).first()
    print(ytchannel)
    drill.extract_videoitems_from_videopage(ytchannel.ytchannelid, '2020-08-21')
    session.close()
def look_up_empty_infotime_values():
    """
    Deletes YTVideoViewsSA records with a null infotime and
    lists (without deleting) YTVideoItemInfoSA records with a null infotime.
    :return:
    """
    bool_deleted = False
    session = con.Session()
    vviews = session.query(sam.YTVideoViewsSA). \
        filter(sam.YTVideoViewsSA.infotime == None). \
        all()
    n_of_vviews_with_null_infotime = len(vviews)
    print('count vviews', n_of_vviews_with_null_infotime)
    for i, vview in enumerate(vviews):
        print(i+1, vview.infodate, vview.views, vview.ytvideo.title,
              vview.ytvideo.ytchannel.nname)
        bool_deleted = True
        session.delete(vview)
    if bool_deleted:
        print('records vviews deleted =', n_of_vviews_with_null_infotime)
        session.commit()
    vitems = session.query(sam.YTVideoItemInfoSA). \
        filter(sam.YTVideoItemInfoSA.infotime == None). \
        all()
    n_of_vitems_with_null_infotime = len(vitems)
    print('count vitems', n_of_vitems_with_null_infotime)
    for i, vitem in enumerate(vitems):
        print(i+1, vitem.infodate, vitem.ytchannel.nname)
        # bool_deleted = True
        # session.delete(vitem)
    if bool_deleted:
        total = n_of_vitems_with_null_infotime + n_of_vviews_with_null_infotime
        print('records deleted =', total,
              'vviews', n_of_vviews_with_null_infotime,
              'vitems', n_of_vitems_with_null_infotime)
    session.close()
def find_by_likename(likename):
    session = con.Session()
    ytchannel = session.query(sam.YTChannelSA).\
        filter(sam.YTChannelSA.nname.like("%" + likename + "%")).first()
    # print(ytchannel)
    session.close()
    return ytchannel
def get_most_recent_video_for(ytchannel):
    session = con.Session()
    vinfo = session.query(sam.YTVideoItemInfoSA). \
        filter(sam.YTVideoItemInfoSA.ytchannelid == ytchannel.ytchannelid).\
        order_by(desc(sam.YTVideoItemInfoSA.publishdatetime)).\
        first()
    session.close()
    return vinfo
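# Usage sketch (illustrative only; the example function name is hypothetical):
# find_by_likename() and get_most_recent_video_for() above can be chained to
# print the latest stored video for a channel matched by a partial nname.
def example_print_most_recent_video(likename):
    ytchannel = find_by_likename(likename)
    if ytchannel is None:
        print('No ytchannel matching', likename)
        return
    vinfo = get_most_recent_video_for(ytchannel)
    if vinfo is None:
        print('No videos stored for', ytchannel.nname)
        return
    print(ytchannel.nname, vinfo.publishdatetime, vinfo.title)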
def process_fetch():
    session = con.Session()
    confirm_msg_to_interpolate = None
    ytchannel, nminutes = get_channel_from_args_with_open_session(
        session, confirm_msg_to_interpolate)
    print('ytchannel', ytchannel)
    for i, vitem in enumerate(ytchannel.vinfolist):
        if vitem.duration_in_sec and vitem.duration_in_sec < nminutes * 60 + 1:
            print(vitem.ytvideoid, i + 1, vitem.duration_in_hms, vitem.title)
    session.close()
def fetch_sname_with_ytchannelid(ytchannelid):
    session = saconn.Session()
    ytchannel = session.query(sam.YTChannelSA.sname). \
        filter(sam.YTChannelSA.ytchannelid == ytchannelid). \
        first()
    session.close()
    if ytchannel is not None:
        return ytchannel.sname
    return None
def fetch_ytchannel_with_ytchannelid(ytchannelid, p_session=None):
    if p_session is None:
        session = saconn.Session()
    else:
        session = p_session
    ytchannel = session.query(sam.YTChannelSA). \
        filter(sam.YTChannelSA.ytchannelid == ytchannelid). \
        first()
    if p_session is None:
        session.close()
    return ytchannel
def fetch_ytchannel_with_likenname(likenname, p_session=None):
    if p_session is None:
        session = saconn.Session()
    else:
        session = p_session
    ytchannel = session.query(sam.YTChannelSA). \
        filter(sam.YTChannelSA.nname.like(likenname)). \
        first()
    if p_session is None:
        session.close()
    return ytchannel
def fetch_all_channels_from_db(p_session=None):
    if p_session is None:
        session = saconn.Session()
    else:
        session = p_session
    dbytchannels = session.query(sam.YTChannelSA). \
        order_by(sam.YTChannelSA.nname). \
        all()
    if p_session is None:
        session.close()
    return dbytchannels
def fetch_all_active_ytchannels_in_db(p_session=None):
    if p_session is None:
        session = saconn.Session()
    else:
        session = p_session
    ytchannels = session.query(sam.YTChannelSA).\
        filter(sam.YTChannelSA.active == 1).\
        order_by(sam.YTChannelSA.nname).\
        all()
    if p_session is None:
        session.close()
    return ytchannels
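# Usage sketch (illustrative; the example function name is hypothetical): the
# fetch_* helpers above accept an optional open session, so several lookups
# can share a single connection instead of opening one per call.
def example_fetch_channels_sharing_one_session(ytchannelids):
    session = saconn.Session()
    ytchannels = [
        fetch_ytchannel_with_ytchannelid(ytchannelid, session)
        for ytchannelid in ytchannelids
    ]
    session.close()
    return ytchannels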
def verify_videopagefiles_w_no_corresponding_dbsubs():
    """
    About 800 pages were committed with the function below, i.e., they received
    their missing subscriber numbers. There is one subscriber number per day per
    channel, and some were missing within the last 3 months.

    After those ~800 recoveries, 22 are still missing, with scraperesult
    returning None; i.e., there are still 22 pages that may need the old
    scraping routine for fetching n_of_subscribers. It is probably possible to
    treat them by picking up the 'museum' code.

    TO-DO (Monday, 24 August 2020, 01:43): try to rescrape these with the old routine.
    :return:
    """
    count = 0
    n_commits = 0
    session = con.Session()
    for abspath in autof.generate_all_ytvideopages_abspath_asc_date():
        strdate, sname, ytchannelid = regexp.find_triple_date_sname_n_ytchid_in_filepath(
            abspath)
        subs = session.query(sam.YTDailySubscribersSA).\
            filter(sam.YTDailySubscribersSA.ytchannelid == ytchannelid).\
            filter(sam.YTDailySubscribersSA.infodate == strdate).\
            first()
        if subs:
            continue
        count += 1
        print(count, strdate, sname, ytchannelid, abspath)
        t_osstat = os.stat(abspath)
        timestamp = t_osstat.st_mtime
        dt = datetime.datetime.fromtimestamp(timestamp)
        filedate = dtfs.convert_datetime_to_date(dt)
        pdate = dtfs.get_refdate_from_strdate_or_none(strdate)
        if pdate != filedate:
            print('strdate', strdate, 'pdate', pdate, 'filedate', filedate, 'dt', dt)
            continue
        filetime = dtfs.extract_time_from_datetime(dt)
        text = open(abspath, encoding='utf8').read()
        n_of_subscribers = scrape_n_return_number_of_subscribers_from_channels_pagetext(
            text)
        # print('n_of_subscribers', n_of_subscribers)
        if n_of_subscribers is None:
            continue
        subs = sam.YTDailySubscribersSA()
        subs.ytchannelid = ytchannelid
        subs.infodate = pdate
        subs.infotime = filetime
        subs.subscribers = n_of_subscribers
        session.add(subs)
        n_commits += 1
        print('n_commits', n_commits, 'committing', subs)
        session.commit()
    print('n_commits', n_commits, 'missing', count)
    session.close()
def list_channels_scrapedates():
    sess = con.Session()
    likenname = '%plantão br%'
    ytchannel = fetcher.fetch_ytchannel_with_likenname(likenname, sess)
    print('list_channels_scrapedates ytchannel', ytchannel)
    if ytchannel is None:
        sess.close()
        return
    c = 0
    for subs in ytchannel.daily_subscribers:
        c += 1
        if c > 3:
            break
        print('\t', subs.infodate, subs.subscribers)
    sess.close()
def get_abspaths_from_db():
    base_abspaths = []
    session = saconn.Session()
    newsarticles_basefoldername_as_key = get_newsarticles_basefoldername_as_key()
    treebase_abspath_recs = session.query(TreeBaseAbsPath). \
        filter(TreeBaseAbsPath.app_tree_strkey == newsarticles_basefoldername_as_key). \
        order_by(TreeBaseAbsPath.lookup_order). \
        all()
    for treebase_abspath_rec in treebase_abspath_recs:
        base_abspath = treebase_abspath_rec.abspath
        base_abspaths.append(base_abspath)
    session.close()
    return base_abspaths
def process():
    """
    filepath = '/media/friend/SAMSUNG/Ytvideos BRA Politics/z Other ytchannels/000_scrape_ytdata/2020/2020-08' \
               '/2020-08-13/2020-08-13 Plantão Br [cUC3-JLGJpMKwymQoRFJMkgSg].html'
    text = open(filepath, encoding='utf8').read()
    :return:
    """
    ytchannel = dld1.get_channel_from_args()
    if ytchannel is None:
        sess = con.Session()
        ytchannel = sess.query(sam.YTChannelSA).\
            filter(sam.YTChannelSA.nname.like(DEFAULT_LIKENAME_FOR_CHANNEL_FIND)).first()
        sess.close()
    ytvideopage = ytvpM.YtVideosPage(ytchannel.ytchannelid, ytchannel.nname)
    show(ytvideopage)
def get_newsarticles_base_abspath_from_db_lookup_strict(lookup_order=None, strict=True):
    base_abspath = None
    session = saconn.Session()
    newsarticles_basefoldername_as_key = get_newsarticles_basefoldername_as_key()
    treebase_abspath_rec = session.query(TreeBaseAbsPath). \
        filter(TreeBaseAbsPath.app_tree_strkey == newsarticles_basefoldername_as_key). \
        filter(TreeBaseAbsPath.lookup_order == lookup_order). \
        first()
    if treebase_abspath_rec:
        if os.path.isdir(treebase_abspath_rec.abspath):
            base_abspath = treebase_abspath_rec.abspath
    session.close()
    if base_abspath is None and strict:
        error_msg = 'Error: The lookup_order (%s) does not exist in ' \
                    'get_newsarticles_base_abspath_from_db_lookup_strict()' % str(lookup_order)
        raise ValueError(error_msg)
    return base_abspath
def look_up_empty_publishdatetime_values():
    bool_deleted = False
    session = con.Session()
    vitems = session.query(sam.YTVideoItemInfoSA). \
        filter(sam.YTVideoItemInfoSA.publishdatetime == None). \
        all()
    n_of_deleted_rows = 0
    print('count YTVideoItemInfoSA with null publishdatetime:', len(vitems))
    for i, vitem in enumerate(vitems):
        print(i+1, 'nviews', vitem.vviewlist.count(), vitem.publishdatetime,
              vitem.published_time_ago, vitem.infodate, vitem.title,
              vitem.ytchannel.nname)
        bool_deleted = True
        for e in vitem.vviewlist:
            n_of_deleted_rows += 1
            session.delete(e)
        n_of_deleted_rows += 1
        session.delete(vitem)
    if bool_deleted:
        print('Committing n_of_deleted_rows', n_of_deleted_rows)
        session.commit()
    session.close()
def get_newsarticles_base_abspath_from_db(lookup_order=None):
    if lookup_order is None or type(lookup_order) != int:
        lookup_order = 1
    base_abspath = get_newsarticles_base_abspath_from_db_lookup_strict(
        lookup_order, strict=False)
    if base_abspath is None:
        # try first available, ie, one that exists in OS
        session = saconn.Session()
        newsarticles_basefoldername_as_key = get_newsarticles_basefoldername_as_key()
        treebase_abspath_recs = session.query(TreeBaseAbsPath). \
            filter(TreeBaseAbsPath.app_tree_strkey == newsarticles_basefoldername_as_key). \
            order_by(TreeBaseAbsPath.lookup_order). \
            all()
        for treebase_abspath_rec in treebase_abspath_recs:
            if os.path.isdir(treebase_abspath_rec.abspath):
                base_abspath = treebase_abspath_rec.abspath
                break
        session.close()
    return base_abspath
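# Usage sketch (illustrative; the example function name is hypothetical): the
# strict variant above raises ValueError when the requested lookup_order is not
# registered (or its folder is missing on this machine), while the plain
# variant falls back to the first registered folder that exists on disk.
def example_resolve_newsarticles_base_abspath(lookup_order=1):
    try:
        return get_newsarticles_base_abspath_from_db_lookup_strict(lookup_order)
    except ValueError:
        return get_newsarticles_base_abspath_from_db(lookup_order)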
def get_channel_from_args(confirm_msg_to_interpolate=None):
    if len(sys.argv) < 2:
        print('''
 ================  Missing Parameter  ================
 Please enter an nname, or part of one, to download a ytchannel.
 ================  ================   ================
''')
        ans = input('Press any key and/or [ENTER] ')
        return None
    likename = sys.argv[1]
    session = con.Session()
    ytchannel = session.query(sam.YTChannelSA).filter(
        sam.YTChannelSA.nname.contains(likename)).first()
    session.close()
    if ytchannel is None:
        return None
    if confirm_msg_to_interpolate is not None:
        confirm_msg = confirm_msg_to_interpolate % ytchannel.nname
        ans = input(confirm_msg)
        if ans not in ['', 'Y', 'y']:
            return None
    return ytchannel
def update_subscribers_scrape_for_date(refdate):
    global g_nsubs_is_none
    sess = con.Session()
    ytchannelids_n_cdatetimes = autof.find_ytchannelid_n_videopagefilemodifiedtimestamp_tuplelist_for_date(
        refdate)
    for ytchannelid_n_cdatetime in ytchannelids_n_cdatetimes:
        ytchannelid, cdatetime = ytchannelid_n_cdatetime
        ytchannel = fetcher.fetch_ytchannel_with_ytchannelid(ytchannelid, sess)
        if ytchannel is None:
            continue
        dt = datetime.datetime.fromtimestamp(cdatetime)
        pdate = dtfs.convert_datetime_to_date(dt)
        n_of_subs = scrape_channel_from_its_datedfile_n_return_number_of_subscribers(
            ytchannel, pdate)
        if n_of_subs is None:
            g_nsubs_is_none += 1
            print(g_nsubs_is_none, 'n_of_subs is None')
            continue
        print(ytchannel.nname, refdate, n_of_subs, dt)
        update_subscribers(ytchannel, refdate, n_of_subs, dt, sess)
    sess.close()
def list_ytchannel_videos_less_than_nmin(ytchannelid=None, nmin=None):
    if ytchannelid is None:
        ytchannelid = 'ueduardoamoreira'
    if nmin is None:
        nmin = DEFAULT_SHORTIVIDEO_DURATION_CAP_IN_MIN
    if type(nmin) != int:
        try:
            nmin = int(nmin)
        except ValueError:
            nmin = DEFAULT_SHORTIVIDEO_DURATION_CAP_IN_MIN
    session = con.Session()
    nsec = nmin * 60
    ytchannel = session.query(sam.YTChannelSA).filter(
        sam.YTChannelSA.ytchannelid == ytchannelid).first()
    print(ytchannel)
    ytvideoids = []
    for i, vinfo in enumerate(ytchannel.vinfolist):
        if vinfo.duration_in_sec and vinfo.duration_in_sec <= nsec:
            hms = dtfs.transform_duration_in_sec_into_hms(
                vinfo.duration_in_sec)
            print(i + 1, hms, vinfo.title)
            ytvideoids.append(vinfo.ytvideoid)
    for ytvideoid in ytvideoids:
        print(ytvideoid)
    session.close()
def get_all_ytchannelids():
    session = saconn.Session()
    ytchannels = session.query(YTChannelSA).order_by(YTChannelSA.nname).all()
    ytchannelids = [ytchannel.ytchannelid for ytchannel in ytchannels]
    session.close()
    return ytchannelids
def list_last_subs():
    session = con.Session()
    ytchannels = fetcher.fetch_all_active_ytchannels_in_db(session)
    for ytchannel in ytchannels:
        list_last_subscribers_for_ytchannel(ytchannel, session)
    session.close()