Esempio n. 1
0
def transport_osdatetime_to_null_infotime_values_in_subscribers():
    """Backfill NULL infotime on daily-subscriber rows from file mtimes.

    For each YTDailySubscribersSA row missing infotime, stat the dated
    page file on disk; when the file's modification date equals the row's
    infodate, store the file's modification time into infotime.  Rows
    whose dates disagree are skipped and counted in g_nsubs_is_none;
    updated rows are counted in gcount.  Commits once at the end.
    """
    global g_nsubs_is_none, gcount
    session = con.Session()
    query = session.query(sam.YTDailySubscribersSA)
    query = query.filter(sam.YTDailySubscribersSA.infotime == None)
    query = query.order_by(sam.YTDailySubscribersSA.infodate)
    subs = query.all()
    for seq, sub in enumerate(subs, start=1):
        sname = sub.ytchannel.sname
        filename = autof.form_datedpage_filename_with_triple(
            sub.infodate, sname, sub.ytchannelid)
        filepath = autof.form_datedpage_filepath_with_triple(
            sub.infodate, sname, sub.ytchannelid)
        print(sname, sub, 'infotime', sub.infotime)
        mtime = os.stat(filepath).st_mtime
        dt = datetime.datetime.fromtimestamp(mtime)
        print(seq, dt, filepath)
        pdate = dtfs.convert_datetime_to_date(dt)
        ptime = dtfs.extract_time_from_datetime(dt)
        if sub.infodate != pdate:
            g_nsubs_is_none += 1
            line = '============= sub.infodate %s != pdate %s =============' % (
                sub.infodate, pdate)
            print(line)
            continue
        gcount += 1
        sub.infotime = ptime

    print('g_nsubs_is_none', g_nsubs_is_none)
    print('gcount', gcount)
    session.commit()
    session.close()
def look_up_dates_data():
  """Walk each date in the yyyy-mm-dd folder range and look up its html video pages."""
  session = con.Session()
  inidate, findate = autof.find_dateini_n_datefin_thru_yyyymmdd_level3_folders()
  daterange = dtfs.generate_daterange_with_dateini_n_datefin(inidate, findate)
  for refdate in daterange:
    look_up_modifiedtime_of_htmlvideopagefiles(refdate, session)
  session.close()
  print(inidate, findate)
Esempio n. 3
0
 def insert(self):
     """Insert this article into the db when its file exists and its sha1 is new.

     Returns True when a new NewsArticlesSA row was committed; False when
     the file is missing from the canonical folder, the digest could not
     be computed, or a row with the same sha1 already exists.
     """
     shouldbe_abspath = os.path.join(self.canon_filefolder_abspath,
                                     self.filename)
     if not os.path.isfile(shouldbe_abspath):
         print('filename', self.filename, 'is not in folder',
               shouldbe_abspath)
         return False
     digest = self.get_n_set_sha1_for_file_on_datedfolder_or_None()
     if digest is None:
         return False
     session = saconn.Session()
     newsarticle = session.query(NewsArticlesSA).filter(
         NewsArticlesSA.sha1 == digest).first()
     if newsarticle:
         print('newsarticle with sha1 %s already in db' % str(digest))
         session.close()
         return False
     newsarticle = NewsArticlesSA()
     newsarticle.title = self.filename
     newsarticle.filename = self.filename
     # self.sha1 was set by get_n_set_sha1_for_file_on_datedfolder_or_None()
     newsarticle.sha1 = self.sha1
     newsarticle.publishdate = self.refdate
     print("Adding ", newsarticle)  # bug fix: message read "Addimg "
     session.add(newsarticle)
     session.commit()
     session.close()
     return True
Esempio n. 4
0
def show_dates_n_times_of_videoitems_table():
    """Print all YTVideoItemInfoSA rows as prettytable pages of 40 rows each.

    Bug fix: field_names declares 9 columns but the original add_row passed
    only 8 values — the 'titletrunc' column was missing, which makes
    prettytable raise on mismatched row length.  A truncated title is now
    supplied for that column.
    """
    session = con.Session()
    videitems = session.query(sam.YTVideoItemInfoSA).all()
    seq = 0
    field_names = [
        'seq', 'ytvideoid', 'nname', 'titletrunc', 'infodate', 'infotime',
        'publishdatetime', 'calendarStr', 'created_at'
    ]
    ptab = remake_ptab(field_names)
    for vitem in videitems:
        seq += 1
        ptab.add_row([
            seq,
            vitem.ytvideoid,
            vitem.ytchannel.nname,
            vitem.title[:30],  # 'titletrunc'; 30-char cap is a review choice — adjust if needed
            vitem.infodate,
            vitem.infotime,
            vitem.publishdatetime,
            vitem.published_time_ago,  # displayed under 'calendarStr'
            vitem.created_at,
        ])
        # flush a page every 40 rows and start a fresh table
        if seq % 40 == 0:
            print(ptab)
            ptab = remake_ptab(field_names)
    print(ptab)
def scrape_ytchannel():
    """Scrape video items for the channel whose nname matches 'plantão br'."""
    session = con.Session()
    name_filter = sam.YTChannelSA.nname.like('%plantão br%')
    ytchannel = session.query(sam.YTChannelSA).filter(name_filter).first()
    print(ytchannel)
    # NOTE(review): the session is intentionally left as in the original
    # (never closed here); confirm whether closing is desired.
    drill.extract_videoitems_from_videopage(ytchannel.ytchannelid,
                                            '2020-08-21')
def look_up_empty_infotime_values():
  """

  :return:
  """
  bool_deleted = False
  session = con.Session()
  vviews = session.query(sam.YTVideoViewsSA). \
    filter(sam.YTVideoViewsSA.infotime == None). \
    all()
  n_of_vviews_with_null_infotime = len(vviews)
  print('count vviews', )
  for i, vview in enumerate(vviews):
    print(i+1, vview.infodate, vview.views, vview.ytvideo.title, vview.ytvideo.ytchannel.nname)
    bool_deleted = True
    session.delete(vview)
  if bool_deleted:
    print('records vviews deleted =',  n_of_vviews_with_null_infotime)
    session.commit()

  vitems = session.query(sam.YTVideoItemInfoSA). \
    filter(sam.YTVideoItemInfoSA.infotime == None). \
    all()
  n_of_vitems_with_null_infotime = len(vitems)
  print('count vitews', len(vitems))
  for i, vitem in enumerate(vitems):
    print(i+1, vitem.infodate, vitem.ytchannel.nname)
    # bool_deleted = True
    # session.delete(vitem)
  if bool_deleted:
    total = n_of_vitems_with_null_infotime + n_of_vviews_with_null_infotime
    print('records deleted =', total, 'vviews', n_of_vviews_with_null_infotime, 'vitems', n_of_vitems_with_null_infotime)

  session.close()
Esempio n. 7
0
def find_by_likename(likename):
    """Return the first YTChannelSA whose nname contains likename, or None."""
    session = con.Session()
    pattern = "%" + likename + "%"
    ytchannel = session.query(sam.YTChannelSA).filter(
        sam.YTChannelSA.nname.like(pattern)).first()
    session.close()
    return ytchannel
def get_most_recent_video_for(ytchannel):
    """Return the channel's newest YTVideoItemInfoSA by publishdatetime, or None."""
    session = con.Session()
    query = session.query(sam.YTVideoItemInfoSA)
    query = query.filter(
        sam.YTVideoItemInfoSA.ytchannelid == ytchannel.ytchannelid)
    query = query.order_by(desc(sam.YTVideoItemInfoSA.publishdatetime))
    vinfo = query.first()
    session.close()
    return vinfo
Esempio n. 9
0
def process_fetch():
    """Print every video of the CLI-selected channel shorter than nminutes."""
    session = con.Session()
    ytchannel, nminutes = get_channel_from_args_with_open_session(
        session, None)
    print('ytchannel', ytchannel)
    duration_cap_in_sec = nminutes * 60 + 1  # hoisted loop invariant
    for seq, vitem in enumerate(ytchannel.vinfolist, start=1):
        if vitem.duration_in_sec and vitem.duration_in_sec < duration_cap_in_sec:
            print(vitem.ytvideoid, seq, vitem.duration_in_hms, vitem.title)
    session.close()
def fetch_sname_with_ytchannelid(ytchannelid):
    """Return the sname of the channel with the given ytchannelid, or None.

    Bug fixes: the original queried only the nname column yet read .sname
    off the result row (AttributeError on any hit), and leaked the session
    on the found path (close() was only reached when nothing was found).
    The sname column itself is now queried and the session is always
    closed via try/finally.
    """
    session = saconn.Session()
    try:
        row = session.query(sam.YTChannelSA.sname). \
            filter(sam.YTChannelSA.ytchannelid == ytchannelid). \
            first()
        if row is not None:
            return row.sname
        return None
    finally:
        session.close()
def fetch_ytchannel_with_ytchannelid(ytchannelid, p_session=None):
    """Return the YTChannelSA with the given ytchannelid, or None.

    When p_session is None a local session is opened and closed here;
    otherwise the caller's session is used and left open.
    """
    owns_session = p_session is None
    session = saconn.Session() if owns_session else p_session
    ytchannel = session.query(sam.YTChannelSA). \
        filter(sam.YTChannelSA.ytchannelid == ytchannelid). \
        first()
    if owns_session:
        session.close()
    return ytchannel
def fetch_ytchannel_with_likenname(likenname, p_session=None):
    """Return the first YTChannelSA whose nname matches the LIKE pattern, or None.

    When p_session is None a local session is opened and closed here;
    otherwise the caller's session is used and left open.
    """
    owns_session = p_session is None
    session = saconn.Session() if owns_session else p_session
    ytchannel = session.query(sam.YTChannelSA). \
        filter(sam.YTChannelSA.nname.like(likenname)). \
        first()
    if owns_session:
        session.close()
    return ytchannel
def fetch_all_channels_from_db(p_session=None):
    """Return all YTChannelSA rows ordered by nname.

    Improvement: p_session now defaults to None for consistency with the
    sibling fetch_* helpers (the body already handled None; callers that
    pass a session positionally are unaffected).  When None, a local
    session is opened and closed here; otherwise the caller's session is
    used and left open.
    """
    if p_session is None:
        session = saconn.Session()
    else:
        session = p_session
    dbytchannels = session.query(sam.YTChannelSA). \
        order_by(sam.YTChannelSA.nname). \
        all()
    if p_session is None:
        session.close()
    return dbytchannels
def fetch_all_active_ytchannels_in_db(p_session=None):
    """Return all YTChannelSA rows with active == 1, ordered by nname.

    When p_session is None a local session is opened and closed here;
    otherwise the caller's session is used and left open.
    """
    owns_session = p_session is None
    session = saconn.Session() if owns_session else p_session
    query = session.query(sam.YTChannelSA)
    query = query.filter(sam.YTChannelSA.active == 1)
    query = query.order_by(sam.YTChannelSA.nname)
    ytchannels = query.all()
    if owns_session:
        session.close()
    return ytchannels
Esempio n. 15
0
def verify_videopagefiles_w_no_corresponding_dbsubs():
    """Create missing YTDailySubscribersSA rows by re-scraping page files.

    Walks every ytvideopage file in ascending date order.  For each file
    that has no subscribers record for its (ytchannelid, date) pair, the
    file's mtime date is checked against the date embedded in its path;
    when they agree, the page text is scraped for a subscriber count and
    a new row is committed.  Files whose dates disagree, or whose scrape
    returns None, are skipped.

    TO-DO (2020-08-24): roughly 22 pages still return None from the
    scraper; try re-scraping those with the old ('museum') routine.
    """
    count = 0
    n_commits = 0
    session = con.Session()
    for abspath in autof.generate_all_ytvideopages_abspath_asc_date():
        strdate, sname, ytchannelid = regexp.find_triple_date_sname_n_ytchid_in_filepath(
            abspath)
        subs = session.query(sam.YTDailySubscribersSA).\
          filter(sam.YTDailySubscribersSA.ytchannelid == ytchannelid).\
          filter(sam.YTDailySubscribersSA.infodate == strdate).\
          first()
        if subs:
            continue
        count += 1
        print(count, strdate, sname, ytchannelid, abspath)
        t_osstat = os.stat(abspath)
        timestamp = t_osstat.st_mtime
        dt = datetime.datetime.fromtimestamp(timestamp)
        filedate = dtfs.convert_datetime_to_date(dt)
        pdate = dtfs.get_refdate_from_strdate_or_none(strdate)
        if pdate != filedate:
            # the file was modified on a different day than its pathname
            # claims, so its mtime cannot be trusted for infotime
            print('strdate', strdate, 'pdate', pdate, 'filedate', filedate,
                  'dt', dt)
            continue
        filetime = dtfs.extract_time_from_datetime(dt)
        # bug fix: the file handle was previously left unclosed
        with open(abspath, encoding='utf8') as pagefile:
            text = pagefile.read()
        n_of_subscribers = scrape_n_return_number_of_subscribers_from_channels_pagetext(
            text)
        if n_of_subscribers is None:
            continue
        subs = sam.YTDailySubscribersSA()
        subs.ytchannelid = ytchannelid
        subs.infodate = pdate
        subs.infotime = filetime
        subs.subscribers = n_of_subscribers
        session.add(subs)
        n_commits += 1
        print('n_commits', n_commits, 'committing', subs)
        session.commit()
    print('n_commits', n_commits, 'missing', count)
    session.close()
Esempio n. 16
0
def list_channels_scrapedates():
    """Print up to 3 daily-subscriber records for the 'plantão br' channel.

    Bug fix: the original returned without closing the session when the
    channel was not found; the session is now closed on every path.
    """
    sess = con.Session()
    likenname = '%plantão br%'
    ytchannel = fetcher.fetch_ytchannel_with_likenname(likenname, sess)
    print('list_channels_scrapedates ytchannel', ytchannel)
    if ytchannel is None:
        sess.close()
        return
    c = 0
    for subs in ytchannel.daily_subscribers:
        c += 1
        if c > 3:
            break
        print('\t', subs.infodate, subs.subscribers)
    sess.close()
Esempio n. 17
0
def get_abspaths_from_db():
    """Return the registered newsarticles base abspaths in lookup order."""
    session = saconn.Session()
    strkey = get_newsarticles_basefoldername_as_key()
    recs = session.query(TreeBaseAbsPath). \
      filter(TreeBaseAbsPath.app_tree_strkey == strkey). \
      order_by(TreeBaseAbsPath.lookup_order). \
      all()
    base_abspaths = [rec.abspath for rec in recs]
    session.close()
    return base_abspaths
Esempio n. 18
0
def process():
    """Scrape the videos page of a channel chosen via CLI args.

    Falls back to the channel matching DEFAULT_LIKENAME_FOR_CHANNEL_FIND
    when no channel is given on the command line.
    """
    ytchannel = dld1.get_channel_from_args()
    if ytchannel is None:
        sess = con.Session()
        name_filter = sam.YTChannelSA.nname.like(DEFAULT_LIKENAME_FOR_CHANNEL_FIND)
        ytchannel = sess.query(sam.YTChannelSA).filter(name_filter).first()
        sess.close()
    page = ytvpM.YtVideosPage(ytchannel.ytchannelid, ytchannel.nname)
    show(page)
Esempio n. 19
0
def get_newsarticles_base_abspath_from_db_lookup_strict(
        lookup_order=None, strict=True):
    """Return the newsarticles base abspath registered under lookup_order.

    Only a path that exists as a directory on this machine qualifies.
    When nothing qualifies: raise ValueError if strict, else return None.
    """
    session = saconn.Session()
    strkey = get_newsarticles_basefoldername_as_key()
    rec = session.query(TreeBaseAbsPath). \
      filter(TreeBaseAbsPath.app_tree_strkey == strkey). \
      filter(TreeBaseAbsPath.lookup_order == lookup_order). \
      first()
    base_abspath = None
    if rec and os.path.isdir(rec.abspath):
        base_abspath = rec.abspath
    session.close()
    if base_abspath is None and strict:
        error_msg = 'Error: The lookup_order (%s) does not exist in get_newsarticles_base_abspath_from_db_lookup_strict()' % str(
            lookup_order)
        raise ValueError(error_msg)
    return base_abspath
def look_up_empty_publishdatetime_values():
  """Delete YTVideoItemInfoSA rows (and their view rows) with NULL publishdatetime."""
  bool_deleted = False
  session = con.Session()
  vitems = session.query(sam.YTVideoItemInfoSA). \
    filter(sam.YTVideoItemInfoSA.publishdatetime == None). \
    all()
  n_of_deleted_rows = 0
  # bug fix: the count was previously printed without its value
  print('count YTVideoItemInfoSA.publishdatetime', len(vitems))
  for i, vitem in enumerate(vitems):
    print(i+1, 'nviews', vitem.vviewlist.count(), vitem.publishdatetime, vitem.published_time_ago, vitem.infodate, vitem.title, vitem.ytchannel.nname)
    bool_deleted = True
    # delete the child view rows first, then the item itself
    for e in vitem.vviewlist:
      n_of_deleted_rows += 1
      session.delete(e)
    n_of_deleted_rows += 1
    session.delete(vitem)
  if bool_deleted:
    print('Committing n_of_deleted_rows', n_of_deleted_rows)
    session.commit()
  session.close()
Esempio n. 21
0
def get_newsarticles_base_abspath_from_db(lookup_order=None):
    """Return the newsarticles base abspath for lookup_order (default 1).

    A non-int or missing lookup_order falls back to 1.  When the
    requested entry does not exist on disk, the first registered path
    that does exist (in lookup order) is returned; None when none exist.
    """
    if lookup_order is None or type(lookup_order) != int:
        lookup_order = 1
    base_abspath = get_newsarticles_base_abspath_from_db_lookup_strict(
        lookup_order, strict=False)
    if base_abspath is not None:
        return base_abspath
    # fall back: first registered path that exists on this machine
    session = saconn.Session()
    strkey = get_newsarticles_basefoldername_as_key()
    recs = session.query(TreeBaseAbsPath). \
      filter(TreeBaseAbsPath.app_tree_strkey == strkey). \
      order_by(TreeBaseAbsPath.lookup_order). \
      all()
    existing = (rec.abspath for rec in recs if os.path.isdir(rec.abspath))
    base_abspath = next(existing, None)
    session.close()
    return base_abspath
def get_channel_from_args(confirm_msg_to_interpolate=None):
    """Pick a channel from sys.argv[1] (a partial nname); optionally confirm.

    Returns None when no argument was given or the user declined the
    confirmation prompt; otherwise the matched YTChannelSA (may be None
    if nothing matched and no confirmation was requested).
    """
    if len(sys.argv) < 2:
        print('''
    ================ Missing Parameter ================ 
    Please, enter a nname or part of it to download a ytchannel.
    ================ ================  ================ 
    ''')
        input('Press any key and/or [ENTER] ')
        return None
    likename = sys.argv[1]
    session = con.Session()
    ytchannel = session.query(sam.YTChannelSA).filter(
        sam.YTChannelSA.nname.contains("%" + likename + "%")).first()
    session.close()
    if confirm_msg_to_interpolate is None:
        return ytchannel
    answer = input(confirm_msg_to_interpolate % ytchannel.nname)
    if answer in ('', 'Y', 'y'):
        return ytchannel
    return None
Esempio n. 23
0
def update_subscribers_scrape_for_date(refdate):
    """Re-scrape subscriber counts for refdate from the dated page files.

    For each (ytchannelid, file-mtime) pair found for refdate, scrape the
    channel's dated file and record the subscriber count; scrapes that
    return None are counted in the global g_nsubs_is_none.
    """
    global g_nsubs_is_none
    sess = con.Session()
    pairs = autof.find_ytchannelid_n_videopagefilemodifiedtimestamp_tuplelist_for_date(
        refdate)
    for ytchannelid, cdatetime in pairs:
        ytchannel = fetcher.fetch_ytchannel_with_ytchannelid(ytchannelid, sess)
        if ytchannel is None:
            continue
        dt = datetime.datetime.fromtimestamp(cdatetime)
        pdate = dtfs.convert_datetime_to_date(dt)
        n_of_subs = scrape_channel_from_its_datedfile_n_return_number_of_subscribers(
            ytchannel, pdate)
        if n_of_subs is None:
            g_nsubs_is_none += 1
            print(g_nsubs_is_none, 'n_of_subs is None')
            continue
        print(ytchannel.nname, refdate, n_of_subs, dt)
        update_subscribers(ytchannel, refdate, n_of_subs, dt, sess)
    sess.close()
Esempio n. 24
0
def list_ytchannel_videos_less_than_nmin(ytchannelid=None, nmin=None):
    """Print and list the ytvideoids of a channel's videos up to nmin minutes.

    Defaults: ytchannelid 'ueduardoamoreira'; nmin falls back to
    DEFAULT_SHORTIVIDEO_DURATION_CAP_IN_MIN when missing or when it is
    not an int and cannot be converted to one.

    Bug fix: the session was never closed; it is now closed before the
    function returns.
    """
    if ytchannelid is None:
        ytchannelid = 'ueduardoamoreira'
    if nmin is None:
        nmin = DEFAULT_SHORTIVIDEO_DURATION_CAP_IN_MIN
    if type(nmin) != int:
        try:
            nmin = int(nmin)
        except ValueError:
            nmin = DEFAULT_SHORTIVIDEO_DURATION_CAP_IN_MIN
    session = con.Session()
    nsec = nmin * 60
    ytchannel = session.query(sam.YTChannelSA).filter(
        sam.YTChannelSA.ytchannelid == ytchannelid).first()
    print(ytchannel)
    ytvideoids = []
    for i, vinfo in enumerate(ytchannel.vinfolist):
        if vinfo.duration_in_sec <= nsec:
            hms = dtfs.transform_duration_in_sec_into_hms(
                vinfo.duration_in_sec)
            print(i + 1, hms, vinfo.title)
            ytvideoids.append(vinfo.ytvideoid)
    for ytvideoid in ytvideoids:
        print(ytvideoid)
    session.close()
def get_all_ytchannelids():
    """Return the ytchannelids of all channels, ordered by nname."""
    session = saconn.Session()
    ytchannels = session.query(YTChannelSA).order_by(YTChannelSA.nname).all()
    ytchannelids = [channel.ytchannelid for channel in ytchannels]
    session.close()
    return ytchannelids
Esempio n. 26
0
def list_last_subs():
    """Print the latest subscriber data for every active channel."""
    session = con.Session()
    for ytchannel in fetcher.fetch_all_active_ytchannels_in_db(session):
        list_last_subscribers_for_ytchannel(ytchannel, session)
    session.close()