Beispiel #1
0
    def deepRun(self):
        """Request the subscription list of every channel not yet recorded.

        Walks the list returned by ``Integrate.readchannelList()``, prints a
        percentage progress line for each entry, and calls
        ``getSubscriptList()`` for every key that is absent from the list
        returned by ``Integrate.getsuccessList()``.  Failures never abort the
        loop: only 404 and 304 errors are reported, everything else is
        silently skipped.  ``Integrate.TotalchannelList()`` runs at the end.
        """
        print('DeepRunStart')
        fetcher = SubscriptList(datalist, SavePath)
        datalist['exsistFile'] = SavePath['channel_ListPath']
        channels = Integrate.readchannelList()
        already_done = Integrate.getsuccessList()
        for position, channel_id in enumerate(channels, start=1):
            print('DeepRun_Status: %.2f' % (position / len(channels) * 100))
            try:
                if channel_id not in already_done:
                    datalist['channelID'] = channel_id
                    fetcher.getSubscriptList()
            except Exception as exc:
                # Only the first 20 characters of the message are inspected
                # for a status code.
                head = str(exc)[:20]
                if '404' in head:
                    print('Error_404未找到位置(檢查網路問題)')
                elif '304' in head:
                    print('Error_304')
                # 403 and any other error are intentionally not reported.
        Integrate.TotalchannelList()
        print('DeepRunEnd')
    def getchannelList(self):
        """Fetch subscription lists for all known channels in random order.

        Keys already present in the list returned by
        ``Integrate.getsuccessList()`` are skipped.  Whenever more than
        ``datalist['RefreshTime']`` seconds have passed since the last report,
        that list is reloaded and ``Basic.runstatus`` is called with the
        current counters.  404/304 errors are printed; other errors are
        ignored.  Finishes by printing the total time and calling
        ``Integrate.TotalchannelList()``.
        """
        datalist['exsistFile'] = SavePath['channel_ListPath']
        already_done = Integrate.getsuccessList()
        fetcher = SubscriptList(datalist, SavePath)
        channels = Integrate.readchannelList()
        random.shuffle(channels)

        requests_made = 0
        idle_ticks = 0
        seen_at_last_report = 0
        total = len(channels)

        window_start = time.time()
        started = time.time()
        for seen, channel_id in enumerate(channels, start=1):
            try:
                if channel_id not in already_done:
                    datalist['channelID'] = channel_id
                    fetcher.getSubscriptList()
                    requests_made += 1
            except Exception as exc:
                # Only the first 20 characters of the message are inspected.
                head = str(exc)[:20]
                if '404' in head:
                    print('Error_404未找到位置(檢查網路問題)')
                elif '304' in head:
                    print('Error_304')
                # 403 and any other error are intentionally not reported.

            now = time.time()
            if (now - window_start) <= datalist['RefreshTime']:
                idle_ticks += 1
                continue

            # Refresh window elapsed: reload the skip list and report status.
            already_done = Integrate.getsuccessList()
            Basic.runstatus(self, seen, requests_made, window_start, now,
                            idle_ticks, started, seen_at_last_report, total)
            requests_made = 0
            idle_ticks = 0
            seen_at_last_report = seen
            window_start = time.time()

        finished = time.time()
        print('GetChannel_List_Done')
        print('ProcessTimeCost : %.4f' % (finished - started))
        Integrate.TotalchannelList()
    def getFiltedVideolid(self):
        """Fetch the video list of every filtered playlist, in random order.

        Playlist keys come from ``Integrate.readFiltedPlayList()``; keys
        already present in the list returned by
        ``Integrate.getsuccessList()`` are skipped.  Whenever more than
        ``datalist['RefreshTime']`` seconds have passed since the last report,
        the skip list is reloaded and ``Basic.runstatus`` is called with the
        current counters.  Request failures are printed and never abort the
        loop (403 errors are deliberately silent).
        """
        getvideolist = VideoList(datalist, SavePath)
        datalist['exsistFile'] = SavePath['playlist_ListPath']
        print('Count:' + str(len(Integrate.readFiltedPlayList())))
        RunCount = 0
        RequestCount = 0
        RefreshCount = 0
        BeforeRunCount = 0
        exisitList = Integrate.getsuccessList()
        FiltedPlayList = Integrate.readFiltedPlayList()
        listcount = len(FiltedPlayList)

        # Re-seed from the OS entropy source / system time.  Passing a
        # datetime object to random.seed() raises TypeError on Python 3.11+.
        random.seed()
        random.shuffle(FiltedPlayList)
        ProcessStart = time.time()
        Tstart = time.time()

        for key in FiltedPlayList:
            RunCount += 1
            if key not in exisitList:
                try:
                    datalist['playlistId'] = key
                    getvideolist.getvideolist()
                    RequestCount += 1
                except Exception as identifier:
                    # Only the first 20 characters of the message are
                    # inspected for a status code.
                    head = str(identifier)[:20]
                    if '404' in head:
                        print('Error_404無法找到相關影片資訊')
                    elif '304' in head:
                        print('Error_304')
                    elif '403' in head:
                        # 403 errors are intentionally not reported.
                        pass
                    else:
                        print(identifier)

            Tend = time.time()

            if (Tend - Tstart) > datalist['RefreshTime']:
                # Refresh window elapsed: reload the skip list, report status
                # and start a new window.
                exisitList = Integrate.getsuccessList()
                Basic.runstatus(self, RunCount, RequestCount, Tstart, Tend,
                                RefreshCount, ProcessStart, BeforeRunCount,
                                listcount)
                RequestCount = 0
                RefreshCount = 0
                BeforeRunCount = RunCount
                Tstart = time.time()
            else:
                RefreshCount += 1

        ProcessEnd = time.time()
        print('GetVideoList_Done')
        print('ProcessTimeCost : %.4f /s' % (ProcessEnd - ProcessStart))
    def analysechannel(self, channelID):
        """Extract up to 200 keyword tags from a channel's comments.

        Points jieba at the bundled IDF file and dictionary, reads the
        comment text through ``Integrate.readchannelcomment`` and passes the
        extracted tags through ``Jiebatext.stopword`` before returning them.
        """
        analyser = OldVersion.jieba.analyse
        analyser.set_idf_path('./jieba/analyse/idfbig.txt')
        jieba.load_userdict('./jieba/dictbig.txt')
        jieba.set_dictionary('./jieba/dictbig.txt')
        # The instance is discarded; only the constructor call is kept.
        Integrate(self.datalist, self.SavePath)

        comment_text = Integrate.readchannelcomment(self, channelID)
        keywords = analyser.extract_tags(comment_text,
                                         topK=200,
                                         withWeight=False,
                                         allowPOS=())
        return Jiebatext.stopword(self, keywords)
Beispiel #5
0
    def videoinfo_instdb(self, ChannelID):
        """Insert videos missing from the ``ChannelVideoList`` table.

        Reads the channel -> video-id mapping from
        ``Integrate.readchannelPlayList`` and inserts, one row at a time,
        every ``(channelid, videoid)`` pair whose video id is not yet stored
        in the database.  A failed insert is printed and does not stop the
        run.

        The previous version collected the characters of each channel id
        instead of that channel's video ids, and then inserted only the ids
        that were *not* in the "missing" list, so the database filter had no
        real effect.

        ``ChannelID`` is accepted for interface compatibility but is not used.
        """
        loadvideolist = Integrate.readchannelPlayList(self)

        # NOTE(review): twelve columns are listed, but each inserted row below
        # carries only two values (channel id, video id) -- confirm that
        # SquenVideoListInst copes with this mismatch.
        Field = [
            'SquenceHash', 'ChannelID', 'VideoID', 'CategoryID',
            'VideoDescription', 'liveBroadcastContent', 'Titel', 'PublishedAt',
            'tags', 'Thumbnails', 'RequestDate', 'UpdateDate'
        ]

        db_videolist = SQLFunction.raw_Search(
            self, 'select VideoID From ChannelVideoList')
        # A set gives O(1) membership tests for the "already stored" check.
        stored_ids = {row[0] for row in db_videolist}

        for channelid in loadvideolist:
            for videoid in loadvideolist[channelid]:
                if videoid in stored_ids:
                    continue
                value = [channelid, videoid]
                try:
                    SQLFunction(self.DBsetting)
                    SQLFunction.SquenVideoListInst(self,
                                                   'ChannelVideoList',
                                                   Field, value)
                except Exception as identifier:
                    # Best effort: report the failure and keep going.
                    print(identifier)
Beispiel #6
0
    def SquenVideoListInstDB(self):
        """Insert, row by row, the videos missing from ``ChannelVideoList``.

        Reads the channel -> video-id mapping from
        ``Integrate.readchannelPlayList`` and inserts every
        ``(channelid, videoid)`` pair whose video id is not yet stored in the
        database.  A failed insert is printed and does not stop the run.

        The previous version collected the characters of each channel id
        instead of that channel's video ids, and then inserted only the ids
        that were *not* in the "missing" list, so the database filter had no
        real effect.
        """
        loadvideolist = Integrate.readchannelPlayList(self)

        Field = [
            'ChannelID',
            'VideoID',
        ]

        db_videolist = SQLFunction.raw_Search(
            self, 'select VideoID From ChannelVideoList')
        # A set gives O(1) membership tests for the "already stored" check.
        stored_ids = {row[0] for row in db_videolist}

        for channelid in loadvideolist:
            for videoid in loadvideolist[channelid]:
                if videoid in stored_ids:
                    continue
                value = [channelid, videoid]
                try:
                    SQLFunction(self.DBsetting)
                    SQLFunction.SquenVideoListInst(self,
                                                   'ChannelVideoList',
                                                   Field, value)
                except Exception as identifier:
                    # Best effort: report the failure and keep going.
                    print(identifier)
    def getFiltedCommentTop(self):
        """Fetch comments for every known video id that is not yet recorded.

        ``Integrate.readchannelVideoID()`` is treated as a sequence of
        video-id groups: the groups are shuffled and then flattened.  Ids
        already present in the list returned by
        ``Integrate.getsuccessList()`` are skipped.  Whenever more than
        ``datalist['RefreshTime']`` seconds have passed since the last report,
        the skip list is reloaded and ``Basic.runstatus`` is called with the
        current counters.

        Errors raised by ``getcomment()`` are not caught here and propagate to
        the caller.
        """
        getcomment = CommentThreads(datalist, SavePath)
        datalist['id'] = Integrate.readchannelVideoID()
        datalist['exsistFile'] = SavePath['channelcomment_ListPath']
        exisitList = Integrate.getsuccessList()
        filtedvideoid = Integrate.readchannelVideoID()

        # Re-seed from the OS entropy source / system time.  Passing a
        # datetime object to random.seed() raises TypeError on Python 3.11+.
        random.seed()
        random.shuffle(filtedvideoid)
        RunCount = 0
        BeforeRunCount = 0
        RequestCount = 0
        RefreshCount = 0

        ProcessStart = time.time()
        Tstart = time.time()
        videolistID = [videoid for group in filtedvideoid for videoid in group]
        listcount = len(videolistID)
        print('VideoCount:' + str(listcount))

        for temp in videolistID:
            # Count every visited id (not only fetched ones) so that RunCount
            # can reach listcount, as in the other fetch loops.
            RunCount += 1
            if temp not in exisitList:
                datalist['id'] = temp
                getcomment.getcomment()
                RequestCount += 1

            Tend = time.time()
            if (Tend - Tstart) > datalist['RefreshTime']:
                # Refresh window elapsed: reload the skip list, report status
                # and start a new window.
                exisitList = Integrate.getsuccessList()
                Basic.runstatus(self, RunCount, RequestCount, Tstart, Tend,
                                RefreshCount, ProcessStart, BeforeRunCount,
                                listcount)
                RequestCount = 0
                RefreshCount = 0
                BeforeRunCount = RunCount
                Tstart = time.time()
            else:
                RefreshCount += 1
        ProcessEnd = time.time()
        print('GetVideoComment_Done')
        print('ProcessTimeCost : %.4f /s' % (ProcessEnd - ProcessStart))
    def jiebaweight(self, channelID):
        """Print and return the ``Jiebatext.analyse`` result for a channel.

        The channel's video-tag text is read, converted to ``str``, passed
        through ``Jiebatext.cut_foucus`` and then through
        ``Jiebatext.analyse`` with ``10`` as its last argument.
        """
        tag_text = str(Integrate.readchannelVideotag_traget(self, channelID))
        segmented = str(Jiebatext.cut_foucus(self, tag_text))
        analyze = Jiebatext.analyse(self, segmented, 10)
        print(analyze)
        return analyze
 def getFiltedVideolInfo(self):
     """Fetch video info for every known video id that is not yet recorded.

     ``Integrate.readchannelVideoID()`` is treated as a sequence of video-id
     groups, which are flattened and shuffled.  Ids already present in the
     list returned by ``Integrate.getsuccessList()`` are skipped.  Whenever
     more than ``datalist['RefreshTime']`` seconds have passed since the
     last report, the skip list is reloaded and ``Basic.runstatus`` is
     called with the current counters.

     Errors raised by ``getVideoInfo()`` are not caught here and propagate
     to the caller.
     """
     getvideoinfo = VideoInfo(datalist, SavePath)
     datalist['id'] = Integrate.readchannelVideoID()
     datalist['exsistFile'] = SavePath['video_InfoPath']
     videolist = datalist['id']
     RunCount = 0
     BeforeRunCount = 0
     RequestCount = 0
     RefreshCount = 0
     exisitList = Integrate.getsuccessList()
     ProcessStart = time.time()
     Tstart = time.time()
     videolistID = [videoid for group in videolist for videoid in group]
     listcount = len(videolistID)
     print('VideoCount:' + str(listcount))
     # Re-seed from the OS entropy source / system time.  Passing a datetime
     # object to random.seed() raises TypeError on Python 3.11+.
     random.seed()
     random.shuffle(videolistID)
     for videoid in videolistID:
         RunCount += 1
         if videoid not in exisitList:
             datalist['id'] = videoid
             getvideoinfo.getVideoInfo()
             RequestCount += 1

         Tend = time.time()
         if (Tend - Tstart) > datalist['RefreshTime']:
             # Refresh window elapsed: reload the skip list, report status
             # and start a new window.
             exisitList = Integrate.getsuccessList()
             Basic.runstatus(self, RunCount, RequestCount, Tstart, Tend,
                             RefreshCount, ProcessStart, BeforeRunCount,
                             listcount)
             RequestCount = 0
             RefreshCount = 0
             BeforeRunCount = RunCount
             Tstart = time.time()
         else:
             RefreshCount += 1
     ProcessEnd = time.time()
     print('GetVideoInfo_Done')
     print('ProcessTimeCost : %.4f /s' % (ProcessEnd - ProcessStart))
Beispiel #10
0
    def firstRun(self):
        """Make one subscription-list request, then rebuild the channel list.

        A failure of ``getSubscriptList()`` never propagates: a message is
        printed when ``404``, ``304`` or ``403`` appears in the first 20
        characters of the error text, and any other error is ignored.
        ``Integrate.TotalchannelList()`` is called in every case.
        """
        fetcher = SubscriptList(datalist, SavePath)
        try:
            fetcher.getSubscriptList()
        except Exception as exc:
            head = str(exc)[:20]
            # Checked in order; the first matching status code wins.
            known_errors = (
                ('404', 'Error_404未找到位置(檢查網路問題)'),
                ('304', 'Error_304'),
                ('403', 'Error_403無法存取資料'),
            )
            for code, message in known_errors:
                if code in head:
                    print(message)
                    break

        Integrate.TotalchannelList()
    def getchannelInfo(self):
        """Fetch channel info for every known channel, in random order.

        Rebuilds the channel list first, then requests info for each key that
        is absent from the list returned by ``Integrate.getsuccessList()``.
        Request errors are printed and do not stop the loop.  Whenever more
        than ``datalist['RefreshTime']`` seconds have passed since the last
        report, the skip list is reloaded and ``Basic.runstatus`` is called.
        Ends by calling ``Integrate.TotalchannelList()`` and
        ``Integrate.TotalchannelInfo()`` and printing the elapsed time.
        """
        Integrate.TotalchannelList()
        info_fetcher = ChannelInfo(datalist, SavePath)
        datalist['exsistFile'] = SavePath['channel_InfoPath']
        print('Count:' + str(len(Integrate.readchannelList())))

        requests_made = 0
        idle_ticks = 0
        seen_at_last_report = 0

        already_done = Integrate.getsuccessList()
        channels = Integrate.readchannelList()
        total = len(channels)
        random.shuffle(channels)
        started = time.time()
        window_start = time.time()

        for seen, channel_id in enumerate(channels, start=1):
            if channel_id not in already_done:
                try:
                    datalist['id'] = channel_id
                    info_fetcher.squent_getInfo()
                    requests_made += 1
                except Exception as exc:
                    print(exc)

            now = time.time()
            if (now - window_start) <= datalist['RefreshTime']:
                idle_ticks += 1
                continue

            # Refresh window elapsed: reload the skip list and report status.
            already_done = Integrate.getsuccessList()
            Basic.runstatus(self, seen, requests_made, window_start, now,
                            idle_ticks, started, seen_at_last_report, total)
            requests_made = 0
            idle_ticks = 0
            seen_at_last_report = seen
            window_start = time.time()

        Integrate.TotalchannelList()
        Integrate.TotalchannelInfo()
        finished = time.time()
        print('GetChannel_Info_Done')
        print('ProcessTimeCost : %.4f /s' % (finished - started))
    def TFweight(self, channelID):
        """Return the relative frequency of a channel's common video tags.

        Counts the items returned by ``Integrate.readchannelVideotag_traget``
        with ``nltk.FreqDist`` and keeps every item whose share of the total
        count is at least 1%.  The total and the resulting mapping are
        printed.

        Returns:
            dict mapping each kept item to its share, rounded to 4 decimals.
        """
        text = Integrate.readchannelVideotag_traget(self, channelID)
        wordcount = nltk.FreqDist(text)
        # Sum once up front instead of re-summing all counts for every key.
        total = sum(wordcount.values())
        print('TagCount:%.d' % total)

        wordwidth = {}
        for key, count in wordcount.items():
            share = count / total
            if share >= 0.01:
                wordwidth[key] = round(share, 4)
        print(wordwidth)

        return wordwidth
 def getchannelVideoInfo_traget(self, channelID):
     """Call ``Integrate.classifychannelVideoinfo_traget`` for ``channelID``.

     The result of that call is discarded; this method returns ``None``.
     """
     Integrate.classifychannelVideoinfo_traget(channelID)
    with open('config/FilterValue.json', 'r') as FilterValue:
        FilterValue = json.loads(FilterValue.read())
    with open('config/DBsetting.json', 'r') as DBsetting:
        DBsetting = json.loads(DBsetting.read())
    with open('config/category.json', 'r') as category:
        category = json.loads(category.read())
    print('config checked!')
except Exception as identifier:
    print(identifier)

# Build the objects used by the rest of the script.  Except for Focusmod,
# each assignment rebinds the class name to the new instance, so the class
# itself is no longer reachable under that name afterwards.
Process = Process(datalist, FilterValue, DBsetting, SavePath)
Process.initialize_check()
Focusmod = Focus(datalist, FilterValue, DBsetting, SavePath)
FileUpdate = FileUpdate(datalist, FilterValue, DBsetting, SavePath)

# These two take only the request settings and the save paths.
Integrate = Integrate(datalist, SavePath)
Single = Single(datalist, SavePath)

# Integrate.TotalchannelList()
# Integrate.TotalchannelInfo()
# Integrate.TotalVideoList()
# Integrate.TotalvideoInfo()
# Integrate.readchannelInfo()
# Integrate.readFiltedPlayList()
# Integrate.readchannelList()
# Integrate.readchannelVideoID()
# Integrate.readchannelVideotag_traget("UCO3r3FllELijijdytnR43NA")

# comment_text=Integrate.readchannelcomment('UC24h-JBUHXT5HXIK_9cWOmQ')
# Jiebatext=Jiebatext(datalist,SavePath)
# comment_text=open('../jiebacut/'+'mycreate'+'.txt', 'r' ,encoding = 'utf8')
Beispiel #15
0
    def BatchVideoListInstDB(self):
        """Batch-insert the videos missing from ``ChannelVideoList``.

        Reads the channel -> video-id mapping from
        ``Integrate.readchannelPlayList``, collects every
        ``(channelid, videoid)`` pair whose video id is not yet stored in the
        database, and passes the two parallel lists to
        ``SQLFunction.BatchVideoListInst`` in a single call.  While scanning,
        a progress report is printed once ``self.DBsetting['RefreshTime']``
        seconds have elapsed since the previous one.  A failure of the batch
        insert is printed, not raised.
        """
        loadvideolist = Integrate.readchannelPlayList(self)
        filevalue = [
            videoid for channelid in loadvideolist
            for videoid in loadvideolist[channelid]
        ]

        Field = [
            'ChannelID',
            'VideoID',
        ]
        db_videolist = SQLFunction.raw_Search(
            self, 'select VideoID From ChannelVideoList')
        # Kept as a set: it is probed once per video in the loop below, and a
        # list would make that scan quadratic.
        diffvideoid = set(filevalue).difference(
            row[0] for row in db_videolist)
        print('jsonID' + str(len(filevalue)))
        print('db_exsistID' + str(len(db_videolist)))
        print(len(diffvideoid))
        channelid_list = []
        videoid_list = []

        ProcessStart = time.time()
        Tstart = time.time()
        RunCount = 0
        BeforeRunCount = 0
        total = len(filevalue)

        for channelid in loadvideolist:
            for videoid in loadvideolist[channelid]:
                RunCount += 1
                if videoid in diffvideoid:
                    channelid_list.append(channelid)
                    videoid_list.append(videoid)

                Tend = time.time()
                refresh = self.DBsetting['RefreshTime']
                if (Tend - Tstart) >= refresh:
                    print('status %.3f ' % ((RunCount / total) * 100))
                    # Items scanned per second, using the nominal refresh
                    # interval as the window length.
                    speed = (RunCount - BeforeRunCount) / refresh
                    print('Speed: %.d /s' % speed)
                    elapsed = Tend - ProcessStart
                    print('NeedTime:%.2f /min  already: %d s' %
                          (((total / speed) - elapsed) / 60, elapsed))
                    print()
                    Tstart = time.time()
                    BeforeRunCount = RunCount

        print(len(channelid_list))
        print(len(videoid_list))
        try:
            SQLFunction(self.DBsetting)
            SQLFunction.BatchVideoListInst(self, 'ChannelVideoList', Field,
                                           channelid_list, videoid_list)
        except Exception as identifier:
            # Best effort: report the failure instead of raising.
            print(identifier)
    def train(self, channelID):
        """Build dictionary, TF-IDF and LSI models from a channel's tag text.

        Writes ``./<channelID>.dict``, ``./<channelID>.mm``,
        ``./<channelID>.lsi`` and ``./<channelID>.index`` into the current
        directory, prints the five best similarity scores and then the three
        best matching lines.  Returns ``None``.

        NOTE(review): ``cut`` is used in three different ways below -- as an
        iterable of documents, as a single document for ``doc2bow`` and as a
        file path for ``open``.  These cannot all hold at once; confirm what
        ``Jiebatext.cut_all`` returns.
        """
        text = Integrate.readchannelVideotag_traget(self, channelID)
        cut = Jiebatext.cut_all(self, str(text))

        # Read the stop-word file and flatten it to one space-separated string.
        with open("./jieba/stop_words.txt") as f:
            stop_word_content = f.readlines()
        stop_word_content = [x.strip() for x in stop_word_content]
        stop_word_content = " ".join(stop_word_content)

        dictionary = corpora.Dictionary(document.split() for document in cut)

        stoplist = set(stop_word_content.split())
        # NOTE(review): stop_ids is computed but never used afterwards, so the
        # stop words are not removed from the dictionary itself.
        stop_ids = [
            dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id
        ]

        dictionary.compactify()
        # Tokenised documents with stop words removed.
        texts = [[word for word in document.split() if word not in stoplist]
                 for document in cut]

        dictionary.save("./" + channelID + ".dict")
        corpus = [dictionary.doc2bow(text) for text in texts]
        corpora.MmCorpus.serialize("./" + channelID + ".mm", corpus)

        # Load the corpus back from disk.  The .dict file was saved just
        # above, so the else branch is presumably never reached.
        if (os.path.exists("./" + channelID + ".dict")):
            dictionary = corpora.Dictionary.load("./" + channelID + ".dict")
            corpus = corpora.MmCorpus("./" + channelID + ".mm")
            print("Used files generated from first tutorial")
        else:
            print("Please run first tutorial to generate data set")
        # Create the TF-IDF model.
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]

        # Create the LSI (latent semantic indexing) model.
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=99)
        corpus_lsi = lsi[corpus_tfidf]  # corpus projected into LSI space
        lsi.save("./" + channelID + ".lsi")
        # This reuses the .mm path written earlier, so the bag-of-words corpus
        # file is replaced by the LSI-projected corpus.
        corpora.MmCorpus.serialize("./" + channelID + ".mm", corpus_lsi)
        print("LSI topics:")
        lsi.print_topics(10)

        vec_bow = dictionary.doc2bow(cut)
        # Project this document with the LSI model built above.
        vec_lsi = lsi[vec_bow]
        #print(vec_lsi)

        # Build the similarity index.
        # NOTE(review): the index is built over corpus_tfidf but queried with
        # an LSI vector below -- confirm that this mismatch is intended.
        #index = similarities.MatrixSimilarity(lsi[corpus])
        #index = similarities.MatrixSimilarity(tfidf[corpus_tfidf])
        index = similarities.Similarity("./" + channelID + ".mm", corpus_tfidf,
                                        len(dictionary))
        index.save("./" + channelID + ".index")
        # Similarity scores, sorted from highest to lowest.
        sims = index[vec_lsi]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        print(sims[:5])

        # Print the three most similar entries.
        lyrics = []
        fp = open(cut)  # word-segmented lyrics; here cut is used as a path
        # Each stored entry is an (index, line) tuple produced by enumerate.
        for line in enumerate(fp):
            lyrics.append(line)
        fp.close()
        for lyric in sims[:3]:
            print("\n相似歌詞:", lyrics[lyric[0]])
            print("相似度:", lyric[1])