# Script (collapsed onto one line by formatting loss): probes the YouTube
# maxres thumbnail URL for every video in a collection with an HTTP HEAD
# request and writes a tab-separated quality flag per video_id
# (1 = thumbnail exists / HTTP 200, 0 = anything else), sleeping 0.1s
# between requests to throttle.
# NOTE(review): this line is TRUNCATED — the trailing `if ii and ii % 100 == 0:`
# has no body (presumably periodic progress logging — confirm against the
# original file), and `sys.argv` is used without a visible `import sys`.
# Expected argv: [0] collection_path, [1] collection_header_path,
# [2] quality_score_path (output).
import logging import requests import configs from core import common from time import sleep if __name__ == '__main__': configs.setting_logger() argv = sys.argv[1:] collection_path = argv[0] collection_header_path = argv[1] quality_score_path = argv[2] corpus = common.load_collection(collection_header_path, collection_path, configs.ENCODE_DECODE) with open(quality_score_path, 'w', encoding=configs.ENCODE_DECODE) as wf: for ii, cc in enumerate(corpus): v_id = cc['video_id'] img = 'https://img.youtube.com/vi/' + cc[ 'video_id'] + '/maxresdefault.jpg' rr = requests.head(img) if not rr.status_code == 200: logging.info('{}\t{}\t{}'.format(v_id, rr.status_code, img)) wf.write('{}\t{}'.format(v_id, 0)) else: wf.write('{}\t{}'.format(v_id, 1)) wf.write('\n') sleep(0.1) if ii and ii % 100 == 0:
# Script: builds a title-token index file from a token collection.
# For every record it emits one tab-separated line:
#   <video_id> \t <term1>A0.0 \t <term2>A0.0 ...
# where each term is the part of a title token before its first 'A'
# separator, re-tagged with a fixed 0.0 weight.
# Expected argv: [0] tokens_path, [1] tokens_header_path,
# [2] indices_path (output).
import sys

import configs
from core.common import load_collection

if __name__ == '__main__':
    configs.setting_logger()

    args = sys.argv[1:]
    tokens_path = args[0]
    tokens_header_path = args[1]
    indices_path = args[2]

    corpus = load_collection(tokens_header_path, tokens_path,
                             encoding=configs.ENCODE_DECODE)

    with open(indices_path, 'w', encoding=configs.ENCODE_DECODE) as out_file:
        for record in corpus:
            # Start each output row with the document id, then one
            # zero-weighted entry per title token.
            row = [record['video_id']]
            for token in record['title'].split():
                row.append('{}A0.0'.format(token.split('A')[0]))
            out_file.write('\t'.join(row))
            out_file.write('\n')
# Fragment (collapsed onto one line by formatting loss): tail of a
# `_make_documents` helper (appends the joined token list and returns the
# accumulated documents) followed by a script that groups the corpus by
# 'channel_id', fits a per-channel TfidfVectorizer over the group's
# documents, and iterates videos that have captions, splitting
# title+description tokens into (term, freq) pairs on the 'A' separator.
# NOTE(review): this line is TRUNCATED at both ends — the start of
# `_make_documents` and everything after `freq = int(freq)` (presumably the
# TF-IDF score computation and the quality-score write) are missing; confirm
# against the original file before editing. Relies on `_grouping_corpus`,
# `common`, `configs`, `TfidfVectorizer` and `sys` imported above this view.
# Expected argv: [0] tokens_path, [1] tokens_header_path,
# [2] quality_score_path (output).
if doc: _documents.append(' '.join(doc)) return _documents if __name__ == '__main__': configs.setting_logger() argv = sys.argv[1:] tokens_path = argv[0] tokens_header_path = argv[1] quality_score_path = argv[2] corpus = common.load_collection(tokens_header_path, tokens_path, configs.ENCODE_DECODE) g_video = _grouping_corpus(corpus, 'channel_id') with open(quality_score_path, 'w', encoding=configs.ENCODE_DECODE) as wf: for values in g_video.values(): documents = _make_documents(values) model = TfidfVectorizer( tokenizer=lambda x: x.split()).fit(documents) for vv in values: if not vv['caption']: continue title_desc = list() for tt in vv['title'].split() + vv['description'].split(): term, freq = tt.split('A') freq = int(freq)
# Script fragment (collapsed onto one line by formatting loss): loads a
# collection plus two expanded-index files (title and description), and for
# each video collects the keyword part (text before the first 'A') of each
# expanded index entry, writing an appended collection file.
# NOTE(review): this line is TRUNCATED — it ends on the header of the
# desc_keywords loop (`for ii in desc_indices.get(v_id, []):`) with no body,
# and the write of the appended record and of
# `appended_collection_header_path` are not visible; confirm against the
# original file. Relies on `_load_expanded_indices`, `common`, `configs`
# and `sys` defined/imported above this view.
# Expected argv: [0] collection_path, [1] collection_header_path,
# [2] title_indices_path, [3] desc_indices_path,
# [4] appended_collection_path (output), [5] appended_collection_header_path.
if __name__ == '__main__': configs.setting_logger() argv = sys.argv[1:] collection_path = argv[0] collection_header_path = argv[1] title_indices_path = argv[2] desc_indices_path = argv[3] appended_collection_path = argv[4] appended_collection_header_path = argv[5] fields = common.load_fields(collection_header_path) collection = common.load_collection(collection_header_path, collection_path, encoding=configs.ENCODE_DECODE) title_indices = _load_expanded_indices(title_indices_path) desc_indices = _load_expanded_indices(desc_indices_path) with open(appended_collection_path, 'w', encoding=configs.ENCODE_DECODE) as wf: for cc in collection: v_id = cc['video_id'] title_keywords = list() for ii in title_indices.get(v_id, []): keyword = ii.split('A')[0] title_keywords.append(keyword) desc_keywords = list() for ii in desc_indices.get(v_id, []):