def get_hotness(cookie, f):
    # Scrape the Weibo "energy" hotness page and append one CSV row per card.
    def get_name(tag):
        return tag.find_all("h3")[0].text.encode("utf-8").strip()

    def get_likehood(tag):
        raw_likehood = tag.find_all("h4")[0].text
        return int(float(raw_likehood.split(":")[1][:-1]) * 10000)

    url = "http://energy.tv.weibo.cn/e/10173/index?display=0&retcode=6102"
    response = utils.request(url, "html", cookies=cookie)
    if response is None:
        return
    start_time = response.start_time.strftime("%Y%m%d,%H:%M:%S.%f")
    finish_time = response.finish_time.strftime("%Y%m%d,%H:%M:%S.%f")
    soup = BeautifulSoup(response.get_html(), "lxml")
    # Each "card25" block holds one name plus its hotness measures.
    for tag in soup.find_all("div", class_="card25"):
        measures = tag.find_all("span")
        name = get_name(tag)
        likehood = get_likehood(tag)
        mentioned = measures[0].text
        interaction = measures[1].text
        cheer_cards = measures[2].text
        print(utils.to_csv_line(start_time, finish_time, name, likehood,
                                mentioned, interaction, cheer_cards), file=f)
def get_followers(cookie, name, username, f):
    # Fetch a user's weibo.cn profile page and append the follower count as a CSV row.
    def retrive_followers_content(cookie, username):
        url = "https://weibo.cn/%s?display=0&retcode=6102" % username
        response = utils.request(url, "html", cookies=cookie)
        return response

    def extract_follower_from_content(content):
        # Pull the numeric follower count out of the profile's stats bar.
        selector = etree.HTML(content)
        str_gz = selector.xpath("//div[@class='tip2']/a/text()")[1]
        pattern = r"\d+\.?\d*"
        guid = re.findall(pattern, str_gz, re.M)
        followers = int(guid[0])
        return followers

    response = retrive_followers_content(cookie, username)
    if response is None:
        return
    start_time = response.start_time.strftime("%Y%m%d,%H:%M:%S.%f")
    finish_time = response.finish_time.strftime("%Y%m%d,%H:%M:%S.%f")
    followers = extract_follower_from_content(response.get_html())
    print(utils.to_csv_line(start_time, finish_time, name, username, followers), file=f)
def print_header(f):
    print(utils.to_csv_line("start_date", "start_time", "finish_date", "finish_time",
                            "name", "gift", "vip_gift"), file=f)


if __name__ == "__main__":
    utils.log_print("[** LOG **] Run vote with")
    try:
        # Fetch the current gift/vote counts from the iqiyi vote API and
        # append them to today's CSV file.
        url = "http://vote.i.iqiyi.com/eagle/outer/get_votes?uid=null&vids=0536210296010472&t=1518343644386"
        response = utils.request(url, "json")
        data = response.get_json_data()
        options = get_options(data)
        time = datetime.date.today().strftime("%Y%m%d")
        filename = "%s_gift_counts.csv" % time
        is_file = os.path.isfile(filename)
        with open(filename, "a") as f:
            if not is_file:
                print_header(f)
            for option in options:
                print(utils.to_csv_line(*extract_option(option)), file=f)
        utils.log_print("[** LOG **] Succeed running vote with")
        print(filename)
    except Exception:
        utils.log_print("[** ERROR LOG **] Failed running vote with")
def print_hotness_header(f):
    print(utils.to_csv_line("start_date", "start_time", "finish_date", "finish_time",
                            "name", "likehood", "mentioned", "interaction", "cheer_card"), file=f)
def print_follower_count_header(f):
    print(utils.to_csv_line("start_date", "start_time", "finish_date", "finish_time",
                            "name", "username", "follower_count"), file=f)
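# A minimal usage sketch, not part of the original scripts: it assumes a valid
# logged-in weibo.cn cookie accepted by utils.request via `cookies=`, and that
# `os` and `datetime` are imported at module level. The cookie value and the
# (name, username) pair below are placeholders.
#
#   cookie = {"SUB": "<session token>"}
#   today = datetime.date.today().strftime("%Y%m%d")
#   filename = "%s_hotness.csv" % today
#   new_file = not os.path.isfile(filename)
#   with open(filename, "a") as f:
#       if new_file:
#           print_hotness_header(f)
#       get_hotness(cookie, f)
#   with open("%s_followers.csv" % today, "a") as f:
#       print_follower_count_header(f)
#       get_followers(cookie, "some name", "some_username", f)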
def collect_entity_information(table_file, query_file, output_folder, cont=False):
    entity_information_file = output_folder + '/entities_to_information.csv'
    dict_fields = [
        'name', 'inlinks', 'outlinks', 'categories', 'page_views',
        'nr_of_tables', 'nr_of_words'
    ]

    entities_existing = []
    if cont:
        # Resume mode: skip entities already written in a previous run and
        # reload the previously persisted entity dictionaries.
        with open(entity_information_file, 'r') as f:
            for line in f.readlines():
                d = from_csv_line(line, dict_fields)
                entities_existing.append(d['name'])
        with open('../dictionaries/table_to_entities.json', 'r') as f:
            table_entities = json.loads(f.read())
        with open('../dictionaries/query_to_entities.json', 'r') as f:
            query_entities = json.loads(f.read())
        entities_list = []
        for k, v in table_entities.items():
            entities_list += v
        for k, v in query_entities.items():
            entities_list += v
        entities = sorted(list(set(entities_list)))
    else:
        # Fresh run: extract the entity sets from the tables and queries and
        # persist the lookup dictionaries for later resumption.
        table_entities, table_to_entities = find_all_entities_in_tables(table_file)
        query_entities, query_to_entities = find_all_entities_in_queries(query_file)
        entities = sorted(list(set(table_entities + query_entities)))
        write_dictionary_to_file(table_to_entities, output_folder + '/table_to_entities.json')
        write_dictionary_to_file(query_to_entities, output_folder + '/query_to_entities.json')

    for i, entity in enumerate(entities):
        if entity not in entities_existing:
            page = wiki.page(entity)
            if page.exists():
                # Collect link, category, page-view and table/word statistics
                # for this entity and append them to the CSV.
                new_entity = {}
                new_entity[dict_fields[0]] = entity
                new_entity[dict_fields[1]] = list(page.backlinks.keys())
                new_entity[dict_fields[2]] = list(page.links.keys())
                new_entity[dict_fields[3]] = list(page.categories.keys())
                new_entity[dict_fields[4]] = average_page_view(entity)
                new_entity[dict_fields[5]], new_entity[dict_fields[6]] = nr_of_tables_and_words(page)
                with open(entity_information_file, 'a') as f:
                    f.write(to_csv_line(new_entity, dict_fields))
            else:
                print(f'Page {entity} does not exist')
        else:
            print(f'Entity {i} - {entity} already existed.')
        if i % 200 == 0:
            print(f'---- Wrote {i} / {len(entities)} to file.')
    print('Finished retrieving all entities!')
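# Hypothetical invocation, shown as an assumption rather than taken from the
# original code: the expected formats of table_file and query_file are whatever
# find_all_entities_in_tables and find_all_entities_in_queries parse, and `wiki`
# is presumed to be a Wikipedia API client initialised at module level.
#
#   collect_entity_information('tables.json', 'queries.txt', '../dictionaries')
#   collect_entity_information('tables.json', 'queries.txt', '../dictionaries', cont=True)  # resume a run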