Example #1
from bs4 import BeautifulSoup

import utils  # project-local module providing request() and to_csv_line()


def get_hotness(cookie, f):
    """Scrape the Weibo hotness ranking page and append one CSV row per card to f."""
    def get_name(tag):
        return tag.find_all("h3")[0].text.encode("utf-8").strip()

    def get_likehood(tag):
        # The <h4> text has the form "<label>:<number><suffix>"; drop the trailing
        # character and scale the number to an integer.
        raw_likehood = tag.find_all("h4")[0].text
        return int(float(raw_likehood.split(":")[1][:-1]) * 10000)

    url = "http://energy.tv.weibo.cn/e/10173/index?display=0&retcode=6102"
    response = utils.request(url, "html", cookies=cookie)
    if response is None:
        return
    # "%Y%m%d,%H:%M:%S.%f" renders "date,time", filling two CSV columns
    # (start_date/start_time and finish_date/finish_time in the header).
    start_time = response.start_time.strftime("%Y%m%d,%H:%M:%S.%f")
    finish_time = response.finish_time.strftime("%Y%m%d,%H:%M:%S.%f")

    soup = BeautifulSoup(response.get_html(), "lxml")
    for tag in soup.find_all("div", class_="card25"):
        measures = tag.find_all("span")

        name = get_name(tag)
        likehood = get_likehood(tag)
        mentioned = measures[0].text
        interaction = measures[1].text
        cheer_cards = measures[2].text
        print(utils.to_csv_line(start_time, finish_time, name, likehood,
                                mentioned, interaction, cheer_cards),
              file=f)
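These snippets lean on a project-local utils module that is not shown. The calls above assume a to_csv_line helper and a request wrapper whose result exposes start_time, finish_time, get_html and (in Example #4) get_json_data. Below is a minimal sketch of what such helpers could look like; the names and behaviour are inferred from the call sites, not taken from the real module:

import datetime

import requests


def to_csv_line(*fields):
    # Naive CSV join; the real helper may quote or escape fields.
    return ",".join(str(field) for field in fields)


class TimedResponse:
    # Bundles an HTTP response with the request's start/finish times, mirroring
    # the attributes the examples read.
    def __init__(self, response, start_time, finish_time):
        self._response = response
        self.start_time = start_time
        self.finish_time = finish_time

    def get_html(self):
        return self._response.text

    def get_json_data(self):
        # The real helper may unwrap a nested "data" field.
        return self._response.json()


def request(url, kind, cookies=None):
    # kind ("html" or "json") is accepted only to match the call sites above.
    start = datetime.datetime.now()
    try:
        resp = requests.get(url, cookies=cookies, timeout=10)
    except requests.RequestException:
        return None
    return TimedResponse(resp, start, datetime.datetime.now())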
Example #2
import re

from lxml import etree

import utils  # project-local module providing request() and to_csv_line()


def get_followers(cookie, name, username, f):
    def retrive_followers_content(cookie, username):
        url = "https://weibo.cn/%s?display=0&retcode=6102" % username
        response = utils.request(url, "html", cookies=cookie)
        return response

    def extract_follower_from_content(content):
        selector = etree.HTML(content)
        # The follower count sits in the second link of the "tip2" block,
        # whose text ends in a bracketed number.
        str_gz = selector.xpath("//div[@class='tip2']/a/text()")[1]
        # Pull the first (possibly decimal) number out of that text.
        pattern = r"\d+\.?\d*"
        guid = re.findall(pattern, str_gz, re.M)
        followers = int(guid[0])
        return followers

    response = retrive_followers_content(cookie, username)
    if response is None:
        return
    start_time = response.start_time.strftime("%Y%m%d,%H:%M:%S.%f")
    finish_time = response.finish_time.strftime("%Y%m%d,%H:%M:%S.%f")
    followers = extract_follower_from_content(response.get_html())
    print(utils.to_csv_line(start_time, finish_time, name, username,
                            followers),
          file=f)
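For reference, the regular expression in extract_follower_from_content just pulls the first number out of the link text. A standalone illustration with a made-up sample string (the real page text may differ):

import re

sample = "粉丝[1024]"  # hypothetical link text
numbers = re.findall(r"\d+\.?\d*", sample)
print(int(numbers[0]))  # -> 1024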
Example #3
def print_header(f):
    print(utils.to_csv_line("start_date", "start_time", "finish_date",
                            "finish_time", "name", "gift", "vip_gift"),
          file=f)
Example #4

import datetime
import os

import utils  # project-local module providing request(), to_csv_line() and log_print()


def print_header(f):
    print(utils.to_csv_line("start_date", "start_time", "finish_date",
                            "finish_time", "name", "gift", "vip_gift"),
          file=f)


if __name__ == "__main__":

    utils.log_print("[** LOG **] Run vote with")
    try:
        url = "http://vote.i.iqiyi.com/eagle/outer/get_votes?uid=null&vids=0536210296010472&t=1518343644386"
        response = utils.request(url, "json")
        data = response.get_json_data()

        # get_options() and extract_option() are project helpers defined elsewhere.
        options = get_options(data)

        time = datetime.date.today().strftime("%Y%m%d")
        filename = "%s_gift_counts.csv" % time
        is_file = os.path.isfile(filename)
        with open(filename, "a") as f:
            if not is_file:
                # Write the header only when the daily file is first created.
                print_header(f)
            for option in options:
                print(utils.to_csv_line(*extract_option(option)), file=f)
        utils.log_print("[** LOG **] Succeed running vote with")
        print(filename)
    except Exception:
        utils.log_print("[** ERROR LOG **] Failed running vote with")
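The t query parameter in the hard-coded vote URL is a millisecond epoch timestamp. If it ever needed to be refreshed per run instead of being fixed, it could be rebuilt like this; the vids value is simply copied from the URL above:

import time

t = int(time.time() * 1000)  # millisecond timestamp, same shape as t=1518343644386
url = ("http://vote.i.iqiyi.com/eagle/outer/get_votes"
       "?uid=null&vids=0536210296010472&t=%d" % t)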
Example #5
def print_hotness_header(f):
    print(utils.to_csv_line("start_date", "start_time", "finish_date",
                            "finish_time", "name", "likehood", "mentioned",
                            "interaction", "cheer_card"),
          file=f)
Example #6
def print_follower_count_header(f):
    print(utils.to_csv_line("start_date", "start_time", "finish_date",
                            "finish_time", "name", "username",
                            "follower_count"),
          file=f)
Example #7
import json


def collect_entity_information(table_file,
                               query_file,
                               output_folder,
                               cont=False):
    # `wiki` is a module-level Wikipedia API client; from_csv_line, to_csv_line,
    # write_dictionary_to_file, find_all_entities_in_tables/queries, average_page_view
    # and nr_of_tables_and_words are project helpers defined elsewhere.
    entity_information_file = output_folder + '/entities_to_information.csv'
    dict_fields = [
        'name', 'inlinks', 'outlinks', 'categories', 'page_views',
        'nr_of_tables', 'nr_of_words'
    ]

    entities_existing = []
    if cont:
        with open(entity_information_file, 'r') as f:
            for line in f.readlines():
                d = from_csv_line(line, dict_fields)
                entities_existing.append(d['name'])

        with open('../dictionaries/table_to_entities.json', 'r') as f:
            table_entities = json.loads(f.read())

        with open('../dictionaries/query_to_entities.json', 'r') as f:
            query_entities = json.loads(f.read())

        entities_list = []
        for k, v in table_entities.items():
            entities_list += v

        for k, v in query_entities.items():
            entities_list += v

        entities = sorted(list(set(entities_list)))
    else:
        table_entities, table_to_entities = find_all_entities_in_tables(
            table_file)
        query_entities, query_to_entities = find_all_entities_in_queries(
            query_file)
        entities = sorted(list(set(table_entities + query_entities)))

        write_dictionary_to_file(table_to_entities,
                                 output_folder + '/table_to_entities.json')
        write_dictionary_to_file(query_to_entities,
                                 output_folder + '/query_to_entities.json')

    # Fetch each entity's Wikipedia page and append its statistics to the CSV,
    # skipping entities already collected when cont=True.
    for i, entity in enumerate(entities):
        if entity not in entities_existing:
            page = wiki.page(entity)

            if page.exists():
                # Build one record in dict_fields order: name, inlinks, outlinks,
                # categories, page_views, nr_of_tables, nr_of_words.
                new_entity = {}
                new_entity[dict_fields[0]] = entity
                new_entity[dict_fields[1]] = list(page.backlinks.keys())
                new_entity[dict_fields[2]] = list(page.links.keys())
                new_entity[dict_fields[3]] = list(page.categories.keys())
                new_entity[dict_fields[4]] = average_page_view(entity)
                (new_entity[dict_fields[5]],
                 new_entity[dict_fields[6]]) = nr_of_tables_and_words(page)
                with open(entity_information_file, 'a') as f:
                    f.write(to_csv_line(new_entity, dict_fields))
            else:
                print(f'Page {entity} does not exist')
        else:
            print(f'Entity {i} - {entity} already existed.')
        if i % 200 == 0:
            print(f'---- Wrote {i} / {len(entities)} to file.')

    print('Finished retrieving all entities!')
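collect_entity_information writes each record with to_csv_line(new_entity, dict_fields) and, when cont=True, reads rows back with from_csv_line(line, dict_fields). A minimal sketch of that pair, assuming plain comma-separated values with no quoting (list-valued fields such as inlinks would need real CSV escaping in practice):

def to_csv_line(d, fields):
    # Serialize the dict's values in field order into one CSV row.
    return ",".join(str(d.get(field, "")) for field in fields) + "\n"


def from_csv_line(line, fields):
    # Split a CSV row back into a dict keyed by the field names.
    values = line.rstrip("\n").split(",")
    return dict(zip(fields, values))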