import datetime
import os
import regex
from concurrent.futures import as_completed

# read_file, time_stamp, echo, changeCookie and the *_dir constants are
# repo-level helpers assumed to be defined/imported elsewhere.


def clean_csv(av_id: int):
    ''' clean one raw history csv and write the `_new` version '''
    csv_path = os.path.join(history_dir, '{}.csv'.format(av_id))
    output_path = os.path.join(history_data_dir, '{}_new.csv'.format(av_id))
    csv = read_file(csv_path)
    last_time, last_view = csv[0].split(',')[:2]
    result = [csv[0]]
    last_time = time_stamp(last_time)
    last_view = int(last_view)
    empty_line = ','.join([' '] * (len(csv[0].split(',')) + 1))
    for line in csv[1:]:
        now_time, now_view = line.split(',')[:2]
        now_time = time_stamp(now_time)
        now_view = int(now_view)
        time_gap = now_time - last_time
        # drop rows where the view count goes backwards or jumps implausibly
        if now_view < last_view or now_view - last_view > 5000:
            continue
        # pad long gaps with empty rows, one per missing two-minute sample
        if abs(time_gap) > 150:
            for _ in range(int((time_gap - 30) // 120)):
                result.append(empty_line)
        if abs(time_gap) > 90:
            result.append(line)
            last_view, last_time = now_view, now_time
    with open(output_path, 'w') as f:
        f.write('\n'.join(result))
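# Hedged sketch, not part of the original module: clean_csv relies on the
# repo-level time_stamp() helper. One plausible implementation, assuming the
# csv stores '%Y-%m-%d %H:%M:%S' strings (the format is an assumption here):
def time_stamp_sketch(time_str: str) -> int:
    import time
    return int(time.mktime(time.strptime(time_str, '%Y-%m-%d %H:%M:%S')))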
def getTitleMap(self):
    ''' build slug<->title maps from the exported slug/title files '''
    slug = read_file('{}slug'.format(data_dir))
    title = read_file('{}title'.format(data_dir))
    self.title_map = {
        tempslug.split('"')[1]: title[num].split('"')[1]
        for num, tempslug in enumerate(slug)
    }
    title2slug = {
        title: slug for slug, title in self.title_map.items()
    }
    noemoji_title = {
        self.filter_emoji(title).replace('\u200d', ''): slug
        for slug, title in self.title_map.items()
    }
    self.title2slug = {**noemoji_title, **title2slug}
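# Hedged sketch: self.filter_emoji is used above but defined elsewhere in the
# repo. A minimal stand-in (the exact code-point ranges are an assumption)
# strips common emoji so emoji-bearing titles can still be matched:
import re

EMOJI_RE_SKETCH = re.compile('[\U0001F300-\U0001FAFF\u2600-\u27BF\uFE0F]')

def filter_emoji_sketch(text: str) -> str:
    return EMOJI_RE_SKETCH.sub('', text)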
def getZhihuView(self):
    ''' fetch zhihu article read counts and map them back to post slugs '''
    cookie = ''.join(read_file('{}cookie'.format(data_dir)))
    changeCookie(cookie)
    url = ''.join([
        self.ZHIHU_URL,
        'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=',
        datetime.datetime.now().strftime('%Y-%m-%d'),
        '&page_no='
    ])

    def parse_page(page_json: dict):
        ''' record id & read count for every recognizable article on a page '''
        for article in page_json['data']:
            zhihu_title = article['title']
            zhihu_id = int(article['url_token'])
            zhihu_count = int(article['read_count'])
            if zhihu_title in self.title2slug:
                temp_slug = self.title2slug[zhihu_title]
            elif zhihu_id in self.zhihu_id_map:
                temp_slug = self.zhihu_id_map[zhihu_id]
            else:
                echo('0|debug', zhihu_title)
                continue
            self.zhihu_id[temp_slug] = zhihu_id
            self.zhihu_views[temp_slug] = zhihu_count

    json = self.get_request('{}{}'.format(url, 1), 1, lambda i: not i)
    if not json:
        return
    if 'data' not in json:
        if 'code' in json:
            echo('0|warning', json)
            return
        echo(3, 'zhihu', json)
        return
    parse_page(json)
    # note: floor division skips any trailing partial page (count % 10 items)
    for page_no in range(2, json['count'] // 10 + 1):
        echo(1, 'zhihu', page_no)
        page = self.get_request('{}{}'.format(url, page_no), 1, lambda i: not i)
        echo(2, 'zhihu', page)
        if not page or 'data' not in page:
            continue
        parse_page(page)
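# Pagination note: the endpoint reports a total `count` with 10 articles per
# page, and the loop above walks pages 2..count // 10. A standalone sketch of
# the page math that also covers a trailing partial page (page size assumed):
import math

def zhihu_pages_sketch(total_count: int, page_size: int = 10) -> range:
    return range(2, math.ceil(total_count / page_size) + 1)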
def create_table(self, sql_path: str):
    """ create a table from the DDL stored at sql_path """
    if not os.path.exists(sql_path):
        echo(0, "Create Table {} error, file not found".format(sql_path))
        return False
    create_table_sql = "\n".join(read_file(sql_path))
    try:
        cursor = self.db.cursor()
        cursor.execute(create_table_sql)
        echo(2, "Create Table from {} Success!!!".format(sql_path))
        return True
    except Exception as e:
        echo(0, "Create Table from {} error".format(sql_path), e)
        return False
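# A self-contained sketch of the same pattern against sqlite3, for
# illustration only (the method above runs on whatever self.db is bound to;
# sqlite3 and the single-statement DDL file are assumptions here):
import sqlite3

def create_table_sketch(db_path: str, sql_path: str) -> bool:
    with open(sql_path) as f:
        ddl = f.read()
    conn = sqlite3.connect(db_path)
    try:
        conn.cursor().execute(ddl)
        conn.commit()
        return True
    except sqlite3.Error:
        return False
    finally:
        conn.close()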
def loadLocalView(self):
    ''' load page views exported from google analytics '''
    lines = read_file('{}google'.format(data_dir))[7:]  # skip the leading report header rows
    for line in lines:
        arr = line.split(',')
        slug = self.matchSlug(arr[0])
        if slug is None or slug not in self.title_map:
            continue
        print('{} {} {}'.format(slug, arr[1], arr[0]))
        if slug in self.local_views:
            self.local_views[slug] += int(arr[1])
        else:
            self.local_views[slug] = int(arr[1])
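# The if/else accumulation above is the classic counting pattern; an
# equivalent compact form, shown standalone for illustration:
from collections import defaultdict

def count_views_sketch(rows) -> dict:
    views = defaultdict(int)
    for title, count in rows:
        views[title] += int(count)
    return dict(views)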
def load_history_file(self, av_id: int, av_info: dict):
    ''' load one aid's cleaned history into the global history map '''
    data_path = '{}{}_new.csv'.format(history_data_dir, av_id)
    history_list = read_file(data_path)[:2880]  # at most 2880 samples (4 days of 2-min ticks)
    if not history_list:
        return
    created, title = av_info['created'], av_info['title']
    history_list = [ii.split(',') for ii in history_list]
    # bucket each row onto an even index of two-minute slots since creation
    time_map = {
        round((time_stamp(ii[0]) - created) / 120) * 2: ii
        for ii in history_list if ii[0] != ''
    }
    last_data = [0] * 8
    for ii in self.history_map.keys():
        if ii in time_map:
            self.history_map[ii][av_id] = time_map[ii]
            # carry shorter rows forward, padded with the previous sample's tail
            last_data = time_map[ii] + last_data[len(time_map[ii]):]
        else:
            self.history_map[ii][av_id] = last_data
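# The bucketing above maps each sample onto an even two-minute slot index:
# round((t - created) / 120) * 2. A tiny self-checking demo with made-up
# timestamps (values are illustrative only):
def bucket_demo():
    created = 1_600_000_000
    sample = created + 7 * 60                 # 7 minutes after creation
    bucket = round((sample - created) / 120) * 2
    assert bucket == 8                        # 3.5 slots rounds to 4 -> index 8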
def load_history_file(self, bv_id: str, bv_info: dict):
    """ load one bvid's cleaned history into the global history map """
    data_path = "{}{}_new.csv".format(history_data_dir, bv_id)
    history_list = read_file(data_path)[:3660]  # at most 3660 samples (~5 days of 2-min ticks)
    if not history_list:
        return
    created, title = bv_info["created"], bv_info["title"]
    history_list = [ii.split(",") for ii in history_list]
    # bucket each row onto an even index of two-minute slots since creation
    time_map = {
        round((time_stamp(ii[0]) - created) / 120) * 2: ii
        for ii in history_list if ii[0] != ""
    }
    last_data = [0] * 8
    for ii in self.history_map.keys():
        if ii in time_map:
            self.history_map[ii][bv_id] = time_map[ii]
            # carry shorter rows forward, padded with the previous sample's tail
            last_data = time_map[ii] + last_data[len(time_map[ii]):]
        else:
            self.history_map[ii][bv_id] = last_data
def load_article_local(self, file_path: str):
    ''' extract tpwd codes from a local article and decode them, retrying up to 5 rounds '''
    if file_path not in self.tpwds:
        tt = '||||'.join(read_file(file_path))
        tpwds = regex.findall(self.TPWD_REG, tt)
        self.tpwds[file_path] = tpwds
    else:
        tpwds = self.tpwds[file_path]
    if file_path not in self.tpwd_map:
        self.tpwd_map[file_path] = {}
    retry = 0
    while len(self.tpwd_map[file_path]) < len(tpwds) and retry < 5:
        thread_list = [ii for ii in tpwds if ii not in self.tpwd_map[file_path]]
        echo(1, file_path, 'tpwds len:', len(tpwds), 'need load', len(thread_list))
        thread_list = [
            self.tpwd_exec.submit(self.decoder_tpwd_once, file_path, ii, 1)
            for ii in thread_list
        ]
        list(as_completed(thread_list))
        retry += 1
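# The retry loop above fans decoding out through a thread pool and blocks on
# as_completed. The same pattern in isolation, with a hypothetical worker
# callable standing in for decoder_tpwd_once:
from concurrent.futures import ThreadPoolExecutor, as_completed

def decode_all_sketch(items: list, worker, max_rounds: int = 5) -> dict:
    results = {}
    with ThreadPoolExecutor(max_workers=8) as executor:
        rounds = 0
        while len(results) < len(items) and rounds < max_rounds:
            pending = [ii for ii in items if ii not in results]
            futures = {executor.submit(worker, ii): ii for ii in pending}
            for future in as_completed(futures):
                value = future.result()
                if value is not None:
                    results[futures[future]] = value
            rounds += 1
    return results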
def gatherproxy(self, types: int):
    """ load proxies downloaded from gatherproxy
        first download the proxy ip txt from:
        http://www.gatherproxy.com/zh/proxylist/country/?c=China
        types: 0 -> http only, 1 -> https only, 2 -> both, other -> raw lines
    """
    if not os.path.exists("{}gatherproxy".format(data_dir)):
        echo("0|warning", "Gather file not exist!!!")
        return
    file_d = read_file("{}gatherproxy".format(data_dir))
    waitjudge_http = ["http://" + ii for ii in file_d]
    waitjudge_https = ["https://" + ii for ii in file_d]
    if not types:
        self.waitjudge += waitjudge_http
    elif types == 1:
        self.waitjudge += waitjudge_https
    elif types == 2:
        self.waitjudge += waitjudge_http + waitjudge_https
    else:
        self.waitjudge += file_d
    echo("2|warning", "load gather over!")
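# The branch table above in isolation, as a plain function (names here are
# illustrative assumptions, not the repo's API):
def build_judge_list(proxies: list, types: int) -> list:
    http_list = ["http://" + ii for ii in proxies]
    https_list = ["https://" + ii for ii in proxies]
    if not types:
        return http_list
    if types == 1:
        return https_list
    if types == 2:
        return http_list + https_list
    return list(proxies)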