Example #1
def download_pic(pic_data):
    # pic_data is "title~url": folder name, then the image URL
    split = pic_data.split("~")
    pic_dir = pic_save_path + split[0] + "/"
    pic_url = split[1]
    t.is_dir_existed(pic_dir)
    # Retry with a fresh proxy until the download succeeds
    while True:
        proxy_ip = t.get_proxy_ip()
        print(proxy_ip)
        try:
            resp = requests.get(pic_url, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("Downloading image: " + resp.request.url)
                pic_name = pic_url.split("/")[-1]
                with open(pic_dir + pic_name, "wb") as f:
                    f.write(resp.content)
                return None
        except Exception as e:
            print(e)
Example #2
def download_pic(pic_data):
    # pic_data is "title~url": folder name, then the image URL
    split = pic_data.split("~")
    pic_dir = c.ZZS_FLS_MZT_SAVE_PATH + split[0] + "/"
    pic_url = split[1]
    t.is_dir_existed(pic_dir)
    # Retry with a fresh proxy until the download succeeds
    while True:
        proxy_ip = t.get_proxy_ip()
        print(proxy_ip)
        try:
            resp = requests.get(pic_url, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("Downloading image: " + resp.request.url)
                pic_name = pic_url.split("/")[-1]
                with open(pic_dir + pic_name, "wb") as f:
                    f.write(resp.content)
                return None
        except Exception as e:
            print(e)
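
Every snippet on this page leans on the helper module t, whose is_dir_existed is the function being cataloged. A minimal sketch of what the call sites imply (a hypothetical reconstruction, not the author's actual code): called as is_dir_existed(path) it creates the directory when missing; called with mkdir=False it only reports whether the path exists. get_proxy_ip presumably returns a requests-style proxies dict such as {"http": "http://host:port"}.

import os

def is_dir_existed(path, mkdir=True):
    # Hypothetical reconstruction inferred from the call sites above.
    if mkdir:
        # Ensure the directory exists, creating it on demand.
        if not os.path.exists(path):
            os.makedirs(path)
        return True
    # Probe-only mode: report whether the path (file or directory) exists.
    return os.path.exists(path)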
Example #3
def __init__(self):
    if not t.is_dir_existed(save_file, mkdir=False):
        # 1. Create the workbook
        self.workbook = xlwt.Workbook()
        # 2. Create the worksheet; the second argument controls whether a cell value may be overwritten
        self.sheet = self.workbook.add_sheet(u"Douban Music Top 250", cell_overwrite_ok=True)
        # 3. Initialize the header row
        self.headTitles = [u'Cover URL', u'Title', u'Artist', u'Release Date', u'Genre', u'Rating', u'Rating Count', u'Detail Page']
        for i, item in enumerate(self.headTitles):
            self.sheet.write(0, i, item, self.style('Monaco', 220, bold=True))
        self.workbook.save(save_file)
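
self.style is defined elsewhere in that file; a plausible sketch of such an xlwt helper (an assumption, not the author's code), where the font height is given in 1/20 pt, so 220 equals 11 pt:

import xlwt

def style(self, name, height, bold=False):
    # Build an xlwt cell style with the requested font.
    xf = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = name      # e.g. 'Monaco'
    font.height = height  # 1/20 pt units: 220 == 11 pt
    font.bold = bold
    xf.font = font
    return xf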
Example #4

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-seg_file", default='test.txt', help='seg_file')
    parser.add_argument("-title_file", default='test.txt', help='title_file')
    parser.add_argument('-output_file',
                        default='output.txt',
                        help='output_file')
    args = parser.parse_args()

    seg_file = args.seg_file
    title_file = args.title_file
    seg_text = []
    title_text = []

    # The with block closes each file; no explicit close() is needed
    with open(seg_file) as f:
        for line in f:
            seg_text.append(line)

    with open(title_file) as f:
        for line in f:
            title_text.append(line)

    t.is_dir_existed(c.outputs_documents_path)
    ppt_existed(ppt_file_name)
    presentation = Presentation(ppt_file_name)
    read_rules(presentation, rules_path)
    presentation.save(ppt_file_name)
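
A hedged invocation example (the script name merge_titles.py is hypothetical); argparse accepts the single-dash long options exactly as declared above:

python merge_titles.py -seg_file seg.txt -title_file titles.txt -output_file output.txt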
Example #5
            return None
    except Exception as e:
        pass


def fetch_meizi_pic(url):
    print("Parsing API: " + url)
    try:
        resp = requests.get(url).json()
        return resp['results']
    except Exception as e:
        print(e)


if __name__ == '__main__':
    t.is_dir_existed(pic_save_dir)
    t.is_dir_existed(c.outputs_logs_path)
    print("检测图片图片url文件是否存在:")
    if t.is_dir_existed(pic_urls_file, mkdir=False):
        print("url文件已存在!")
    else:
        print("url文件不存在,开始解析图片接口...")
        cur_page = 1
        while True:
            results = fetch_meizi_pic(api_url + str(cur_page))
            if results is not None and len(results) > 0:
                for result in results:
                    t.write_str_data(result['url'], pic_urls_file)
                cur_page += 1
            else:
                break
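
t.write_str_data appends records that t.load_list_from_file later reads back (see Examples #9, #14 and #15). A plausible one-record-per-line sketch (hypothetical, not the author's code):

def write_str_data(content, file_path):
    # Append one record per line so the file doubles as a resumable URL list.
    with open(file_path, "a+", encoding="utf-8") as f:
        f.write(content + "\n")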
Example #6
            visual_range=[10, 2500],
            visual_text_color="#fff",
            symbol_size=15,
            is_visualmap=True)
    return geo


# Word cloud of dating declarations
def draw_word_wc(name, count):
    wc = WordCloud(width=1300, height=620)
    wc.add("", name, count, word_size_range=[20, 100], shape='diamond')
    wc.render()


if __name__ == '__main__':
    if not t.is_dir_existed(result_save_file, mkdir=False):
        for i in range(1, 777):
            time.sleep(random.randint(2, 10))
            fetch_data(i)
    else:
        raw_data = pd.read_csv(result_save_file)
        word_result = word_pattern.sub("", ''.join(analysis_word(raw_data)))
        words = [
            word for word in jb.cut(word_result, cut_all=False)
            if len(word) >= 3
        ]
        exclude_words = [
            '一辈子', '不相离', '另一半', '业余时间', '性格特点', '茫茫人海', '男朋友', '找对象', '谈恋爱',
            '有时候', '女孩子', '哈哈哈', '加微信', '兴趣爱好', '是因为', '不良嗜好', '男孩子', '为什么',
            '没关系', '不介意', '没什么', '交朋友', '大大咧咧', '大富大贵', '联系方式', '打招呼', '有意者',
            '晚一点', '哈哈哈', '以上学历', '是不是', '给我发', '不怎么', '第一次', '越来越', '遇一人',
Example #7
    # Skill tags
    skill_list = []
    for skills in data['技能标签']:
        for skill in skills.strip().replace("[", "").replace("]", "").replace(
                "'", "").split(','):
            skill_list.append(skill)
    counter = dict(Counter(skill_list))
    # Drop the empty token and the dominant 'Android' tag so they don't skew the cloud;
    # the default argument avoids a KeyError when a token is absent
    counter.pop('', None)
    counter.pop('Android', None)
    make_wc(counter, pic_save_path + "wc_4.jpg")


# Process the data
if __name__ == '__main__':
    t.is_dir_existed(pic_save_path)
    if not t.is_dir_existed(result_save_file, mkdir=False):
        fetch_data(1)
        for cur_page in range(2, max_page + 1):
            # Sleep 5-15 s at random between pages
            time.sleep(random.randint(5, 15))
            fetch_data(cur_page)
    else:
        raw_data = pd.read_csv(result_save_file)
        # data_analysis(raw_data)
        # Filter e-commerce companies
        dzsw_result = raw_data.loc[raw_data["行业领域"].str.find("电子商务") != -1,
                                   ["行业领域", "公司全名"]]
        dzsw_result.to_csv(c.outputs_logs_path + "dzsw.csv",
                           header=False,
                           index=False,
Example #8
        paragraph.font.name = '微软雅黑'  # Microsoft YaHei
        paragraph.font.color.rgb = RGBColor(255, 255, 255)


# Read the rules file and dispatch to the matching template method
def read_rules(prs, filename):
    if os.path.exists(filename):
        with open(filename, 'r+', encoding='utf-8') as f:
            for rule in f:
                word_list = rule.replace('\n', '').split(',')
                if 'png' in rule or 'jpg' in rule:
                    if len(word_list) == 1:
                        model_1(prs, os.path.join(c.res_pictures, word_list[0]))
                    else:
                        model_3(prs, word_list[0], os.path.join(c.res_pictures, word_list[1]))
                else:
                    if len(word_list) == 1:
                        model_2(prs, word_list[0])
                    elif len(word_list) == 2:
                        model_4(prs, word_list[0], word_list[1])
                    elif len(word_list) == 4:
                        model_5(prs, word_list[0], word_list[1], word_list[2], word_list[3])


if __name__ == '__main__':
    t.is_dir_existed(c.outputs_documents_path)
    ppt_existed(ppt_file_name)
    presentation = Presentation(ppt_file_name)
    read_rules(presentation, rules_path)
    presentation.save(ppt_file_name)
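
The rule format implied by read_rules: each line is split on ','; a line mentioning png or jpg picks the picture templates (model_1 for one field, model_3 for title plus picture), anything else picks the text templates keyed by field count. A hypothetical rules.txt (the trailing # annotations are explanatory, not part of the file):

cover.png                # model_1: full-slide picture
Intro,banner.jpg         # model_3: title plus picture
A single headline        # model_2: one text field
Title,Subtitle           # model_4: two text fields
a,b,c,d                  # model_5: four text fields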
Example #9

# Fetch every image inside one album URL
def catch_pic_diagrams(url):
    resp = requests.get(url).content
    if resp is not None:
        soup = t.get_bs(resp)
        # Use the title as the folder name
        title = soup.select("h1.article-title a")[0].text
        imgs = soup.select('article.article-content img')
        for img in imgs[:-1]:
            t.write_str_data(title + "~" + str(img['src']),
                             c.ZZS_FLS_MZT_URL_FILE_PATH + c.ZZS_FLS_MZT_URL_FILE_NAME)


if __name__ == '__main__':
    t.is_dir_existed(c.ZZS_FLS_MZT_URL_FILE_PATH)
    cur_page = 1
    while True:
        results = catch_pic_diagrams_url(c.ZZS_FLS_MZT_URL + str(cur_page))
        if results is not None and len(results) > 0:
            for result in results:
                catch_pic_diagrams(result)
            cur_page += 1
        else:
            break
    # Load the download list
    data_list = t.load_list_from_file(c.ZZS_FLS_MZT_URL_FILE_PATH + c.ZZS_FLS_MZT_URL_FILE_NAME)
    pool = multiprocessing.Pool()
    pool.map(download_pic, data_list)
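
pool.map blocks until every download returns; using the pool as a context manager (Python 3.3+) is the tidy variant, sketched here:

with multiprocessing.Pool() as pool:
    pool.map(download_pic, data_list)  # blocks until all downloads finish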
Example #10
# Worker thread that drives page views (t here is the threading module)
class Reader(t.Thread):
    def __init__(self, t_name, func):
        self.func = func
        t.Thread.__init__(self, name=t_name)

    def run(self):
        self.func()


# Reading loop: request a random article URL forever
def reading():
    while True:
        read_article_url(url_list[random.randint(0, len(url_list) - 1)])


if __name__ == '__main__':
    print("判断文章链接文件是否存在:")
    if not tools.is_dir_existed(articles_file, mkdir=False):
        print("链接文件不存在,抓取链接...")
        count = int(get_page_count())
        for i in range(1, count + 1):
            get_article_url(base_article_list + str(i) + '?viewmode=contents')
    else:
        print("链接文件存在")
    print("加载文章链接文件...")
    url_list = tools.load_list_from_file(articles_file)
    for i in range(100):
        reader = Reader("线程" + str(i), reading)
        reader.start()
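
reading() loops forever, so the 100 threads never finish and Ctrl-C will not stop the script cleanly. A hedged variant (sketch) initializes through super() and marks the threads as daemons so the process can exit with the main thread:

class Reader(t.Thread):
    def __init__(self, t_name, func):
        # daemon=True: the thread dies when the main thread exits
        super().__init__(name=t_name, daemon=True)
        self.func = func

    def run(self):
        self.func()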
Example #12
              height=600, background_color='#404a59')
    attr, value = geo.cast(data)
    geo.add("", attr, value, visual_range=[10, 2500], visual_text_color="#fff",
            symbol_size=15, is_visualmap=True)
    return geo


# Word cloud of dating declarations
def draw_word_wc(name, count):
    wc = WordCloud(width=1300, height=620)
    wc.add("", name, count, word_size_range=[20, 100], shape='diamond')
    wc.render()


if __name__ == '__main__':
    if not t.is_dir_existed(result_save_file, mkdir=False):
        for i in range(1, 777):
            time.sleep(random.randint(2, 10))
            fetch_data(i)
    else:
        raw_data = pd.read_csv(result_save_file)
        word_result = word_pattern.sub("", ''.join(analysis_word(raw_data)))
        words = [word for word in jb.cut(word_result, cut_all=False) if len(word) >= 3]
        exclude_words = [
            '一辈子', '不相离', '另一半', '业余时间', '性格特点', '茫茫人海', '男朋友', '找对象',
            '谈恋爱', '有时候', '女孩子', '哈哈哈', '加微信', '兴趣爱好',
            '是因为', '不良嗜好', '男孩子', '为什么', '没关系', '不介意',
            '没什么', '交朋友', '大大咧咧', '大富大贵', '联系方式', '打招呼',
            '有意者', '晚一点', '哈哈哈', '以上学历', '是不是', '给我发',
            '不怎么', '第一次', '越来越', '遇一人', '择一人', '无数次',
            '符合条件', '什么样', '全世界', '比较简单', '浪费时间', '不知不觉',
Example #13
    salary_index = list(salary.index)
    salary_index.sort(key=lambda x: int(x))
    final_salary = salary.reindex(salary_index)
    plt.title("Salary bar chart")
    final_salary.plot(kind='bar', rot=0)
    plt.xlabel("Salary / K")
    plt.ylabel("Number of companies")
    plt.savefig(pic_save_path + 'result_7.jpg')
    plt.close(7)

    # Skill tags
    skill_list = []
    for skills in data['技能标签']:
        for skill in skills.strip().replace("[", "").replace("]", "").replace("'", "").split(','):
            skill_list.append(skill)
    counter = dict(Counter(skill_list))
    counter.pop('', None)  # drop the empty token left over by the split
    make_wc(counter, pic_save_path + "wc_4.jpg")


# Process the data
if __name__ == '__main__':
    t.is_dir_existed(pic_save_path)
    if not t.is_dir_existed(result_save_file, mkdir=False):
        fetch_data(1)
        for cur_page in range(2, max_page + 1):
            fetch_data(cur_page)
    else:
        raw_data = pd.read_csv(result_save_file)
        data_analysis(raw_data)
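
The reindex at the top of this example sorts a string-valued index numerically before plotting; a tiny self-contained illustration with made-up numbers:

import pandas as pd

salary = pd.Series({'2': 8, '10': 5, '15': 3})  # company counts keyed by salary string
final_salary = salary.reindex(sorted(salary.index, key=int))
print(final_salary)  # ordered 2, 10, 15 instead of the lexicographic 10, 15, 2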
Example #14

# Fetch every image inside one album URL
def catch_pic_diagrams(url):
    resp = requests.get(url).content
    if resp is not None:
        soup = t.get_bs(resp)
        # Use the title as the folder name
        title = soup.select("h1.article-title a")[0].text
        imgs = soup.select('article.article-content img')
        for img in imgs[:-1]:
            t.write_str_data(title + "~" + str(img['src']),
                             file_save_path)


if __name__ == '__main__':
    t.is_dir_existed(c.outputs_logs_path)
    cur_page = 1
    while True:
        results = catch_pic_diagrams_url(index_url + str(cur_page))
        if results is not None and len(results) > 0:
            for result in results:
                catch_pic_diagrams(result)
            cur_page += 1
        else:
            break
    # Load the download list
    data_list = t.load_list_from_file(file_save_path)
    pool = multiprocessing.Pool()
    pool.map(download_pic, data_list)
Example #15
    return url_list


# Fetch every image inside one album URL
def catch_pic_diagrams(url):
    resp = requests.get(url).content
    if resp is not None:
        soup = t.get_bs(resp)
        # Use the title as the folder name
        title = soup.select("h1.article-title a")[0].text
        imgs = soup.select('article.article-content img')
        for img in imgs[:-1]:
            t.write_str_data(title + "~" + str(img['src']), file_save_path)


if __name__ == '__main__':
    t.is_dir_existed(c.outputs_logs_path)
    cur_page = 1
    while True:
        results = catch_pic_diagrams_url(index_url + str(cur_page))
        if results is not None and len(results) > 0:
            for result in results:
                catch_pic_diagrams(result)
            cur_page += 1
        else:
            break
    # Load the download list
    data_list = t.load_list_from_file(file_save_path)
    pool = multiprocessing.Pool()
    pool.map(download_pic, data_list)