Beispiel #1
0
def start_detail_spider(account, start_index):
    """Start a headless Chrome session on the muchong.com forum and set its cookies.

    The function opens a Mongo session, looks up the ``detail`` collection of
    ``xiaomuchong_db``, launches Chrome from ``webdriver/chromedriver.exe``,
    loads the forum main page and then adds the ``_discuz_uid`` and
    ``_discuz_pw`` cookies to that browser session.

    Args:
        account: Selects the ``_discuz_uid`` value; ``1`` picks the first
            uid, any other value picks the second one.
        start_index: Not used in the portion of the function visible here.
    """
    mongo_client = db_tool.create_mongo_session()
    database = mongo_client.get_database('xiaomuchong_db')
    # NOTE(review): not referenced again in the visible part of this function.
    detail_session = database.get_collection('detail')

    browser_options = Options()
    for flag in ('--headless', '--disable-gpu'):
        browser_options.add_argument(flag)
    driver = webdriver.Chrome(executable_path='webdriver/chromedriver.exe', options=browser_options)

    # The forum page is loaded first; the cookies are added to the session afterwards.
    driver.get(url='http://muchong.com/bbs/')
    uid = '24725517' if account == 1 else '24724882'
    driver.add_cookie({'name': '_discuz_uid', 'value': uid})
    driver.add_cookie({'name': '_discuz_pw', 'value': '789cb43e40ab0032'})
 def __init__(self, account: int):
     """Create the Mongo session and a headless Chrome driver with cookies set.

     The driver loads the muchong.com forum main page and then receives the
     ``_discuz_uid`` and ``_discuz_pw`` cookies.

     Args:
         account: Index into the built-in list of four uid strings; the
             selected entry becomes the ``_discuz_uid`` cookie value.

     Raises:
         IndexError: If ``account`` is outside the range of the uid list.
     """
     self.__mongo = db_tool.create_mongo_session()

     options = Options()
     for flag in ('--headless', '--disable-gpu'):
         options.add_argument(flag)
     self.__driver = webdriver.Chrome(
         executable_path='webdriver/chromedriver.exe', options=options)
     self.__driver.get(url='http://muchong.com/bbs/')

     known_uids = ['25073725', '25072651', '24723952', '24724882']
     # The uid lookup happens before any cookie is added, so a bad index
     # leaves the browser session without either cookie.
     cookies = (
         ('_discuz_uid', known_uids[account]),
         ('_discuz_pw', '789cb43e40ab0032'),
     )
     for cookie_name, cookie_value in cookies:
         self.__driver.add_cookie({'name': cookie_name, 'value': cookie_value})
 def __init__(self):
     """Bind the ``detail`` collection and load all of its documents into memory.

     ``self.__session`` is the ``detail`` collection of ``xiaomuchong_db``;
     ``self.__all_dicts`` is a list built from an unfiltered ``find()`` on it.
     """
     client = db_tool.create_mongo_session()
     database = client.get_database('xiaomuchong_db')
     self.__session = database.get_collection('detail')
     cursor = self.__session.find()
     self.__all_dicts = list(cursor)
from bs4 import BeautifulSoup
import os
import re
import db_tool
from tqdm import tqdm
import shutil
import pymongo.errors

# Script: walk the saved detail pages on disk, parse each one with
# BeautifulSoup and derive a post id plus quote information for every post
# element. NOTE(review): the script is cut off at the end of this view.

# Collection 'detail' in database 'xiaomuchong_db'; not referenced again in
# the portion of the script visible here.
detail_session = db_tool.create_mongo_session().get_database(
    'xiaomuchong_db').get_collection('detail')
# Every directory entry is treated as one saved page to parse.
source_files = os.listdir('E:/Xiaomuchong/DetailPages')
for source_file in tqdm(source_files):
    source_file_name = 'E:/Xiaomuchong/DetailPages/' + source_file
    # The text before the first '-' in the file name is used as the topic id.
    topic_id = source_file.split('-')[0]
    reader = open(source_file_name, 'r', encoding='utf-8')
    # Despite the name, `lines` holds the whole file as one string.
    lines = reader.read()
    reader.close()
    soup = BeautifulSoup(lines, 'lxml')
    # Select every element whose id attribute matches 'pid' followed by at
    # least one more character.
    id_re = re.compile('pid.+')
    content_elements = soup.find_all(attrs={'id': id_re})
    for content_element in content_elements:
        # Drop the first three characters of the id.
        # NOTE(review): assumes the id starts with 'pid' - the pattern above
        # is not anchored, so confirm against real page markup.
        pid = content_element['id'][3:]
        # Post ids are built as '<topic_id>_<pid>'.
        post_id = topic_id + '_' + pid
        post_content_element = content_element.find(class_='t_fsz')
        # Elements without a 't_fsz' descendant are skipped.
        if not post_content_element:
            continue
        # A <fieldset> inside the post content is treated as a quote block.
        quote_element = post_content_element.find('fieldset')
        # Default quote pid when no quote is found.
        quote_pid = 0
        if quote_element:
            quote_text_element = quote_element.find('a')
            # NOTE(review): the body of this branch is missing from this view.
            if not quote_text_element:
Beispiel #5
0
import db_tool

# Count the documents in the 'detail' collection whose 'poster_major' field
# equals the literal '未知', and print that count.
session = db_tool.create_mongo_session()

_topic_db = session.get_database('xiaomuchong')
topic_session = _topic_db.get_collection('topic')

_detail_db = session.get_database('xiaomuchong')
detail_session = _detail_db.get_collection('detail')

_unknown_major_filter = {'poster_major': '未知'}
_unknown_major_cursor = detail_session.find(_unknown_major_filter)
# The matching documents are materialised so they stay available as a list.
no_major = list(_unknown_major_cursor)
print(len(no_major))
import db_tool

# Script: read a space-separated word-embedding text file line by line and
# insert the entries into the 'chinese_word2vec' collection of 'tool_db' in
# batches of 20000 documents.
session = db_tool.create_mongo_session().get_database('tool_db').get_collection('chinese_word2vec')
reader = open('C:/Users/macha/Downloads/Tencent_AILab_ChineseEmbedding.txt', 'r', encoding='utf-8')
# The first line is read and discarded (presumably a header line - TODO confirm).
reader.readline()
# Documents collected since the last insert.
line_count = 0
# Documents collected since the start of the script.
total_line_count = 0
# Pending batch of {'word': ..., 'vector': [...]} documents.
word2vec_dicts = []
# NOTE(review): no `break` is visible in this view. readline() returns ''
# at end of file, which would produce word '' with an empty vector, so the
# loop appears never to terminate - confirm against the rest of the file.
while True:
    line = reader.readline()
    # The first space-separated field is the word; the rest are the vector.
    word = line.split(' ')[0]
    vector = line.split(' ')[1:]
    vector = list(map(lambda o: float(o), vector))
    word2vec_dict = dict(word=word, vector=vector)
    word2vec_dicts.append(word2vec_dict)
    line_count += 1
    total_line_count += 1
    # Flush a full batch to MongoDB, then start collecting the next one.
    # NOTE(review): a final batch smaller than 20000 is never inserted and
    # the reader is never closed in the visible code.
    if line_count == 20000:
        session.insert_many(word2vec_dicts)
        word2vec_dicts.clear()
        line_count = 0
        # Progress message with the running total of lines handled so far.
        print('已完成{0}行'.format(str(total_line_count)))