Example 1
def get_base_url(self):
    # Return a random proxy base URL when proxying is enabled,
    # otherwise the default Weibo mobile site.
    if self.use_proxy:
        return get_random_proxy()
    else:
        return "https://weibo.cn"
Example 2
import os

# r (the Redis client), PROXY_BASEURL and get_random_proxy are defined
# elsewhere in the project; only the seed handling is shown here.

# Read seed user IDs and full URLs from sina/seeds.txt.
file_path = os.path.join(os.getcwd(), 'sina', 'seeds.txt')
start_uids = []
start_urls = []
with open(file_path, 'r') as f:
    for line in f:
        line = line.strip()
        if line and line[0].isdigit():
            # Lines such as "1234567890  # note" carry a user ID before the '#'.
            userid = line.split('#')[0].strip()
            start_uids.append(userid)
        elif line.startswith("http"):
            start_urls.append(line)

# Push a profile-page URL for each seed user ID to the Redis start-URL queue.
for uid in start_uids:
    if PROXY_BASEURL:
        base_url = get_random_proxy()
    else:
        base_url = "https://weibo.cn"
    start_url = base_url + ("/%s/info" % uid)
    print("[DEBUG] URL: " + start_url)
    r.lpush('weibo_user_profile_spider:start_urls', start_url)

# Push any full URLs from the seed file, swapping the host for a proxy when configured.
for url in start_urls:
    if PROXY_BASEURL:
        url = url.replace("https://weibo.cn", get_random_proxy())
    print("[DEBUG] URL: " + url)
    r.lpush('weibo_user_profile_spider:start_urls', url)

print('Redis initialized')
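The parser above accepts two kinds of seed lines: a numeric user ID, optionally followed by a '#' comment, or a full URL starting with "http". An illustrative sina/seeds.txt (the IDs below are made up) could look like this:

1234567890
2345678901  # optional note after the ID
https://weibo.cn/1234567890/info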
Example 3
from pymongo import MongoClient

# LOCAL_MONGO_HOST, LOCAL_MONGO_PORT, DB_NAME and PROFILE_GROUP come from the project settings.
client = MongoClient(LOCAL_MONGO_HOST, LOCAL_MONGO_PORT)
profiles_collection = client[DB_NAME]['user_profiles']

# Select profiles whose current timeline crawl is not finished yet,
# optionally restricted to a single profile group.
query = {"timelineCrawlJob_current_complete": False}
if PROFILE_GROUP > 0:
    query["group"] = PROFILE_GROUP
seeds = profiles_collection.find(query)

print(profiles_collection.count_documents(query), "profiles found")
# Push the next timeline page to crawl for each pending profile.
for seed in seeds:
    if PROXY_BASEURL:
        base_url = get_random_proxy()
    else:
        base_url = "https://weibo.cn"
    start_url = base_url + '/{}?page={}'.format(
        seed['_id'], seed['timelineCrawlJob_current_page'])
    print("[DEBUG] start url: " + start_url)
    r.lpush('weibo_user_timeline_spider:start_urls', start_url)

print('Redis initialized')

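These scripts only fill the Redis queues; the crawling side is expected to be a scrapy-redis spider that pops URLs from the same key. A minimal sketch of that consumer, assuming scrapy-redis is used (the class name and parse body are placeholders; only the redis_key matches the script above):

from scrapy_redis.spiders import RedisSpider

class UserTimelineSpider(RedisSpider):
    name = 'weibo_user_timeline_spider'
    # scrapy-redis pops start URLs from this Redis list, which the
    # script above populates with r.lpush().
    redis_key = 'weibo_user_timeline_spider:start_urls'

    def parse(self, response):
        # Placeholder: the real spider would parse the timeline page here.
        pass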
Example 4
import random

# Fetch a batch of statuses whose images have not been crawled yet (img_crawl_status == 0).
query = {"img_crawl_status": 0}
mydoc = collection.find(query).limit(CRAWL_BATCH_SIZE)

# Count with the batch limit applied, matching the old cursor.count(True).
queued = min(collection.count_documents(query), CRAWL_BATCH_SIZE)
print("Number of queued URLs: " + str(queued))

for x in mydoc:
    if x["img_truncated"] == False:
        # Not truncated: build a direct image URL on one of the wx*.sinaimg.cn servers.
        img_id = x["multi_img_ids"]
        if PROXY_BASEURL:
            image_server_number = random.randint(1, 4)
            base_url = get_random_proxy("http://wx%d.sinaimg.cn/" % image_server_number)
        else:
            base_url = "http://wx1.sinaimg.cn"

        # img_url = x["single_img_url"].replace("https://weibo.cn", base_url)
        img_url = base_url + "/large/" + img_id
        # print("[DEBUG] url: " + img_url)
        r.lpush('weibo_image_spider:start_urls', img_url)
    else:
        # Truncated: queue the multi-image page URL, rewriting the host to a proxy when configured.
        if PROXY_BASEURL:
            base_url = get_random_proxy("https://weibo.cn/")
        else:
            base_url = "https://weibo.cn"
        multi_img_url = x["multi_imgs_page_url"].replace("https://weibo.cn", base_url)
        # print("[DEBUG] url: " + multi_img_url)
        r.lpush('weibo_image_spider:start_urls', multi_img_url)
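For context, the fields read by the loop above imply status documents shaped roughly like the one below; every value is invented and the real schema likely contains more fields:

example_status = {
    "img_crawl_status": 0,        # 0 = images not crawled yet
    "img_truncated": False,       # True when the status links to a multi-image page
    "multi_img_ids": "abc123xyz",                                     # placeholder image ID
    "multi_imgs_page_url": "https://weibo.cn/some/multi-image/page",  # used when img_truncated is True
    # "single_img_url" also appears (commented out) in the snippet above.
}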