# Build one paginated Entry-query URL per page of every tag found in
# `jsons['results']`, then store each URL as a pending SpiderTask row.

# Number of entries requested per page; also the step of the `skip` offset.
PAGE_SIZE = 20

# Query URL template: tag title, page size (limit) and offset (skip).
# Defined once because it does not depend on the tag being processed.
fmt = """https://api.leancloud.cn/1.1/classes/Entry?&where={"tagsTitleArray":{"$in":["%s"]}}&include=user&limit=%d&skip=%d&order=-rankIndex"""

tags = jsons['results']
urls = []
for tag in tags:
    total = int(tag['entriesCount'])
    # One URL per page: skip = 0, PAGE_SIZE, 2 * PAGE_SIZE, ... while < total.
    for skip in range(0, total, PAGE_SIZE):
        page_url = fmt % (tag['title'], PAGE_SIZE, skip)
        print(page_url)
        urls.append(page_url)

db = Session()
for url in urls:
    task = SpiderTask()
    task.project = 'juejin'
    # The task id is derived from the URL via md5str.
    task.task_id = md5str(url)
    task.url = url
    task.callback = 'on_index_page'
    task.priority = 0
    task.last_time = 0
    task.status = 0
    db.add(task)
    # NOTE(review): the original indentation was lost, so it is unclear
    # whether the commit ran per task or once after the loop. It is kept
    # per task here -- confirm the intended placement.
    db.commit()
    print(url)
CREATE TABLE `wec_topic` ( `id` int(11) NOT NULL AUTO_INCREMENT, `title` varchar(255) NOT NULL, `uuid` varchar(32) NOT NULL, `url` varchar(1000) NOT NULL, `view_count` int default 0, `content` MEDIUMTEXT, `created_at` datetime DEFAULT NULL, `updated_at` datetime DEFAULT NULL, PRIMARY KEY (`id`), KEY `ix_task_id` (`uuid`) ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8; """ db = Session() tasks = SpiderTask.query.filter_by(status=200, callback='on_entry_page').all() for task in tasks: res = SpiderResult.query.filter_by(task_id=task.task_id).one() if res is None: print("Error, not found result %s" % task.task_id) # continue if task.result: print("save res %s" % task.task_id) resjson = json.loads(task.result) cnt = resjson['collectionCount'] title = resjson['title'] content = json.loads(res.content) task_id = res.task_id post = PostTopic()
fp.write(spider) else: print("path %s already exists." % (filename)) config_filename="projects/%s/project.yaml"%project_name if not os.path.exists(config_filename): with open(config_filename, "w+") as fp: fp.write(project_config) init_filename="projects/%s/__init__.py"%project_name if not os.path.exists(init_filename): with open(init_filename, "w+") as fp: fp.write("") from easyspider.db import SpiderProject, Session db = Session() project = SpiderProject.query.filter_by(name=project_name).first() if project is None: project = SpiderProject() project.name = project_name project.status = 0 project.process = '{}' project.queue_name= project_name db.add(project) db.commit()
from easyspider.db import Session , ScopedSession from sqlalchemy import Column, DateTime, String, Integer, ForeignKey, func from sqlalchemy.orm import relationship, backref from sqlalchemy.ext.declarative import declarative_base from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker from sqlalchemy.orm.scoping import scoped_session engine = create_engine("mysql://*****:*****@localhost/spider?charset=utf8") scopedsession = scoped_session(sessionmaker(bind=engine)) Base = declarative_base() #SpiderTask.query.filter(SpiderTask.project=='test').all() db = Session() db.query(SpiderTask).filter(SpiderTask.project=='test').all() ScopedSession = scoped_session(sessionmaker()) q = scopedsession.query_property() q2 = scopedsession.query_property() print("scoped session", q, q==q2) class Bar(object): """docstring for Bar""" def __init__(self):