# --- tail of a parse_* helper: its "def" line lies outside this chunk ---
# Builds a source-artifact record for the parsed company and persists it.
source_artifact = {
    "sourceCompanyId": source_company_id,
    "name": name,
    "description": None,
    "link": link,
    "type": 4010  # NOTE(review): magic artifact-type code — meaning presumably defined by parser_util schema, confirm
}
parser_util.insert_source_artifact(source_artifact)
return source_company_id


if __name__ == '__main__':
    # Consumer entry point for the "neitui" recruiting crawler: reads job
    # messages from Kafka and dispatches each one to parse_job().
    (logger, fromdb, kafka_producer, kafka_consumer) = parser_util.parser_init("recruit_neitui", "crawler_recruit_neitui")
    while True:
        try:
            for message in kafka_consumer:
                try:
                    # logger.info("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                    #                                            message.offset, message.key,
                    #                                            message.value))
                    msg = json.loads(message.value)
                    type = msg["type"]  # NOTE(review): shadows the builtin `type`
                    job_key = msg["job_key"]
                    if type == "job":
                        parse_job(job_key)
# NOTE(review): fragment truncated here — the except/retry handlers that close
# these try blocks are outside this view.
# --- tail of a member-parsing helper: its "def" line lies outside this chunk ---
# Links a parsed team member to the source company and persists the pair.
source_company_member_rel = {
    "sourceCompanyId": source_company_id,
    "position": m.get("position"),
    "joinDate": None,   # join/leave dates not available from this source
    "leaveDate": None,
    "type": type        # NOTE(review): `type` here shadows the builtin
}
parser_util.insert_source_member(source_member, source_company_member_rel)


if __name__ == '__main__':
    # Consumer entry point for the 36kr crawler: reads company messages from
    # Kafka and dispatches each one to parse_company().
    (logger, fromdb, kafka_producer, kafka_consumer) = parser_util.parser_init("kr36", "crawler_kr36_v2")
    while True:
        try:
            for message in kafka_consumer:
                try:
                    logger.info("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                                               message.offset, message.key,
                                                               message.value))
                    msg = json.loads(message.value)
                    type = msg["type"]
                    company_key = msg["company_key"]
                    if type == "company":
                        parse_company(company_key)
                except Exception, e:
# NOTE(review): fragment truncated here — the body of this except clause (and
# the outer one) is outside this view.
# --- tail of a tuple-unpacking assignment: the opening "(logger, ..." lies outside this chunk ---
news_collection) = parser_util.parser_news_init("toutiao", "toutiao")
i = 0          # NOTE(review): i/threads/msgs are initialised but never used in this visible fragment
threads = []
msgs = []
# Consume toutiao news messages forever; per-message and consumer-level
# failures are logged and the loop continues.
while True:
    try:
        for message in kafka_consumer:
            try:
                # logger.info("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                #                                            message.offset, message.key,
                #                                            message.value))
                msg = json.loads(message.value)
                type = msg["type"]  # NOTE(review): shadows the builtin `type`
                news_key = msg["news_key"]
                if type == "direct_news":
                    parse_news(news_key)
                # kafka_consumer.task_done(message)
                # kafka_consumer.commit()
            except Exception, e:  # Python 2 except syntax; bad message is logged and skipped
                logger.error(e)
                traceback.print_exc()
    except Exception, e:
        logger.error(e)
        traceback.print_exc()
        time.sleep(60)  # NOTE(review): placement reconstructed — presumably a back-off before retrying the consumer
# NOTE(review): the loop above never breaks, so this re-init is unreachable as
# written; it appears to begin a second script's main section pasted after the first.
(logger, fromdb, kafka_producer, kafka_consumer) = parser_util.parser_init("toutiao", "toutiao")
i = 0          # NOTE(review): i/threads/msgs are initialised but never used in this visible fragment
threads = []
msgs = []
# Consume news messages forever; unlike the sibling loop elsewhere in this
# file, this variant acknowledges each message (task_done + commit).
while True:
    try:
        for message in kafka_consumer:
            try:
                logger.info("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                                           message.offset, message.key,
                                                           message.value))
                msg = json.loads(message.value)
                type = msg["type"]  # NOTE(review): shadows the builtin `type`
                news_key = msg["news_key"]
                if type == "direct_news":
                    parse_news(news_key)
                kafka_consumer.task_done(message)  # mark processed, then persist the offset
                kafka_consumer.commit()
            except Exception, e:  # bad message is logged and skipped
                logger.error(e)
                traceback.print_exc()
    except Exception, e:
        logger.error(e)
        traceback.print_exc()
        time.sleep(60)  # NOTE(review): placement reconstructed — presumably a back-off before retrying the consumer
# NOTE(review): the loop above never breaks, so this re-init is unreachable as
# written; it appears to begin the pencil_news script's main section pasted after it.
(logger, fromdb, kafka_producer, kafka_consumer) = parser_util.parser_init("pencil_news", "pencil_news")
# --- tail of a source_job dict literal: the opening "source_job = {" lies outside this chunk ---
"salary": salary,
"description": desc,
"domain": domain,
"locationId": location_id,
"educationType": education_type,
"workYearType": workYear_type,
"startDate": born_time,
"updateDate": update_time,
}
parser_util.insert_source_job(source_job)


if __name__ == '__main__':
    # Consumer entry point for the jobtong recruiting crawler: reads company
    # messages from Kafka and dispatches each one to parse_company().
    (logger, fromdb, kafka_producer, kafka_consumer) = parser_util.parser_init("recruit_jobtong", "crawler_recruit_jobtong")
    while True:
        try:
            for message in kafka_consumer:
                try:
                    logger.info("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                                               message.offset, message.key,
                                                               message.value))
                    msg = json.loads(message.value)
                    type = msg["type"]  # NOTE(review): shadows the builtin `type`
                    company_key = msg["company_key"]
                    if type == "company":
                        parse_company(company_key)
# NOTE(review): fragment truncated here — the except handlers that close these
# try blocks are outside this view.
# --- tail of a source_cf_document dict literal: its opening lies outside this chunk ---
'description': None,
'link': link,
'rank': rank,
'sourceId': source_id,
'type': 9030  # NOTE(review): magic document-type code — meaning presumably defined by parser_util schema, confirm
}
parser_util.insert_source_cf_document(source_cf_document)
# Notify the downstream parser that a new crowdfunding record is ready.
msg = {"type": "cf", "id": source_cf_id}
kafka_producer.send_messages("parser_v2", json.dumps(msg))


if __name__ == '__main__':
    # Consumer entry point for the 36kr crowdfunding crawler: reads cf
    # messages from Kafka and dispatches each one to parse_cf().
    (logger, fromdb, kafka_producer, kafka_consumer) = parser_util.parser_init("cf_36kr", "crawler_cf_36kr_v2")
    while True:
        try:
            for message in kafka_consumer:
                try:
                    logger.info("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                                               message.offset, message.key,
                                                               message.value))
                    msg = json.loads(message.value)
                    type = msg["type"]  # NOTE(review): shadows the builtin `type`
                    cf_key = msg["cf_key"]
                    if type == "cf":
                        parse_cf(cf_key)
# NOTE(review): fragment truncated here — the except handlers that close these
# try blocks are outside this view.
def parse_patch(cf_key):
    """Patch the description of an already-parsed crowdfunding record.

    Looks up the crawled item by (source, cf_key) in Mongo; if found, writes
    its desc onto the existing source_cf row and emits a "jd_patch" message
    so the downstream parser re-processes it. No-op when the item is absent.
    """
    # `source` and `fromdb` are module-level — defined outside this fragment.
    item = fromdb.cf.find_one({"source": source, "cf_key": cf_key})
    if item is None:
        return
    desc = item["desc"]
    source_cf_id = parser_util.update_source_cf_desc(cf_key, desc)
    msg = {"type": "jd_patch", "id": source_cf_id}
    kafka_producer.send_messages("parser_v2", json.dumps(msg))


if __name__ == '__main__':
    # Consumer entry point for the jd crowdfunding crawler: reads cf messages
    # from Kafka and dispatches each one to parse_cf().
    (logger, fromdb, kafka_producer, kafka_consumer) = parser_util.parser_init("cf_jd", "crawler_cf_jd_v2")
    while True:
        try:
            for message in kafka_consumer:
                try:
                    logger.info("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                                               message.offset, message.key,
                                                               message.value))
                    msg = json.loads(message.value)
                    type = msg["type"]  # NOTE(review): shadows the builtin `type`
                    cf_key = msg["cf_key"]
                    if type == "cf":
                        parse_cf(cf_key)
# NOTE(review): fragment truncated here — the except handlers that close these
# try blocks (and any dispatch to parse_patch) are outside this view.
# --- tail of a source_job dict literal: the opening "source_job = {" lies outside this chunk ---
"position": position,
"salary": salary,
"description": None,
"domain": domain,
"locationId": location_id,
"educationType": education_type,
"workYearType": workYear_type,
"startDate": born_time,
"updateDate": update_time,
}
parser_util.insert_source_job(source_job)


if __name__ == '__main__':
    # Consumer entry point for the lagou recruiting crawler.
    (logger, fromdb, kafka_producer, kafka_consumer) = parser_util.parser_init("recruit_lagou", "crawler_recruit_lagou")
    i = 0          # NOTE(review): i/threads/msgs are initialised but never used in this visible fragment
    threads = []
    msgs = []
    while True:
        try:
            for message in kafka_consumer:
                try:
                    logger.info("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                                               message.offset, message.key,
                                                               message.value))
                    msg = json.loads(message.value)
                    type = msg["type"]  # NOTE(review): shadows the builtin `type`
                    company_key = msg["company_key"]
# NOTE(review): fragment truncated here — the dispatch on `type` and the
# except handlers are outside this view.
# --- tail of a sys.path.append(os.path.join( call: its opening lies outside this chunk ---
os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import requests
import util
import parser_util

if __name__ == '__main__':
    # Utility script: drain a crawler topic, acknowledging (task_done + commit)
    # every "company" message WITHOUT parsing it.
    # Usage: consume_messages.py <parser_name> <msg_topic>
    if len(sys.argv) > 2:
        parser_name = sys.argv[1]
        msg_name = sys.argv[2]
    else:
        print "usage: /opt/py-env/bin/python consume_messages.py recruit_jobtong crawler_recruit_jobtong "
        exit(0)
    (logger, fromdb, kafka_producer, kafka_consumer) = parser_util.parser_init(parser_name, msg_name)
    for message in kafka_consumer:
        logger.info("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                                   message.offset, message.key,
                                                   message.value))
        msg = json.loads(message.value)
        type = msg["type"]  # NOTE(review): shadows the builtin `type`
        company_key = msg["company_key"]
        # NOTE(review): only "company" messages are acknowledged; offsets for
        # any other message type are never committed — confirm this is intended.
        if type == "company":
            kafka_consumer.task_done(message)
            kafka_consumer.commit()