def process_target(queue, cli_args, multi_log_lock):
    """Worker-subprocess entry point.

    Initializes per-process state (logger, PETR config, dictionaries, DB
    session), then loops forever consuming tasks from the shared queue.

    Args:
        queue: multiprocessing.Queue of task items filled by the main process.
        cli_args: parsed command-line args (config / nullverbs / nullactors / outputs).
        multi_log_lock: multiprocessing.Lock guarding the shared log file.
    """
    # Announce subprocess startup in the shared multiprocess log.
    write_multiprocess_log(
        multi_log_lock,
        '{}Process {}: {}'.format(u'', os.getpid(), u'started.'))

    # The subprocess first loads everything it needs to run; each worker
    # keeps its own copy of this state.
    utilities.init_logger()
    logger = logging.getLogger('petr_log')
    PETRglobals.RunTimeString = time.asctime()

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        # FIX: this log line previously said "null verbs mode" -- a copy-paste
        # slip from the branch above; it now matches the print() above it.
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    print('\n\n')

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    # One database session per worker process.
    session = Session()

    while True:
        # FIX: qsize is a method -- the original tested `queue.qsize > 0`,
        # comparing the bound method itself, which is always truthy (Py2), so
        # the idle/sleep branch could never run.
        if queue.qsize() > 0:
            # Fetch the next task from the shared queue.
            task = queue.get()
            # Log that a task was obtained.
            write_multiprocess_log(
                multi_log_lock,
                '{}Process {} get one task: {}'.format(u'', os.getpid(), task))
            # Execute the task.
            process_task(task, out, multi_log_lock, session)
        else:
            # Queue empty: back off briefly, randomized so idle workers
            # do not all poll in lockstep.
            time.sleep(0.5 * random.random())
            continue
def do_write_db_multiprocess(session, obj_list, multi_log_lock):
    """Persist obj_list through the given session, reporting the outcome
    to the shared multiprocess log.

    Returns True when the commit succeeded, False otherwise. The session
    is always closed before returning.
    """
    pid = os.getpid()
    err_ignore = ""
    err_replace = ""
    succeeded = True

    # Stage every object on the session, then commit them as one transaction.
    session.add_all(obj_list)
    try:
        session.commit()
    except Exception as exc:
        # Undo the failed transaction and capture the error text. Two decoded
        # variants are kept: "ignore" for the console, "replace" for the log,
        # so undecodable bytes cannot break either sink.
        session.rollback()
        succeeded = False
        err_ignore = '{}'.format(exc).decode("utf-8", errors="ignore")
        err_replace = '{}'.format(exc).decode("utf-8", errors="replace")
        print(err_ignore)
        write_multiprocess_log(
            multi_log_lock,
            u'Process ' + unicode(pid) + u': ' + err_replace)
    else:
        # Commit went through -- record the success.
        write_multiprocess_log(
            multi_log_lock,
            'Process {}: {}'.format(pid, u"Write to database successfully."))
    finally:
        # Close the session whether or not the commit worked.
        if session:
            session.close()
    return succeeded
def write_events(updated_events, multi_log_lock, session, multi_process_flag=False):
    """Write coded events to the database, then flag the stories in Solr.

    Args:
        updated_events: dict keyed by story_id; each value holds the story's
            "meta" (title/date/source/url) and its "sents" mapping of
            sentence_no -> sentence dict (with optional "events").
        multi_log_lock: lock guarding the shared multiprocess log file.
        session: DB session, used only on the multiprocess path.
        multi_process_flag: True -> do_write_db_multiprocess + multiprocess log;
            False -> do_write_db + the standard logging module.
    """
    records_insert = []
    for story_id in updated_events:
        # Story-level metadata. These locals are currently unused below, but
        # the lookups double as a schema check on the "meta" dict.
        story_meta = updated_events[story_id]["meta"]
        story_title = story_meta["title"]
        story_date = story_meta["date"]
        story_source = story_meta["source"]
        story_url = story_meta["url"]

        # All sentences in the story.
        sents = updated_events[story_id]["sents"]
        # A story discarded during do_coding produces no records.
        # FIX: was `break`, which silently dropped every *remaining* story
        # as well; `continue` skips only the discarded one.
        if sents is None:
            continue

        # sent_no is the sentence's index within the story.
        for sent_no in sents:
            sent = sents[sent_no]
            # Sentences without events contribute nothing.
            if "events" not in sent:
                continue
            # The sentence's original content.
            sent_content = sent['content']
            # All events extracted from the sentence.
            sent_events = sent['events']
            # event_no is the event's index within the sentence.
            for event_no, event in enumerate(sent_events):
                one_record = generate_one_record(sent, event, story_id,
                                                 sent_no, event_no,
                                                 sent_content)
                if one_record:
                    records_insert.append(one_record)

    # Persist to the database, then mark each story as done in Solr.
    if records_insert:
        if multi_process_flag:
            flag = do_write_db_multiprocess(session, records_insert,
                                            multi_log_lock)
            if flag:
                for story_id in updated_events:
                    result = write_to_solr(story_id)
                    if result is False:
                        write_multiprocess_log(
                            multi_log_lock,
                            u'Process ' + unicode(os.getpid()) + u': ' +
                            u'Something goes wrong in solr write, please check the log1' +
                            u'id:story_id:%s' % story_id)
        else:
            flag = do_write_db(records_insert)
            if flag:
                for story_id in updated_events:
                    result = write_to_solr(story_id)
                    if result is False:
                        # FIX: logging.warn is a deprecated alias; use warning.
                        logging.warning(
                            'something goes wrong in solr write,please check the log2' +
                            'id:story_id:%s' % story_id)
    else:
        # No records generated: still mark a story as processed in Solr.
        # NOTE(review): story_id here leaks from the loop above, so only the
        # *last* story is flagged, and this raises NameError when
        # updated_events is empty; ','.join(records_insert) is always '' on
        # this path. Preserved (behavior unclear from this file) -- confirm
        # whether this should iterate all stories instead.
        result = write_to_solr(story_id)
        if result is False:
            write_multiprocess_log(
                multi_log_lock,
                u'Process ' + unicode(os.getpid()) + u': ' +
                u'Something goes wrong in solr write, please check the log3' +
                u'id:story_id:%s' % story_id +
                u' records_insert:%s' % (','.join(records_insert)))
def process_target_bak(q, l, first_task, cli_args, multi_log_lock):
    """(Backup variant) Worker entry point that is handed a first task
    directly and then drains the queue under an explicit lock until empty.

    Args:
        q: multiprocessing.Queue of remaining task items.
        l: multiprocessing.Lock serializing queue access between workers.
        first_task: the task pre-assigned to this worker by the main process.
        cli_args: parsed command-line args (config / nullverbs / nullactors / outputs).
        multi_log_lock: multiprocessing.Lock guarding the shared log file.
    """
    # The subprocess first loads everything it needs to run.
    utilities.init_logger()
    logger = logging.getLogger('petr_log')
    PETRglobals.RunTimeString = time.asctime()

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        # FIX: this log line previously said "null verbs mode" -- a copy-paste
        # slip from the branch above; it now matches the print() above it.
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    print('\n\n')

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    # One database session per worker process.
    session = Session()

    # Complete the pre-assigned first task before touching the shared queue.
    write_multiprocess_log(
        multi_log_lock,
        '{}Process {}: {}'.format(u'', os.getpid(), first_task))
    process_task(first_task, out, multi_log_lock, session)

    while l.acquire():
        # Queue not empty -- empty() is unreliable, so qsize() is used.
        if q.qsize() != 0:
            # Take the next task from the queue.
            task = q.get()
            # Release the lock as soon as the task is in hand.
            l.release()
            # Complete the task just obtained.
            write_multiprocess_log(
                multi_log_lock,
                '{}Process {}: {}'.format(u'', os.getpid(), task))
            process_task(task, out, multi_log_lock, session)
        # Queue empty -- this worker is done.
        else:
            # Release the lock.
            l.release()
            # Leave the loop.
            break

    write_multiprocess_log(
        multi_log_lock,
        '{}Process {}: {}'.format(u'', os.getpid(), u'exited...'))
def run_in_background(cli_args):
    """Main daemon loop.

    Repeatedly reads batches of stories from Solr into a shared queue and
    keeps a pool of up to MAX_SUBPROCESSES worker subprocesses alive to
    consume it, throttling reads while the queue backlog is large.
    """
    # Required multiprocess settings.
    cfg = read_key_value_file(MULTI_PROCESS_CONFIG, '=')
    max_subprocesses = int(cfg['MAX_SUBPROCESSES'])
    max_stories_to_read = int(cfg['MAX_STORIES_TO_READ'])
    seconds_between_reads = int(cfg['SECONDS_BETWEEN_READS'])
    queue_size_under_control = int(cfg['QUEUE_SIZE_UNDER_CONTROL'])
    wait_for_consume = int(cfg['WAIT_FOR_CONSUME'])

    # The log directory must exist before the first log write below.
    if not os.path.isdir(MULTI_PROCESS_LOG_DIR):
        os.mkdir(MULTI_PROCESS_LOG_DIR)
    # Lock serializing log writes across processes.
    multi_log_lock = Lock()
    write_multiprocess_log(multi_log_lock,
                           u'Main process started successfully.')

    subprocesses = {}   # pid -> Process for every live worker
    queue = Queue()     # task queue shared with the workers

    while True:
        # Let the workers drain the queue before reading more from Solr.
        while queue.qsize() >= queue_size_under_control:
            time.sleep(wait_for_consume)

        # Pull the next batch of stories from Solr.
        batch = access_solr.read_stories(max_stories_to_read)
        if batch is None:
            print("Solr connection error!")
            write_multiprocess_log(multi_log_lock, u'Solr connection error!')
            time.sleep(seconds_between_reads)
            continue
        if len(batch) == 0:
            time.sleep(seconds_between_reads)
            continue

        # Record how many tasks were read.
        write_multiprocess_log(
            multi_log_lock,
            '{}Main process read {} tasks from solr.'.format(
                u'', len(batch)))
        # Produce the items for the workers.
        for item in batch:
            # these two lines should be removed
            item['content'] = item['content'].replace(u'’', u"'")
            item['content'] = item['content'].replace(u'”', u'"')
            queue.put(item)

        # Nothing queued: sleep and re-read (empty() is unreliable; qsize()
        # is used instead).
        if queue.qsize() == 0:
            time.sleep(seconds_between_reads)
            continue

        # Drop bookkeeping for workers that have died.
        dead_pids = [pid for pid, proc in subprocesses.items()
                     if not proc.is_alive()]
        for pid in dead_pids:
            subprocesses.pop(pid)

        # Spawn as few new workers as the backlog actually requires.
        fresh = []
        backlog = queue.qsize()
        if len(subprocesses) < max_subprocesses:
            room = max_subprocesses - len(subprocesses)
            for _ in range(min(backlog, room)):
                fresh.append(Process(target=process_target,
                                     args=(queue, cli_args, multi_log_lock)))
        for proc in fresh:
            proc.start()
            subprocesses[proc.pid] = proc
def run_in_background_bak(cli_args):
    """(Backup variant) One-shot batch runner.

    Each cycle: read one batch of stories from Solr into a fresh queue,
    hand every worker an initial task, start up to MAX_SUBPROCESSES
    workers, wait KILL_AFTER_SECONDS, then force-kill any worker still
    alive via Windows taskkill. Superseded by run_in_background().
    """
    # Read the settings required for multiprocess operation.
    multiprocess_config = read_key_value_file(MULTI_PROCESS_CONFIG, '=')
    max_subprocesses = int(multiprocess_config['MAX_SUBPROCESSES'])
    max_stories_to_read = int(multiprocess_config['MAX_STORIES_TO_READ'])
    kill_after_seconds = int(multiprocess_config['KILL_AFTER_SECONDS'])
    seconds_between_reads = int(multiprocess_config['SECONDS_BETWEEN_READS'])
    # Directory for the multiprocess log files; must exist before the first
    # write_multiprocess_log call below.
    if not os.path.isdir(MULTI_PROCESS_LOG_DIR):
        os.mkdir(MULTI_PROCESS_LOG_DIR)
    # Lock serializing simultaneous log writes across processes.
    multi_log_lock = Lock()
    write_multiprocess_log(multi_log_lock,
                           u'Main process started successfully.')
    # Used while debugging to limit the number of input-read cycles.
    # count = 0
    while True:
        # if count == 1:
        #     continue
        # A fresh queue and lock for each batch of stories.
        q = Queue()
        l = Lock()
        # Read the input from the database (Solr).
        tmp_list = access_solr.read_stories(max_stories_to_read)
        if tmp_list is None:
            print("Solr connection error!")
            write_multiprocess_log(multi_log_lock, u'Solr connection error!')
            time.sleep(seconds_between_reads)
            continue
        for item in tmp_list:
            # Normalize curly quotes in the content before queueing.
            item['content'] = item['content'].replace(u'’', u"'")
            item['content'] = item['content'].replace(u'”', u'"')
            q.put(item)
        # No input: sleep, then try the next read (empty() is unreliable,
        # so qsize() is used instead).
        if q.qsize() == 0:
            time.sleep(seconds_between_reads)
            continue
        # Create a suitable number of subprocesses for the actual queue size.
        create_size = q.qsize(
        ) if q.qsize() < max_subprocesses else max_subprocesses
        processes = []
        for i in range(create_size):
            # Make sure every subprocess gets at least one task up front.
            first_task = q.get()
            # NOTE(review): process_target takes (queue, cli_args,
            # multi_log_lock), but this 5-element args tuple matches
            # process_target_bak's signature -- confirm the intended target.
            p = Process(target=process_target,
                        args=(q, l, first_task, cli_args, multi_log_lock))
            processes.append(p)
        for p in processes:
            p.start()
        write_multiprocess_log(multi_log_lock,
                               "All subprocesses have started.")
        pids = []
        for p in processes:
            pids.append((p, p.pid))
        # Give the workers a fixed time budget, then reap the stragglers.
        time.sleep(kill_after_seconds)
        for p, pid in pids:
            if p.is_alive():
                # Windows-specific force kill via taskkill; os.popen rarely
                # raises here, so the else branch is the common path.
                try:
                    os.popen('taskkill.exe /pid:' + str(pid) + ' /f')
                except Exception:
                    # print("Killing process " + str(pid) + " failed!")
                    write_multiprocess_log(
                        multi_log_lock,
                        u"Killing process " + unicode(pid) + u" failed!")
                else:
                    # print("Killing process " + str(pid) + " successfully.")
                    write_multiprocess_log(
                        multi_log_lock,
                        u"Killing process " + unicode(pid) + u" successfully.")