Example 1
def process_target(queue, cli_args, multi_log_lock):
    # Log the subprocess start-up message
    write_multiprocess_log(
        multi_log_lock, u'Process {}: {}'.format(os.getpid(), u'started.'))

    # The subprocess first loads all the information it needs to run
    utilities.init_logger()
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    print('\n\n')

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    # Create a session for talking to the database
    session = Session()

    while True:
        if queue.qsize() > 0:
            # Fetch one task from the queue
            task = queue.get()
            # Log that a task was received
            write_multiprocess_log(
                multi_log_lock,
                u'Process {} got one task: {}'.format(os.getpid(), task))
            # Execute the task
            process_task(task, out, multi_log_lock, session)
        else:
            time.sleep(0.5 * random.random())
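
A minimal parent-side sketch of how these workers might be launched (the spawn_workers helper is hypothetical; process_target is the function above):

from multiprocessing import Process, Queue, Lock

def spawn_workers(cli_args, num_workers=4):
    # One shared task queue and one shared log lock for all workers
    queue = Queue()
    multi_log_lock = Lock()
    workers = [
        Process(target=process_target, args=(queue, cli_args, multi_log_lock))
        for _ in range(num_workers)
    ]
    for proc in workers:
        proc.start()
    return queue, workers
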
Example 2
def do_write_db_multiprocess(session, obj_list, multi_log_lock):
    """write objects to database"""

    process_id = os.getpid()

    error_msg_ignore = ""
    error_msg_replace = ""
    flag = True

    # Adding and Updating Objects
    session.add_all(obj_list)

    try:
        # Commit the transaction
        session.commit()
    except Exception as e:
        session.rollback()
        flag = False
        # Python 2: decode the byte-string error message into unicode
        error_msg_ignore = '{}'.format(e).decode("utf-8", errors="ignore")
        error_msg_replace = '{}'.format(e).decode("utf-8", errors="replace")
        print(error_msg_ignore)
        write_multiprocess_log(
            multi_log_lock,
            u'Process ' + unicode(process_id) + u': ' + error_msg_replace)
    else:
        # The commit succeeded
        write_multiprocess_log(
            multi_log_lock,
            'Process {}: {}'.format(process_id,
                                    u"Write to database successfully."))
    finally:
        # Close the Session
        if session:
            session.close()

    return flag
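
Examples 2 and 3 take a SQLAlchemy session created by a Session factory; a plausible setup for that factory (the engine URL is a hypothetical placeholder):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Hypothetical connection URL; the real project would point at its own database
engine = create_engine('sqlite:///events.db')
Session = sessionmaker(bind=engine)
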
Example 3
def write_events(updated_events,
                 multi_log_lock,
                 session,
                 multi_process_flag=False):
    """
    write events to database
    parameter updated_events is a dictionary that contains all coded information and original information
    """

    records_insert = []
    for story_id in updated_events:
        # story's meta data
        story_meta = updated_events[story_id]["meta"]
        story_title = story_meta["title"]
        story_date = story_meta["date"]
        story_source = story_meta["source"]
        story_url = story_meta["url"]

        # all sentences in a story
        sents = updated_events[story_id]["sents"]

        # If the story was discarded during do_coding, generate no database
        # records for it and move on to the next story
        if sents is None:
            continue

        # sent_no is the index of the current sentence within the story
        for sent_no in sents:

            # the current sentence
            sent = sents[sent_no]

            # if a sentence has no events, skip it
            if "events" not in sent:
                continue

            # the sentence's original content
            sent_content = sent['content']
            # all events extracted from the sentence
            sent_events = sent['events']

            # event_no is the index of the current event within the sentence;
            # event is the current event itself
            for event_no, event in enumerate(sent_events):
                # generate one record to be inserted into the database
                one_record = generate_one_record(sent, event, story_id,
                                                 sent_no, event_no,
                                                 sent_content)
                if one_record:
                    records_insert.append(one_record)

    # write to db
    if records_insert:
        if multi_process_flag:
            flag = do_write_db_multiprocess(session, records_insert,
                                            multi_log_lock)
            if flag:
                for story_id in updated_events:
                    result = write_to_solr(story_id)
                    if result is False:
                        write_multiprocess_log(
                            multi_log_lock,
                            u'Process ' + unicode(os.getpid()) + u': ' +
                            u'Something went wrong in the solr write (1), ' +
                            u'please check the log. story_id: %s' % story_id)
        else:
            flag = do_write_db(records_insert)
            if flag:
                for story_id in updated_events:
                    result = write_to_solr(story_id)
                    if result is False:
                        logging.warning(
                            'Something went wrong in the solr write (2), '
                            'please check the log. story_id: %s' % story_id)
    else:
        # Nothing to insert; still notify solr for every story
        for story_id in updated_events:
            result = write_to_solr(story_id)
            if result is False:
                write_multiprocess_log(
                    multi_log_lock,
                    u'Process ' + unicode(os.getpid()) + u': ' +
                    u'Something went wrong in the solr write (3), ' +
                    u'please check the log. story_id: %s' % story_id)
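
For reference, a sketch of the updated_events structure that write_events expects, reconstructed from the field accesses above; the concrete values and the event-tuple shape are illustrative assumptions:

updated_events = {
    'story_001': {
        'meta': {
            'title': 'Example headline',
            'date': '20240101',
            'source': 'EXAMPLE_WIRE',
            'url': 'http://example.com/story_001',
        },
        'sents': {
            '0': {
                'content': 'The first sentence of the story.',
                # Assumed shape: (source actor, target actor, event code)
                'events': [('GOV', 'REB', '190')],
            },
        },
    },
}
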
Example 4
def process_target_bak(q, l, first_task, cli_args, multi_log_lock):

    # The subprocess first loads all the information it needs to run
    utilities.init_logger()
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    print('\n\n')

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    # Create a session for talking to the database
    session = Session()

    # The subprocess handles its pre-assigned first task before the loop
    write_multiprocess_log(
        multi_log_lock, u'Process {}: {}'.format(os.getpid(), first_task))
    process_task(first_task, out, multi_log_lock, session)

    # acquire() blocks until the lock is held and then returns True
    while l.acquire():
        # Queue is not empty; empty() is unreliable, so use qsize()
        if q.qsize() != 0:
            # Fetch the next task from the queue
            task = q.get()
            # Release the lock once the task has been fetched
            l.release()
            # Execute the fetched task
            write_multiprocess_log(
                multi_log_lock,
                u'Process {}: {}'.format(os.getpid(), task))
            process_task(task, out, multi_log_lock, session)
        # Queue is empty
        else:
            # Release the lock
            l.release()
            # Exit the loop
            break

    write_multiprocess_log(
        multi_log_lock, u'Process {}: {}'.format(os.getpid(), u'exited...'))
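
A sketch of an alternative consume loop (not the author's code): relying on Queue.get with a timeout instead of guarding qsize() with a Lock sidesteps the check-then-get race between workers entirely:

from Queue import Empty  # Python 2; on Python 3: from queue import Empty

def consume(q, handle_task):
    while True:
        try:
            # Block for up to one second waiting for a task
            task = q.get(timeout=1)
        except Empty:
            # Nothing arrived within the timeout; treat the queue as drained
            break
        handle_task(task)
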
Example 5
def run_in_background(cli_args):
    # Read the parameters required for multiprocess operation
    multiprocess_config = read_key_value_file(MULTI_PROCESS_CONFIG, '=')
    max_subprocesses = int(multiprocess_config['MAX_SUBPROCESSES'])
    max_stories_to_read = int(multiprocess_config['MAX_STORIES_TO_READ'])
    seconds_between_reads = int(multiprocess_config['SECONDS_BETWEEN_READS'])
    queue_size_under_control = int(
        multiprocess_config['QUEUE_SIZE_UNDER_CONTROL'])
    wait_for_consume = int(multiprocess_config['WAIT_FOR_CONSUME'])

    # Create the directory for the multiprocess log files
    if not os.path.isdir(MULTI_PROCESS_LOG_DIR):
        os.mkdir(MULTI_PROCESS_LOG_DIR)

    # Lock serializing the multiprocess log
    multi_log_lock = Lock()
    # Log the main-process start-up message; this must happen after the
    # log directory has been created
    write_multiprocess_log(multi_log_lock,
                           u'Main process started successfully.')

    # dict containing all subprocesses
    subprocesses = {}
    # queue shared between processes
    queue = Queue()

    # For debugging: limits how many times input is read
    # count = 0

    while True:
        # if count == 1:
        #     continue

        # wait for subprocesses to consume the queue before reading Solr
        while queue.qsize() >= queue_size_under_control:
            time.sleep(wait_for_consume)

        # Read input from Solr
        tmp_list = access_solr.read_stories(max_stories_to_read)
        if tmp_list is None:
            print("Solr connection error!")
            write_multiprocess_log(multi_log_lock, u'Solr connection error!')
            time.sleep(seconds_between_reads)
            continue
        elif len(tmp_list) == 0:
            time.sleep(seconds_between_reads)
            continue
        else:
            # Record how many tasks were read
            write_multiprocess_log(
                multi_log_lock,
                u'Main process read {} tasks from solr.'.format(
                    len(tmp_list)))

        # produce items
        for item in tmp_list:
            # these two lines should be removed
            item['content'] = item['content'].replace(u'’', u"'")
            item['content'] = item['content'].replace(u'”', u'"')

            queue.put(item)

        # No input: go to the next iteration; empty() is unreliable, so use qsize()
        if queue.qsize() == 0:
            time.sleep(seconds_between_reads)
            continue

        # check if some processes have died
        terminated_procs_pids = []
        for pid, proc in subprocesses.items():
            if not proc.is_alive():
                terminated_procs_pids.append(pid)
        # delete these from the subprocesses dict
        for terminated_proc in terminated_procs_pids:
            subprocesses.pop(terminated_proc)

        # Spawn as few new subprocesses as the backlog actually requires
        new_processes = []
        queue_size = queue.qsize()
        if len(subprocesses) < max_subprocesses:
            allow_num = max_subprocesses - len(subprocesses)
            create_num = min(queue_size, allow_num)
            for i in range(create_num):
                proc = Process(target=process_target,
                               args=(queue, cli_args, multi_log_lock))
                new_processes.append(proc)
            for proc in new_processes:
                proc.start()
                subprocesses[proc.pid] = proc
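
Both run_in_background variants read their tuning parameters from a key=value file; a plausible MULTI_PROCESS_CONFIG (the values are illustrative, and read_key_value_file is assumed to split each line on '='):

MAX_SUBPROCESSES=4
MAX_STORIES_TO_READ=100
SECONDS_BETWEEN_READS=30
QUEUE_SIZE_UNDER_CONTROL=200
WAIT_FOR_CONSUME=5
KILL_AFTER_SECONDS=300
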
Example 6
def run_in_background_bak(cli_args):

    # Read the parameters required for multiprocess operation
    multiprocess_config = read_key_value_file(MULTI_PROCESS_CONFIG, '=')
    max_subprocesses = int(multiprocess_config['MAX_SUBPROCESSES'])
    max_stories_to_read = int(multiprocess_config['MAX_STORIES_TO_READ'])
    kill_after_seconds = int(multiprocess_config['KILL_AFTER_SECONDS'])
    seconds_between_reads = int(multiprocess_config['SECONDS_BETWEEN_READS'])

    # Directory for the multiprocess log files
    if not os.path.isdir(MULTI_PROCESS_LOG_DIR):
        os.mkdir(MULTI_PROCESS_LOG_DIR)

    # Lock for logging concurrently from multiple processes
    multi_log_lock = Lock()

    write_multiprocess_log(multi_log_lock,
                           u'Main process started successfully.')

    # For debugging: limits how many times input is read
    # count = 0

    while True:
        # if count == 1:
        #     continue
        q = Queue()
        l = Lock()
        # Read input from Solr
        tmp_list = access_solr.read_stories(max_stories_to_read)
        if tmp_list is None:
            print("Solr connection error!")
            write_multiprocess_log(multi_log_lock, u'Solr connection error!')
            time.sleep(seconds_between_reads)
            continue

        for item in tmp_list:
            item['content'] = item['content'].replace(u'’', u"'")
            item['content'] = item['content'].replace(u'”', u'"')
            q.put(item)
        # No input: read again next iteration; empty() is unreliable, so use qsize()
        if q.qsize() == 0:
            time.sleep(seconds_between_reads)
            continue
        # Create a suitable number of subprocesses based on the queue size
        create_size = min(q.qsize(), max_subprocesses)
        processes = []
        for i in range(create_size):
            # Ensure every subprocess gets at least one task
            first_task = q.get()
            p = Process(target=process_target_bak,
                        args=(q, l, first_task, cli_args, multi_log_lock))
            processes.append(p)
        for p in processes:
            p.start()
        write_multiprocess_log(multi_log_lock,
                               "All subprocesses have started.")

        pids = []
        for p in processes:
            pids.append((p, p.pid))

    time.sleep(kill_after_seconds)
    for p, pid in pids:
        if p.is_alive():
            try:
                # Windows-only; taskkill expects "/PID <pid> /F". Note that
                # os.popen does not raise if taskkill itself fails.
                os.popen('taskkill /F /PID ' + str(pid))
            except Exception:
                write_multiprocess_log(
                    multi_log_lock,
                    u'Killing process ' + unicode(pid) + u' failed!')
            else:
                write_multiprocess_log(
                    multi_log_lock,
                    u'Killed process ' + unicode(pid) + u' successfully.')