def get_parents(node_id):
    """Return the active ancestor chain of *node_id*, ordered root-first.

    Returns an empty list when *node_id* is falsy or no such node exists.
    """
    if not node_id:
        return []

    node = db_session.query(Node).get(node_id)
    if not node:
        # Bug fix: .get() returns None for unknown ids; the original then
        # crashed with AttributeError on node.path.
        return []
    return (db_session.query(Node)
            .filter(Node.id.in_(node.path), Node.active.is_(True))
            .order_by(Node.path_length)
            .all())
def incoherent_nodes():
    """Interactively review node utterances flagged as incoherent.

    Lists unhandled 'incoherent' NodeUtteranceStatus rows (most-flagged
    first), prints each node's conversation history, and prompts the
    operator to inactivate the node.  Every reviewed row is marked
    handled afterwards.  'q' quits the program.
    """
    # Unhandled incoherent flags, grouped per node utterance, most flags first.
    node_utterance_ids = (db_session.query(
        NodeUtteranceStatus.node_utterance_id,
        func.count(NodeUtteranceStatus.node_utterance_id),
    ).filter(
        NodeUtteranceStatus.status == "incoherent",
        NodeUtteranceStatus.handled.is_(False),
    ).group_by(NodeUtteranceStatus.node_utterance_id).order_by(
        func.count(NodeUtteranceStatus.node_utterance_id).desc()).all())
    for node_utterance_id, count in node_utterance_ids:
        node_utterance = db_session.query(NodeUtterance).get(node_utterance_id)
        # Ancestor chain of the flagged node, root first.
        history = (db_session.query(Node).filter(
            Node.id.in_(node_utterance.node.path)).order_by(
                Node.path_length.asc()).all())
        print("+----------------------------------+")
        print("node utterance id:", node_utterance_id)
        print("node id:", node_utterance.node.id)
        print("count:", count)
        print("\n")

        # Print the dialogue leading up to the flagged utterance.
        for h in history[:-1]:
            print([x.utterance_text for x in h.utterances])
        print(node_utterance.utterance.utterance_text)
        print("\n")
        user_input = input("Inactivate? Y/n/q ")
        if user_input.lower() == "q":
            exit()
        elif user_input == "" or user_input.lower() == "y":
            # Default (empty input) is "yes": inactivate the whole subtree.
            inactivate_node(node_utterance.node.id)

        # Mark every incoherent flag on this node utterance handled, even
        # when the operator answered "n".
        db_session.query(NodeUtteranceStatus).filter(
            NodeUtteranceStatus.node_utterance_id == node_utterance_id,
            NodeUtteranceStatus.status == "incoherent",
        ).update({"handled": True})
        db_session.commit()
# Exemple #3
# 0
def submit(external_worker_id, task_id):
    """Record that *external_worker_id* completed training task *task_id*.

    Creates the Worker and Training rows on first contact.  The task is
    only recorded when it is the lowest-id task the worker has not yet
    performed, so tasks must be completed in order.

    Returns True when every task in TASK_IDS has been completed, else
    False.  Raises KeyError for an unknown *task_id*.
    """
    if task_id not in TASK_IDS:
        raise KeyError('Task id not recognized')

    worker = db_session.query(Worker) \
        .filter_by(external_worker_id=external_worker_id) \
        .first()
    if not worker:
        worker = Worker(external_worker_id=external_worker_id)
        db_session.add(worker)
        db_session.commit()

    training = db_session.query(Training).filter_by(worker=worker).first()
    if not training:
        training = Training(worker=worker)
        db_session.add(training)
        db_session.commit()

    remaining = set(TASK_IDS) - set(training.tasks)
    # Bug fix: the original called min() on this set unconditionally, which
    # raises ValueError once the worker has already finished every task.
    if remaining and task_id == min(remaining):
        training.tasks = training.tasks + [task_id]
        db_session.commit()
        remaining = set(TASK_IDS) - set(training.tasks)

    # Done when no tasks remain.
    return not remaining
def set_parent(node_id, parent_node_id=None, commit=False):
    """Re-parent *node_id* under *parent_node_id* (or make it a root).

    Recalculates the node's materialized path and — when it was moved away
    from an existing parent — the paths of all its descendants.  Raises
    when the node does not exist.  Only commits when *commit* is true.
    """
    node = db_session.query(Node).get(node_id)
    if not node:
        raise Exception("Could not find node")
    if parent_node_id:
        parent_node = db_session.query(Node).get(parent_node_id)
    else:
        parent_node = None

    old_parent_id = None
    if node.parent:
        old_parent_id = node.parent.id

    node.parent = parent_node
    node.path = node.recalculate_path()

    # Recursively refresh the materialized paths of the whole subtree.
    def update_children(child_node):
        db_session.add(child_node)
        child_node.path = child_node.recalculate_path()
        for child in child_node.children[:]:
            logger.debug("found child %s", child.id)
            logger.debug("parent has children %s", child_node.children)
            update_children(child)

    # Only needed when the node actually moved away from an old parent.
    if old_parent_id and node.children:
        update_children(node)

    if commit:
        db_session.commit()
        logger.info("committing!")
    def prepare_from_db():
        """Load the active-graph lookup structures from the database.

        Returns (lookup_table, node_utts, id_utt, node_visit_counts,
        linked_nodes): children per parent id, utterance ids per node id,
        utterance text per utterance id, visit counts per node id, and a
        from-id -> to-id map of linked nodes.
        """
        active_nodes = (
            db_session.query(Node)
            .options(joinedload(Node.utterances))
            .filter(
                Node.active.is_(True),
                or_(
                    Node.visited_count > 1,
                    Node.child_count > 0,
                    Node.is_user.is_(False),
                ),
            )
            .all()
        )

        lookup_table = defaultdict(list)
        node_utts = defaultdict(list)
        node_visit_counts = {}
        id_utt = {}

        for current in active_nodes:
            lookup_table[current.parent_id].append(current.id)
            node_visit_counts[current.id] = current.visited_count
            for utt in current.utterances:
                node_utts[current.id].append(utt.id)
                id_utt[utt.id] = {"text": utt.utterance_text}

        link_rows = db_session.query(
            LinkedNodes.linked_from_node_id,
            LinkedNodes.linked_to_node_id,
        ).all()
        linked_nodes = dict(link_rows)

        return lookup_table, node_utts, id_utt, node_visit_counts, linked_nodes
# Exemple #6
# 0
def get_next_training_for_worker(external_worker_id):
    """Return the next training-task payload for a worker.

    Returns None when the worker has no training record (or no completed
    tasks yet), '__DONE__' once every task in TASK_IDS is finished, and
    otherwise a dict describing the lowest-id task not yet performed.
    """
    worker = (db_session.query(Worker)
              .filter_by(external_worker_id=external_worker_id)
              .first())
    training = db_session.query(Training).filter_by(worker=worker).first()

    if not training or not training.tasks:
        return None

    remaining = set(TASK_IDS) - set(training.tasks)
    if not remaining:
        return '__DONE__'

    next_task_id = min(remaining)
    task = [entry for entry in TRAINING_DATA if entry['id'] == next_task_id][0]

    history = [
        NodeUtterance(utterance=Utterance(utterance_text=turn['text']))
        for turn in task['history']
    ]
    return {
        'id': task['id'],
        'history': history,
        'replies': task['replies'],
        'description': task['description'],
    }
def get_node_utterances(node_id):
    """Yield the text of every utterance attached to node *node_id*."""
    links = (db_session.query(NodeUtterance)
             .filter(NodeUtterance.node_id == node_id)
             .all())
    for link in links:
        matches = (db_session.query(Utterance)
                   .filter(Utterance.id == link.utterance_id)
                   .all())
        for match in matches:
            yield match.utterance_text
def activate_node(node_id):
    """Activate *node_id* and its whole subtree, then commit.

    No-op (beyond the lookup) when no such node exists.
    """
    node = db_session.query(Node).get(node_id)
    if not node:
        # Bug fix: .get() returns None for unknown ids; the original
        # crashed with AttributeError here.
        return
    node.active = True
    # NOTE(review): descendant_of presumably also matches the node's own
    # path — confirm; the assignment above covers it either way.
    descendants = db_session.query(Node).filter(Node._path.descendant_of(
        node._path)).all()
    # Renamed the loop variable: the original shadowed `node`.
    for descendant in descendants:
        descendant.active = True
    db_session.commit()
def add_yes_no_nodes():
    """Ensure the system nodes listed in nodeIDs_final.txt have yes/no children.

    Reads node ids from nodeIDs_final.txt, skips nodes that already have a
    child whose utterances read as a "yes" (or "no") answer, creates the
    missing "yes"/"no" children, and records the new ids in
    added_node_ids.txt for future reference.
    """
    # Bug fix: the file handle was never closed; use a context manager.
    with open("nodeIDs_final.txt", "r") as text_file:
        node_ids = text_file.read().split("\n")[:-1]

    nodes = (db_session.query(Node).filter(Node.is_user == False,
                                           Node.id.in_(map(int,
                                                           node_ids))).all())

    # Compile the patterns once instead of on every utterance.
    yes_re = re.compile(
        r".*\b(ye(s\b|ah\b|a\b|p\b)|of course|I do\b|absolutely|definitely|ok\b|ok(ay|ey)\b|sounds (good|great)|fine|sure|why not|don't mind|let's do that).*"
    )
    # Bug fix: the original pattern contained "o\bo\b||ot\b" — the empty
    # "||" alternative made the group match any bare "n" (flagging nearly
    # every utterance with a word starting with "n" as a "no" answer), and
    # "o\bo\b" could never match at all.
    no_re = re.compile(
        r".*\b(n(o\b|ot\b|ope\b|a\b|ah\b)|sounds bad|I'm good).*")

    # Filter out nodes that already have yes/no children
    nodes_add_yes = []
    nodes_add_no = []
    for node in nodes:
        add_yes, add_no = True, True
        children = db_session.query(Node).filter(
            Node.parent_id == node.id).all()
        for child in children:
            for utterance in get_node_utterances(child.id):
                if yes_re.match(utterance):
                    add_yes = False
                if no_re.match(utterance):
                    add_no = False
        if add_yes:
            nodes_add_yes.append(node)
        if add_no:
            nodes_add_no.append(node)

    # Add yes/no nodes and write file with new ids for future reference
    added_node_ids = ""
    for node in nodes_add_yes:
        new_node = create_new_node(
            "yes",
            source="yes_no_population",
            parent_id=node,
            species="yes",
            commit=False,
        )
        added_node_ids += (str(new_node.id) + "\t" +
                           list(get_node_utterances(node.id))[0] + "\tyes\n")
    for node in nodes_add_no:
        new_node = create_new_node("no",
                                   source="yes_no_population",
                                   parent_id=node,
                                   species="no",
                                   commit=False)
        added_node_ids += (str(new_node.id) + "\t" +
                           list(get_node_utterances(node.id))[0] + "\tno\n")
    with open("added_node_ids.txt", "w") as the_file:
        the_file.write(added_node_ids)
def merge_by_score():
    """Interactively merge candidate node pairs, highest similarity first.

    Lists PotentialNodeMerge pairs with no Merging decision yet (checked in
    both orientations), shows both nodes' utterances, and asks the operator
    whether to merge.  The node with more children absorbs the other.
    'q' quits, 'n' records a negative decision, Enter/'y' merges.
    """
    # Candidate pairs with no prior Merging row in either direction.
    nodes = (db_session.query(
        PotentialNodeMerge.left_node_id,
        PotentialNodeMerge.right_node_id,
        PotentialNodeMerge.score,
        Merging,
    ).outerjoin(
        Merging,
        ((PotentialNodeMerge.left_node_id == Merging.left_node_id)
         & (PotentialNodeMerge.right_node_id == Merging.right_node_id)
         | (PotentialNodeMerge.left_node_id == Merging.right_node_id)
         & (PotentialNodeMerge.right_node_id == Merging.left_node_id)),
    ).filter(Merging.id.is_(None)).order_by(
        PotentialNodeMerge.score.desc()).all())

    # Pairs already shown this session (both orientations) ...
    used_ids = []
    # ... and nodes already absorbed into another node this session.
    merged_right_nodes = []

    for left_node_id, right_node_id, score, _ in nodes:
        if (f"{left_node_id}-{right_node_id}" not in used_ids
                and left_node_id not in merged_right_nodes
                and right_node_id not in merged_right_nodes):
            used_ids.append(f"{left_node_id}-{right_node_id}")
            used_ids.append(f"{right_node_id}-{left_node_id}")
            print("+------------------+")
            left_node = db_session.query(Node).get(left_node_id)
            right_node = db_session.query(Node).get(right_node_id)
            # Only offer pairs where both nodes are still active and non-empty.
            if (left_node.active and right_node.active and left_node.utterances
                    and right_node.utterances):
                print(left_node_id,
                      [x.utterance_text for x in left_node.utterances])
                print("---------------- VS ----------------")
                print(right_node_id,
                      [x.utterance_text for x in right_node.utterances])

                print("\nscore", score, "\n")

                user_input = input("Merge? Y/n/q ")
                if user_input.lower() == "q":
                    exit()
                elif user_input.lower() == "n":
                    # Record the rejection so the pair is not offered again.
                    merge_nodes(left_node_id, right_node_id, merged=False)
                    print("nope!")
                elif user_input == "" or user_input.lower() == "y":
                    # Merge into whichever node has more children.
                    if right_node.child_count > left_node.child_count:
                        print(right_node.id, "<-", left_node.id)
                        merge_nodes(right_node.id, left_node.id, merged=True)
                        merged_right_nodes.append(left_node.id)
                    else:
                        print(left_node.id, "<-", right_node.id)
                        merge_nodes(left_node.id, right_node.id, merged=True)
                        merged_right_nodes.append(right_node.id)

                db_session.commit()
def inactivate_node(node_id):
    """Deactivate *node_id* and its whole subtree, remove its pending merge
    suggestions, and commit.

    No-op (beyond the lookup) when no such node exists.
    """
    node = db_session.query(Node).get(node_id)
    if not node:
        # Bug fix: .get() returns None for unknown ids; the original
        # crashed with AttributeError here.
        return
    node.active = False
    descendants = db_session.query(Node).filter(Node._path.descendant_of(
        node._path)).all()
    # Renamed the loop variable: the original shadowed `node`.
    for descendant in descendants:
        descendant.active = False
    # Drop any merge suggestions involving the now-inactive node.
    db_session.query(PotentialNodeMerge).filter(
        (PotentialNodeMerge.left_node_id == node_id)
        | (PotentialNodeMerge.right_node_id == node_id)).delete()
    db_session.commit()
def finish_job(ext_job_id, external_worker_id, answer, corrections, extra_questions,
               with_audio, used_text_input, assignment_id, hit_id):
    """Persist a completed crowd-sourcing job.

    Creates a new node for the worker's *answer* under the job's last node,
    links it to the worker/job/assignment, applies any text *corrections*
    as new 'correction' utterances, and records the statuses ticked for the
    reviewed *extra_questions*.  Commits once at the end.
    """
    job = get_job(ext_job_id)
    nodes = [x.node for x in job.node_utterances]

    # NOTE(review): last_node is None when the job has no node utterances;
    # last_node.id below would then raise — confirm jobs always carry at
    # least one node utterance.
    last_node = nodes[-1] if nodes else None
    worker = _create_or_get_worker(external_worker_id)

    node = create_new_node([answer], parent_id=last_node.id, source='typed')
    node_utterance = node.node_utterances[0]
    node_utterance.with_audio = with_audio
    node_utterance.used_text_input = used_text_input

    # Tie the new node utterance to the worker, job and MTurk assignment.
    node_utterance_worker_job = NodeUtteranceWorkerJob(
        node_utterance_id=node_utterance.id,
        worker_id=worker.id,
        job_id=job.id,
        assignment_id=assignment_id,
        hit_id=hit_id
    )
    db_session.add(node_utterance_worker_job)

    # Each correction adds a fresh utterance and marks the old one corrected.
    for old_node_utterance_id, corrected_text in corrections.items():
        old_node_utterance = db_session.query(NodeUtterance).get(old_node_utterance_id)
        add_utterance_to_node(corrected_text, old_node_utterance.node, 'correction')
        node_utterance_status = NodeUtteranceStatus(
            node_utterance_id=old_node_utterance.id,
            status='corrected'
        )
        db_session.add(node_utterance_status)

    for extra_question in extra_questions:
        if extra_question['type'] != 'api':
            extra_node_utterance = db_session.query(NodeUtterance).get(extra_question['id'])
            # Record every status flag the worker ticked for this question.
            for status in ['suitable', 'equivalent', 'needs_correction']:
                if extra_question[status]:
                    db_session.add(NodeUtteranceStatus(
                        node_utterance_id=extra_node_utterance.id,
                        referenced_node_utterance_id=node_utterance.id,
                        status=status
                    ))

        else:
            # 'api'-type questions are currently ignored.
            # extra_node_utterance = add_utterance_to_node(
            #     extra_question['text'], node, extra_question['id']
            # )
            pass


    # TODO: set positive scoring for worker and node_utterances
    db_session.commit()
# Exemple #13
# 0
def get_ratings(start_time, end_time):
    """Return Rating rows within [start_time, end_time], newest first.

    When *end_time* is falsy, the latest recorded Rating.start_time is
    used; when *start_time* is falsy, it defaults to midnight of the day
    before *end_time*.
    """
    if not end_time:
        latest = db_session.query(Rating.start_time).filter(
            Rating.start_time.isnot(None)).order_by(
                Rating.start_time.desc()).first()
        end_time, = latest

    if not start_time:
        day_before = end_time - datetime.timedelta(days=1)
        start_time = day_before.replace(hour=0, minute=0)

    query = (db_session.query(Rating)
             .filter(Rating.start_time >= start_time,
                     Rating.start_time <= end_time)
             .order_by(Rating.start_time.desc()))
    return query.all()
def linked_nodes(linked_to_node_id, linked_from_node_id):
    """Create a LinkedNodes row between the two node ids, if missing.

    Does nothing further when the link already exists; otherwise adds the
    link (only when both nodes exist) and commits.
    """
    linked_to_node = db_session.query(Node).get(linked_to_node_id)
    linked_from_node = db_session.query(Node).get(linked_from_node_id)
    # Bug fix: the original dereferenced the nodes (.id) before the
    # existence check below, so a missing node crashed with AttributeError
    # before that check was ever reached.
    if linked_to_node and linked_from_node:
        link = (db_session.query(LinkedNodes).filter(
            LinkedNodes.linked_to_node_id == linked_to_node.id,
            LinkedNodes.linked_from_node_id == linked_from_node.id,
        ).first())
        if link:
            return
        db_session.add(
            LinkedNodes(
                linked_to_node_id=linked_to_node_id,
                linked_from_node_id=linked_from_node_id,
            ))
    db_session.commit()
def classify_root_nodes():
    """Interactively label active parentless nodes as root / not-root.

    Scores each unlabeled candidate with RootNodeClassifierMLP, prints the
    per-utterance scores, and asks the operator.  Answers are stored as
    RootNode rows (one per utterance); 'n' also deactivates the node, 'q'
    quits the program.
    """
    from fantom_util.models.rnc_mlp_model import RootNodeClassifierMLP

    # Active root-level nodes with no RootNode label yet, most visited first.
    nodes = (db_session.query(Node).outerjoin(
        RootNode, RootNode.node_id == Node.id).filter(
            Node.active.is_(True), Node.parent_id.is_(None),
            RootNode.id.is_(None)).order_by(Node.visited_count.desc()).all())

    rnc_mlp = RootNodeClassifierMLP()
    for node in nodes:
        print("+----------------------+")
        utterances = [x.utterance_text for x in node.utterances]
        score_results = rnc_mlp.predict_list(utterances)
        for utterance, score in zip(utterances, score_results):
            print(f"{utterance}: {score[0]}")
        print("\navg:", (sum(score_results) / len(score_results))[0], "\n")

        user_input = input("Root node? Y/n/q ")
        if user_input.lower() == "q":
            exit()
        elif user_input.lower() == "n":
            # Not a root node: deactivate and store a negative label per
            # utterance.
            node.active = False
            for utterance in utterances:
                db_session.add(
                    RootNode(node_id=node.id,
                             utterance=utterance,
                             is_root_node=False))
        elif user_input == "" or user_input.lower() == "y":
            # Default (Enter) counts as "yes": positive label per utterance.
            for utterance in utterances:
                db_session.add(
                    RootNode(node_id=node.id,
                             utterance=utterance,
                             is_root_node=True))
        db_session.commit()
# Exemple #16
# 0
def nodes():
    """Sample up to 30 random active root trees and return their utterances.

    Returns a dict mapping 0..29 to the list of utterance texts of every
    node in the corresponding tree, in breadth-first order.
    """
    # Local import keeps the module's import block untouched.
    from collections import deque

    root_query = db_session.query(Node).filter(Node.parent_id.is_(None),
                                               Node.active.is_(True))
    roots = list(root_query)
    shuffle(roots)

    result = {}
    for index, root in enumerate(roots[:30]):
        texts = []
        # Breadth-first walk; deque.popleft() is O(1), replacing the
        # original's O(n) `stack = stack[1:]` slicing per step.
        queue = deque([root])
        while queue:
            current = queue.popleft()
            for utterance in current.utterances:
                texts.append(utterance.utterance_text)
            queue.extend(current.children)
        result[index] = texts
    return result
# Exemple #17
# 0
def _first_named_entity(utterance_text):
    """Return the first known named entity found in *utterance_text*, or None.

    Checks the nem 'movies', 'musicians' and 'bands' lists in that order;
    matching is a space-padded lowercase substring test.
    """
    for category in ("movies", "musicians", "bands"):
        for item in nem[category]:
            if f" {item.lower()} " in f" {utterance_text} ":
                return item
    return None


def named_entity_merge(node_id):
    """Return the childless children of *node_id* whose utterances mention a
    known named entity (movie, musician, or band).

    Each matching node is reported once; the triple break-flag pyramid of
    the original is replaced by the _first_named_entity helper.
    """
    nodes = (db_session.query(Node).options(joinedload(
        Node.utterances), joinedload(
            Node.node_utterances)).filter(Node.parent_id == node_id).all())
    to_merge = []
    for node in nodes:
        if node.children:
            continue
        for utterance in node.utterances:
            item = _first_named_entity(utterance.utterance_text)
            if item:
                print(f"found {item} in {utterance.utterance_text}")
                to_merge.append(node)
                break
    return to_merge
def check_for_worker_eligibilitiy_for_qualification():
    """Grant the 20-jobs MTurk qualification to workers who earned it.

    Scans mturk-sourced workers that lack the qualification flag; each one
    with more than 20 completed jobs is qualified via the MTurk API, has
    its flag set, and is committed individually.
    """
    candidates = db_session.query(Worker).filter(
        Worker.has_more_than_20_qualifaction.is_(False),
        Worker.source == 'mturk',
    ).all()
    for candidate in candidates:
        if candidate.job_counts > 20:
            mturk.qualify_worker_for_has_more_than_20_qualification(
                candidate.external_worker_id)
            candidate.has_more_than_20_qualifaction = True
            db_session.commit()
def populate(conversation_id=None, automate=False):
    """Process conversations into possible root nodes.

    Downloads possible_root_nodes.txt from S3, processes either the given
    *conversation_id* or (in batch/interactive mode) many conversations,
    then uploads the updated file back to S3.
    """
    root_node_utterances = _get_utterance_lookup_table(None)
    # file_name is a module global shared with the processing helpers.
    global file_name
    file_name = (
        FANTOM_WORKDIR + "/fantom_util/fantom_util/graph_tools/possible_root_nodes.txt"
    )
    # NOTE(review): os.system with string concatenation — the paths are
    # fixed constants here, but subprocess.run with an argument list would
    # be safer.
    os.system("aws s3 cp s3://SOME_AWS_BUCKET_URL/possible_root_nodes.txt " + file_name)
    if not conversation_id:
        conversation_ids = (
            db_session.query(Conversation.conversation_id)
            .filter(Conversation.conversation_id != None)
            .distinct(Conversation.conversation_id)
            .all()
        )
        if automate:
            # Batch mode: process every conversation without prompting.
            for conversation_id in tqdm(conversation_ids):
                process_conversation(conversation_id, root_node_utterances, automate)
        else:
            # Interactive mode: random conversations until the operator stops.
            stop = False
            while not stop:
                conversation_id = random.choice(conversation_ids)[0]
                process_conversation(conversation_id, root_node_utterances, automate)
                response = input("Continue? Y/n\n")
                if response.lower() == "n":
                    stop = True
    else:
        process_conversation(conversation_id, root_node_utterances, False)

    os.system(
        "aws s3 cp " + file_name + " s3://SOME_AWS_BUCKET_URL/possible_root_nodes.txt"
    )
def set_incoherent(ext_job_id, external_worker_id, incoherent_node_utterance_id, with_audio, assignment_id, hit_id):
    """Record that a worker flagged a node utterance as incoherent.

    Links the report to the worker/job/assignment for auditing and stores
    an 'incoherent' NodeUtteranceStatus row, then commits.
    """
    job = get_job(ext_job_id)
    worker = _create_or_get_worker(external_worker_id)

    incoherent_node_utterance = db_session.query(NodeUtterance).get(incoherent_node_utterance_id)

    # Tie the report to the worker, job and MTurk assignment.
    incoherent_node_utterance_worker_job = IncoherentNodeUtteranceWorkerJob(
        node_utterance_id=incoherent_node_utterance.id,
        worker_id=worker.id,
        job_id=job.id,
        assignment_id=assignment_id,
        hit_id=hit_id
    )
    db_session.add(incoherent_node_utterance_worker_job)

    node_utterance_status = NodeUtteranceStatus(
        with_audio=with_audio,
        node_utterance_id=incoherent_node_utterance.id,
        status='incoherent'
    )
    db_session.add(node_utterance_status)

    # TODO: set negative scoring for worker and node_utterances

    db_session.commit()
def main(synonym_path):
    """Merge the synonyms listed in *synonym_path* into the root nodes."""
    synonyms = get_synonym_objects(synonym_path)
    roots = (
        db_session.query(Node)
        .options(joinedload(Node.children), joinedload(Node.utterances))
        .filter(Node.parent_id.is_(None))
        .all()
    )
    merge_synonyms(roots, synonyms)
    db_session.commit()
def get_graph():
    """Render the active node graph as a Graphviz "digraph" string.

    Each node is printed with its id, visit count and utterances; node
    border thickness scales with its share of the maximum visit count.
    """
    nodes = db_session.query(Node).filter(Node.active.is_(True)).all()

    max_visits = max((node.visited_count for node in nodes), default=0)

    output = ['digraph {']
    # For each node
    for node in nodes:
        text = '<br/>'.join(['- {} (nu: {}, u: {})'.format(x.utterance.utterance_text.replace("'", "\\'").replace('"', '\\"'), x.id, x.utterance.id) for x in node.node_utterances])

        # black for user nodes, green otherwise (the original comment
        # claimed grey/black, which did not match the code)
        color = '#000000' if node.is_user else '#00ff00'

        # Bug fix: guard against max_visits == 0 (no nodes, or none ever
        # visited), which previously raised ZeroDivisionError.
        if max_visits:
            thickness = max(1, 4 * node.visited_count / max_visits)
        else:
            thickness = 1

        # print the node with it's properties
        output.append(f'{node.id}[label=<<b>{node.id}</b> ({node.visited_count})<br/>{text}>,color="{color}",penwidth={thickness}];')

        # if the root node is not this nodes parent
        if node.parent_id:
            # print it with a link to it's parent
            output.append(f'{node.parent_id} -> {node.id};')

    output.append('}')
    return ' '.join(output)
def remove_profane_utterances():
    """Report utterances matching EXCLUDED_UTTERANCES on active user nodes.

    NOTE(review): despite the name, this only prints the matching text and
    node ids — nothing is deleted here.
    """
    for utterance in tqdm(db_session.query(Utterance).all()):
        text = normalize_text(utterance.utterance_text)
        if not (text and re.search(EXCLUDED_UTTERANCES, text)):
            continue
        if not utterance.node_utterances:
            continue
        node_ids = [
            nu.node.id for nu in utterance.node_utterances
            if nu.node.is_user and nu.node.active
        ]
        if node_ids:
            print(text, node_ids)
def read_utterances_from_new_db():
    """Collect the utterances from every root-node tree.

    Returns (utterances, count).  NOTE(review): this name is redefined
    later in the module with different behavior; the later definition wins
    at import time.
    """
    roots = (
        db_session.query(Node)
        .options(joinedload(Node.children), joinedload(Node.utterances))
        .filter(Node.parent_id.is_(None))
        .all()
    )
    utterances = []
    for root in roots:
        utterances.extend(count_utterances(root))
    return utterances, len(utterances)
def _create_or_get_worker(external_worker_id, source=None):
    """Fetch the Worker for *external_worker_id*, creating and flushing a
    new one when none exists yet."""
    existing = (db_session.query(Worker)
                .filter_by(external_worker_id=external_worker_id)
                .first())
    if existing:
        return existing

    worker = Worker(external_worker_id=external_worker_id, source=source)
    db_session.add(worker)
    db_session.flush()
    return worker
def create_jobs(job_type, amount=1):
    """Create up to *amount* Job rows for the highest-scored eligible nodes.

    *job_type* must be 'user', 'system' or SPECIES_TAG.  For each candidate
    node a dialogue history of eligible node utterances is sampled; a job
    is only created when every history node contributed an utterance.
    Returns the list of created jobs and commits once at the end.
    """
    if job_type not in ['user', 'system', SPECIES_TAG]:
        # Bug fix: the original message contained a bare "{}" placeholder
        # that was never substituted with the offending value.
        raise Exception(
            f'work type: "{job_type}" does not exist. Use either system_task or user_task')

    # Default: visited leaf nodes; species jobs instead target species-
    # tagged nodes that still have room for more active children.
    job_filter = [Node.active_child_count == 0, Node.visited_count > 1]
    if job_type == SPECIES_TAG:
        job_filter = [Node.species == SPECIES_TAG, Node.active_child_count < 3]

    nodes = db_session.query(Node) \
        .filter(
            Node.score > 0,
            Node.is_user == (job_type != 'user'),
            Node.active.is_(True),
            *job_filter
        )\
        .order_by(Node.score.desc()) \
        .all()
    created_jobs = []
    for node in nodes:
        history_ids = node.path[-MAX_DIALOGUE_HISTORY:]
        history = db_session\
            .query(Node)\
            .filter(Node.id.in_(history_ids), Node.active.is_(True))\
            .options(joinedload(Node.utterances), joinedload(Node.node_utterances))\
            .order_by(Node.path_length.asc())\
            .all()
        history_length = len(history)
        # Skip nodes whose path contains inactive (filtered-out) nodes.
        if len(history_ids) != history_length:
            logger.warning(f'history_ids != history, {history_ids} != {history}')
            continue

        job_node_utterances = []

        # Pick one eligible node utterance per history position at random.
        for index, history_node in enumerate(history):
            pool_of_node_utterances = []
            for node_utterance in history_node.node_utterances:
                if _check_node_utterance_eligibility(node_utterance, index == history_length - 1, job_type):
                    pool_of_node_utterances.append(node_utterance)
            if pool_of_node_utterances:
                job_node_utterances.append(random.choice(pool_of_node_utterances))
        # Only create the job when every history node contributed.
        if len(history_ids) == len(job_node_utterances):
            job = Job(job_type=job_type, persona_sample=get_persona_sample())
            db_session.add(job)
            db_session.flush()
            for i, node_utterance in enumerate(job_node_utterances):
                db_session.add(JobNodeUtterance(job_id=job.id, node_utterance_id=node_utterance.id, position=i))

            created_jobs.append(job)

        if len(created_jobs) == amount:
            break
    db_session.commit()
    print(f'created {len(created_jobs)} jobs')
    return created_jobs
def find_duplicate_utterances():
    """De-duplicate Utterance rows that share the same text.

    For each duplicated text, keeps one candidate (the last duplicate that
    has node links, or the first row when none do) and deletes the rest —
    but only when at most one duplicate actually carries node links.
    Commits once at the end.
    """
    duplicate_utterances = db_session.query(Utterance.utterance_text).group_by(Utterance.utterance_text).having(func.count(Utterance.utterance_text) > 1).all()
    for utterance_text, in duplicate_utterances:
        utterances = db_session.query(Utterance).filter(Utterance.utterance_text == utterance_text).all()
        print(utterance_text)
        print([(x.id, len(x.node_utterances)) for x in utterances])
        # Count how many duplicates are linked to nodes; remember the last
        # linked one as the row to keep.
        nu_count = 0
        candidate_utterance = utterances[0]
        for utterance in utterances:
            if utterance.node_utterances:
                candidate_utterance = utterance
                nu_count += 1
        # Only safe to delete when 0 or 1 duplicates carry node links —
        # otherwise the links would have to be merged first.
        if nu_count == 0 or nu_count == 1:
            print('deleting..')
            for utterance in utterances:
                if utterance.id != candidate_utterance.id:
                    print('removing..', utterance.id)
                    db_session.delete(utterance)
        print('-------------------')
    db_session.commit()
def get_synonym_objects(synonym_path):
    """Return an Utterance row for every line in *synonym_path*.

    Lines are stripped; missing utterances are created and flushed so
    their ids are available to the caller.
    """
    synonym_objects = []
    with open(synonym_path, "r") as handle:
        for line in handle:
            text = line.strip()
            utterance = (db_session.query(Utterance)
                         .filter_by(utterance_text=text)
                         .first())
            if utterance is None:
                utterance = Utterance(utterance_text=text)
                db_session.add(utterance)
                db_session.flush()
            synonym_objects.append(utterance)
    return synonym_objects
def add_utterance_to_node(utterance_text, node, source):
    """Attach an utterance to *node* and tag the new link with *source*.

    *utterance_text* may be a plain string or an existing Utterance row;
    strings are looked up (or created and flushed) first.  Returns the
    NodeUtterance link row.
    """
    if isinstance(utterance_text, Utterance):
        utterance = utterance_text
    else:
        utterance = (db_session.query(Utterance)
                     .filter_by(utterance_text=utterance_text)
                     .first())
        if utterance is None:
            utterance = Utterance(utterance_text=utterance_text)
            db_session.add(utterance)
            db_session.flush()

    node.utterances.append(utterance)
    db_session.flush()

    # Fetch the association row the append just created so we can tag it.
    node_utterance = (db_session.query(NodeUtterance)
                      .filter_by(node_id=node.id)
                      .filter_by(utterance_id=utterance.id)
                      .first())
    node_utterance.source = source
    db_session.flush()
    return node_utterance
def read_utterances_from_new_db():
    """Collect per-utterance feature data from every root-node tree.

    Sets the module-level `features` and `fe` feature-extractor globals and
    returns (utterance_data, count).  NOTE(review): this redefines the
    earlier function of the same name in this module.
    """
    global features
    global fe
    features = gen_feature_dict(sp.TOPIC, sp.SENTIMENT)
    fe = feature_extractor.FeatureExtractor(features)

    roots = (
        db_session.query(Node)
        .options(joinedload(Node.children), joinedload(Node.utterances))
        .filter(Node.parent_id.is_(None))
        .all()
    )
    utterance_data = {}
    for root in roots:
        utterance_data.update(count_utterances(root, True, False, None))
    return utterance_data, len(utterance_data)