# Example 1
def submit(external_worker_id, task_id):
    """Record a completed training task for a worker.

    Creates the Worker and Training rows on first contact, appends
    ``task_id`` to the training's task list when it is the next expected
    (lowest remaining) task, and reports whether training is complete.

    Args:
        external_worker_id: External (e.g. crowd-platform) worker id.
        task_id: Id of the task being submitted; must be in TASK_IDS.

    Returns:
        True when the worker has now completed every task in TASK_IDS,
        False otherwise.

    Raises:
        KeyError: If ``task_id`` is not a known task id.
    """
    if task_id not in TASK_IDS:
        raise KeyError('Task id not recognized')

    worker = db_session.query(Worker) \
        .filter_by(external_worker_id=external_worker_id) \
        .first()
    if not worker:
        worker = Worker(external_worker_id=external_worker_id)
        db_session.add(worker)
        db_session.commit()

    training = db_session.query(Training).filter_by(worker=worker).first()
    if not training:
        training = Training(worker=worker)
        db_session.add(training)
        db_session.commit()

    # Only accept the submission when it is the lowest-numbered task the
    # worker has not yet done.  Guard against an empty set: the original
    # called min() unconditionally, which raises ValueError once every
    # task has already been completed.
    remaining = set(TASK_IDS) - set(training.tasks)
    if remaining and task_id == min(remaining):
        # Reassign (rather than mutate) so the ORM sees the change.
        training.tasks = training.tasks + [task_id]
        db_session.commit()

    # Training is finished when no task ids remain unseen.
    return not (set(TASK_IDS) - set(training.tasks))
def classify_root_nodes():
    """Interactively label candidate root nodes.

    Walks every active, parentless node that has no RootNode verdict
    yet (most-visited first), prints the MLP classifier's score for
    each of its utterances, and asks the operator for a verdict.
    "n" deactivates the node and stores negative RootNode rows,
    "y"/enter stores positive rows, "q" quits immediately.  Commits
    after every node.
    """
    from fantom_util.models.rnc_mlp_model import RootNodeClassifierMLP

    # Active root-level nodes that were never judged, busiest first.
    candidates = (db_session.query(Node).outerjoin(
        RootNode, RootNode.node_id == Node.id).filter(
            Node.active.is_(True), Node.parent_id.is_(None),
            RootNode.id.is_(None)).order_by(Node.visited_count.desc()).all())

    classifier = RootNodeClassifierMLP()
    for candidate in candidates:
        print("+----------------------+")
        texts = [u.utterance_text for u in candidate.utterances]
        scores = classifier.predict_list(texts)
        for text, score in zip(texts, scores):
            print(f"{text}: {score[0]}")
        print("\navg:", (sum(scores) / len(scores))[0], "\n")

        answer = input("Root node? Y/n/q ")
        lowered = answer.lower()
        if lowered == "q":
            exit()
        if lowered == "n":
            candidate.active = False
            for text in texts:
                db_session.add(
                    RootNode(node_id=candidate.id,
                             utterance=text,
                             is_root_node=False))
        elif answer == "" or lowered == "y":
            for text in texts:
                db_session.add(
                    RootNode(node_id=candidate.id,
                             utterance=text,
                             is_root_node=True))
        db_session.commit()
def set_incoherent(ext_job_id, external_worker_id, incoherent_node_utterance_id, with_audio, assignment_id, hit_id):
    """Flag a node utterance as incoherent on behalf of a worker.

    Records who reported it (IncoherentNodeUtteranceWorkerJob) and an
    'incoherent' NodeUtteranceStatus row, then commits.

    Args:
        ext_job_id: External id of the job the report came from.
        external_worker_id: External id of the reporting worker.
        incoherent_node_utterance_id: Primary key of the NodeUtterance.
        with_audio: Whether the worker heard the utterance as audio.
        assignment_id, hit_id: Crowd-platform bookkeeping ids.
    """
    job = get_job(ext_job_id)
    worker = _create_or_get_worker(external_worker_id)
    target = db_session.query(NodeUtterance).get(incoherent_node_utterance_id)

    db_session.add(IncoherentNodeUtteranceWorkerJob(
        node_utterance_id=target.id,
        worker_id=worker.id,
        job_id=job.id,
        assignment_id=assignment_id,
        hit_id=hit_id,
    ))

    db_session.add(NodeUtteranceStatus(
        with_audio=with_audio,
        node_utterance_id=target.id,
        status='incoherent',
    ))

    # TODO: set negative scoring for worker and node_utterances
    db_session.commit()
 def update_children(child_node):
     """Recompute the cached path of ``child_node`` and its whole subtree.

     Re-registers each node with the session and recurses depth-first
     into every child.  Does not commit; the caller decides when.
     """
     db_session.add(child_node)
     child_node.path = child_node.recalculate_path()
     # Iterate over a snapshot so relationship refreshes during the
     # recursion cannot disturb the loop.
     for descendant in list(child_node.children):
         logger.debug("found child %s", descendant.id)
         logger.debug("parent has children %s", child_node.children)
         update_children(descendant)
def _create_or_get_worker(external_worker_id, source=None):
    """Return the Worker row for ``external_worker_id``, creating it if absent.

    A newly created worker is added to the session and flushed (not
    committed) so its primary key is available to the caller.

    Args:
        external_worker_id: External id to look up.
        source: Optional source tag, used only when creating a new row.
    """
    existing = (db_session.query(Worker)
                .filter_by(external_worker_id=external_worker_id)
                .first())
    if existing:
        return existing

    created = Worker(external_worker_id=external_worker_id, source=source)
    db_session.add(created)
    db_session.flush()
    return created
def create_jobs(job_type, amount=1):
    """Create up to ``amount`` annotation jobs of the given type.

    Candidate nodes are scored leaves (for 'user'/'system' jobs) or
    species nodes with fewer than three active children (for
    SPECIES_TAG jobs), best score first.  For each candidate a dialogue
    history is assembled from its path; a Job plus its JobNodeUtterance
    rows is created only when every history turn yields an eligible
    utterance.  Commits once at the end.

    Args:
        job_type: One of 'user', 'system', or SPECIES_TAG.
        amount: Maximum number of jobs to create (default 1).

    Returns:
        The list of created Job instances.

    Raises:
        Exception: If ``job_type`` is not one of the accepted values.
    """
    if job_type not in ['user', 'system', SPECIES_TAG]:
        # Bug fix: the original message contained an unfilled "{}"
        # placeholder because .format() was never called.
        raise Exception(f'work type: "{job_type}" does not exist. Use either system_task or user_task')

    job_filter = [Node.active_child_count == 0, Node.visited_count > 1]
    if job_type == SPECIES_TAG:
        job_filter = [Node.species == SPECIES_TAG, Node.active_child_count < 3]

    nodes = db_session.query(Node) \
        .filter(
            Node.score > 0,
            Node.is_user == (job_type != 'user'),
            Node.active.is_(True),
            *job_filter
        )\
        .order_by(Node.score.desc()) \
        .all()
    created_jobs = []
    for node in nodes:
        # The last MAX_DIALOGUE_HISTORY node ids of the path form the
        # dialogue shown to the worker.
        history_ids = node.path[-MAX_DIALOGUE_HISTORY:]
        history = db_session\
            .query(Node)\
            .filter(Node.id.in_(history_ids), Node.active.is_(True))\
            .options(joinedload(Node.utterances), joinedload(Node.node_utterances))\
            .order_by(Node.path_length.asc())\
            .all()
        history_length = len(history)
        if len(history_ids) != history_length:
            # A missing/inactive node breaks the dialogue; skip candidate.
            logger.warning(f'history_ids != history, {history_ids} != {history}')
            continue

        job_node_utterances = []
        for index, history_node in enumerate(history):
            # One randomly chosen eligible utterance per dialogue turn;
            # the last turn is flagged so eligibility can differ there.
            pool_of_node_utterances = [
                node_utterance
                for node_utterance in history_node.node_utterances
                if _check_node_utterance_eligibility(
                    node_utterance, index == history_length - 1, job_type)
            ]
            if pool_of_node_utterances:
                job_node_utterances.append(random.choice(pool_of_node_utterances))

        if len(history_ids) == len(job_node_utterances):
            # Every turn produced an utterance: materialize the job.
            job = Job(job_type=job_type, persona_sample=get_persona_sample())
            db_session.add(job)
            db_session.flush()  # assigns job.id for the association rows
            for i, node_utterance in enumerate(job_node_utterances):
                db_session.add(JobNodeUtterance(job_id=job.id, node_utterance_id=node_utterance.id, position=i))

            created_jobs.append(job)

        if len(created_jobs) >= amount:
            break
    db_session.commit()
    print(f'created {len(created_jobs)} jobs')
    return created_jobs
def get_synonym_objects(synonym_path):
    """Load synonyms from a file and return their Utterance rows.

    Each stripped line of the file is looked up as an utterance;
    missing utterances are created and flushed (not committed).

    Args:
        synonym_path: Path of a text file with one synonym per line.

    Returns:
        A list of Utterance rows, one per line of the file, in order.
    """
    utterance_rows = []
    with open(synonym_path, "r") as handle:
        for line in handle:
            text = line.strip()
            row = (db_session.query(Utterance)
                   .filter_by(utterance_text=text)
                   .first())
            if not row:
                row = Utterance(utterance_text=text)
                db_session.add(row)
                db_session.flush()
            utterance_rows.append(row)
    return utterance_rows
def linked_nodes(linked_to_node_id, linked_from_node_id):
    """Create a LinkedNodes row between two nodes if none exists yet.

    A no-op when either node id is unknown or the link already exists.

    Args:
        linked_to_node_id: Primary key of the link target node.
        linked_from_node_id: Primary key of the link source node.
    """
    linked_to_node = db_session.query(Node).get(linked_to_node_id)
    linked_from_node = db_session.query(Node).get(linked_from_node_id)
    # Bug fix: bail out before touching .id — the original dereferenced
    # .id on the query results first and only checked for None afterwards,
    # raising AttributeError for unknown ids.
    if not (linked_to_node and linked_from_node):
        return
    link = (db_session.query(LinkedNodes).filter(
        LinkedNodes.linked_to_node_id == linked_to_node.id,
        LinkedNodes.linked_from_node_id == linked_from_node.id,
    ).first())
    if link:
        return
    db_session.add(
        LinkedNodes(
            linked_to_node_id=linked_to_node_id,
            linked_from_node_id=linked_from_node_id,
        ))
    db_session.commit()
# Example 9
def update_amazon_anonymous():
    """Sync Amazon's anonymous-utterance dump from S3 into the database.

    Downloads every file in the S3 anonymous-utterance directory as a
    ``.tmp`` file, collects the (lower-cased, de-duplicated) utterances
    while skipping lines that match EXCLUDED_UTTERANCES, marks matching
    Utterance rows as amazon_anonymous, inserts rows for unseen
    utterances, commits, and finally renames the ``.tmp`` files into
    place so a failed run leaves the originals untouched.
    """
    if not os.path.exists(ANONYMOUS_UTTERANCE_DIR):
        os.makedirs(ANONYMOUS_UTTERANCE_DIR)

    downloaded = []
    for s3_file in list_files_in_s3_bucket_dir(ALEXA_PRIZE_BUCKET_NAME,
                                               ANONYMOUS_UTTERANCE_DIR_ON_S3):
        base_name = s3_file.key.rsplit("/", 1)[1]
        local_path = f"{ANONYMOUS_UTTERANCE_DIR}/{base_name}.tmp"
        file_from_s3(ALEXA_PRIZE_BUCKET_NAME, s3_file.key, local_path)
        downloaded.append(local_path)

    collected = []
    for path in downloaded:
        with open(path, "r") as handle:
            for raw_line in handle:
                # Drop lines matching the exclusion pattern (checked on
                # the normalized form, as elsewhere in this module).
                if re.search(EXCLUDED_UTTERANCES, normalize_text(raw_line)):
                    logger.info("removed utterance: %s", raw_line.strip())
                    continue
                collected.append(raw_line.strip().lower())
    anonymous_utterances = set(collected)
    logger.info("anonymous_utterances %d", len(anonymous_utterances))

    # Flag existing rows; whatever remains in the set is new.
    for row in db_session.query(Utterance).all():
        if row.utterance_text in anonymous_utterances:
            if not row.amazon_anonymous:
                logger.info("setting anonymous: %s", row.utterance_text)
                row.amazon_anonymous = True
            anonymous_utterances.remove(row.utterance_text)

    logger.info("anonymous_utterances left %d", len(anonymous_utterances))
    logger.info("to be added: %s", anonymous_utterances)
    for text in anonymous_utterances:
        db_session.add(
            Utterance(utterance_text=text, amazon_anonymous=True))

    db_session.commit()
    for path in downloaded:
        os.rename(path, path[:-4])  # drop the ".tmp" suffix
# Example 10
def add_utterance_to_node(utterance_text, node, source):
    """Attach an utterance to a node and tag the link with its source.

    ``utterance_text`` may be a plain string (looked up, or created as a
    new Utterance row) or an existing Utterance instance.  Flushes but
    does not commit.

    Args:
        utterance_text: Utterance text or an Utterance instance.
        node: Node to attach the utterance to.
        source: Source tag stored on the NodeUtterance association row.

    Returns:
        The NodeUtterance association row linking node and utterance.
    """
    if isinstance(utterance_text, Utterance):
        utterance = utterance_text
    else:
        utterance = (db_session.query(Utterance)
                     .filter_by(utterance_text=utterance_text)
                     .first())
        if not utterance:
            utterance = Utterance(utterance_text=utterance_text)
            db_session.add(utterance)
            db_session.flush()

    node.utterances.append(utterance)
    db_session.flush()

    # The append + flush above created the association row; fetch it so
    # the source tag can be set on it.
    node_utterance = (db_session.query(NodeUtterance)
                      .filter_by(node_id=node.id)
                      .filter_by(utterance_id=utterance.id)
                      .first())
    node_utterance.source = source
    db_session.flush()
    return node_utterance
# Example 11
def create_new_node(utterances,
                    source="manual",
                    parent_id=None,
                    commit=False,
                    species=None):
    """Create a Node with the given utterances attached.

    Args:
        utterances: One utterance string or a list of them (each item is
            passed to add_utterance_to_node, so Utterance instances are
            accepted too).
        source: Source tag stored on each NodeUtterance link.
        parent_id: Parent Node instance, its primary key, or None for a
            root node.
        commit: Commit the session when True; otherwise only flush.
        species: Optional species tag for the new node.

    Returns:
        The newly created Node (flushed, so its id and path are set).
    """
    # Accept a Node instance, a primary key, or None.  isinstance
    # replaces the original `type(parent_id) == Node`, which would have
    # rejected Node subclasses.
    if isinstance(parent_id, Node):
        parent = parent_id
    elif parent_id is not None:
        parent = db_session.query(Node).get(parent_id)
    else:
        parent = None

    node = Node(parent=parent, species=species)
    db_session.add(node)
    db_session.flush()  # assigns node.id, needed for the path below
    node.path = (parent.path if parent else []) + [node.id]

    # Allow a bare string as a convenience for single-utterance nodes.
    if isinstance(utterances, str):
        utterances = [utterances]
    for utterance in utterances:
        add_utterance_to_node(utterance, node, source)

    if commit:
        db_session.commit()
    return node
# Example 12
def old_anonymize():
    """Interactively label the most frequent not-yet-labeled utterances.

    Repeatedly fetches the most common Conversation user utterance that
    has no AnonymousUtterance verdict and asks the operator whether it
    is appropriate.  "n" stores appropriate=False, "y"/enter stores
    appropriate=True, "q" quits.  Commits after every answer and stops
    cleanly when every distinct utterance has a verdict.
    """
    while True:
        row = (db_session.query(
            Conversation.user_utterance,
            func.count(Conversation.user_utterance)).group_by(
                Conversation.user_utterance).order_by(
                    func.count(Conversation.user_utterance).desc()).filter(
                        ~exists().where(Conversation.user_utterance ==
                                        AnonymousUtterance.text)).first())
        # Bug fix: .first() returns None once everything is labeled; the
        # original unpacked it directly and crashed with TypeError.
        if row is None:
            return
        utterance, count = row
        print(count, utterance)
        user_input = input("Appropriate? Y/n/q ")
        if user_input.lower() == "q":
            exit()
        elif user_input.lower() == "n":
            print("-")
            db_session.add(
                AnonymousUtterance(text=utterance, appropriate=False))
        elif user_input == "" or user_input.lower() == "y":
            print("+")
            db_session.add(AnonymousUtterance(text=utterance,
                                              appropriate=True))
        db_session.commit()
# Example 13
def _get_utterance_lookup_table(parent):
    """Build a lookup of normalized utterance text -> NodeUtterance for
    every child node of ``parent``.

    Side effect: when two sibling nodes share the same normalized
    utterance text, the node with the smaller full child count is merged
    into the other via merge_nodes, collapsing duplicates while the
    table is built.

    NOTE(review): table entries that point at a node later merged away
    are not refreshed afterwards — confirm callers tolerate that.
    """
    nodes = db_session.query(Node).filter(Node.parent == parent).all()
    node_utterances = {}
    for node in nodes:
        for node_utterance in node.node_utterances:
            db_session.add(node_utterance)
            # Key on normalized text so trivially different spellings match.
            text = normalize_text(node_utterance.utterance.utterance_text)
            if not node_utterances.get(text):
                node_utterances[text] = node_utterance
            elif node_utterances.get(text).node.id != node.id:
                # Same normalized text under two different sibling nodes:
                # merge the smaller subtree into the larger one.
                other_node = node_utterances.get(text).node

                node_child_size = get_full_child_count(node)
                other_node_child_size = get_full_child_count(other_node)

                if node_child_size > other_node_child_size:
                    logger.info(f"merging.. {node.id} <- {other_node.id}")
                    merge_nodes(node.id, other_node.id, True)
                else:
                    logger.info(f"merging.. {other_node.id} <- {node.id}")
                    merge_nodes(other_node.id, node.id, True)

    return node_utterances
# Example 14
def split_nodes(node_id):
    """Interactively split one utterance (and optionally some children)
    of a node out into a new sibling node.

    Prompts on stdin twice: first for the index of the node-utterance to
    move, then for comma-separated indexes of children to re-parent onto
    the new node.  Commits at the end; returns None if the operator
    aborts at the first prompt.
    """
    node = db_session.query(Node).get(node_id)

    print(
        "which utterance for this node would you like to split (currently only one at a time) (enter to quit)"
    )
    # Menu index -> NodeUtterance, so the operator can pick by number.
    split_utterances = {}
    for i, node_utterance in enumerate(node.node_utterances):
        split_utterances[i] = node_utterance
        print(f"({i}) {node_utterance.utterance.utterance_text}")
    print("")
    print("----- CHILDREN -----")
    for child in node.children:
        print("*", ", ".join([x.utterance_text for x in child.utterances]))
    print("-----")
    print("")
    utterance_to_split = input(">")
    if not utterance_to_split:
        # Operator pressed enter: abort without touching anything.
        return None
    split_node_utterance = split_utterances[int(utterance_to_split)]

    parent = node.parent

    # New empty sibling under the same parent; the chosen utterance is
    # re-pointed at it.
    new_node = create_new_node([], parent_id=parent, source="")
    split_node_utterance.node = new_node
    db_session.add(split_node_utterance)
    db_session.add(node)

    print("new node with utterance",
          [x.utterance_text for x in new_node.utterances])

    print(
        "Which children do you want to bring over to the new node (press enter for none, use comma for multiple)"
    )
    # Menu index -> child Node for the re-parenting step.
    move_kids = {}
    for i, child in enumerate(node.children):
        move_kids[i] = child
        kids = ", ".join([x.utterance_text for x in child.utterances])
        print(f"({i}) {kids}")
    print("")

    kids_to_move = input(">").replace(" ", "")
    if kids_to_move:
        splited_kids = [int(x) for x in kids_to_move.split(",")]
        for kid in splited_kids:
            move_kids[kid].parent = new_node
            db_session.add(move_kids[kid])
            print("adding", move_kids[kid].id, "to", new_node.id,
                  move_kids[kid].parent.id)
    print("done")
    # db_session.rollback()
    db_session.commit()
def finish_job(ext_job_id, external_worker_id, answer, corrections, extra_questions,
               with_audio, used_text_input, assignment_id, hit_id):
    """Store a worker's completed job: the new answer plus any
    corrections and judgements on extra questions.  Commits at the end.

    Args:
        ext_job_id: External id of the job being finished.
        external_worker_id: External id of the worker who did it.
        answer: The worker's new utterance, appended as a child of the
            job's last node.
        corrections: Mapping of node_utterance id -> corrected text.
        extra_questions: Dicts with 'type', 'id' and boolean flags
            'suitable', 'equivalent', 'needs_correction'.
        with_audio: Whether the answer came with audio.
        used_text_input: Whether the worker typed instead of speaking.
        assignment_id, hit_id: Crowd-platform bookkeeping ids.
    """
    job = get_job(ext_job_id)
    nodes = [x.node for x in job.node_utterances]
    last_node = nodes[-1] if nodes else None
    worker = _create_or_get_worker(external_worker_id)

    # Bug fix: the original dereferenced last_node.id unconditionally,
    # crashing for a job with no node utterances; create a root node then.
    node = create_new_node([answer],
                           parent_id=last_node.id if last_node else None,
                           source='typed')
    node_utterance = node.node_utterances[0]
    node_utterance.with_audio = with_audio
    node_utterance.used_text_input = used_text_input

    db_session.add(NodeUtteranceWorkerJob(
        node_utterance_id=node_utterance.id,
        worker_id=worker.id,
        job_id=job.id,
        assignment_id=assignment_id,
        hit_id=hit_id
    ))

    # Attach corrected texts to their nodes and mark the originals.
    for old_node_utterance_id, corrected_text in corrections.items():
        old_node_utterance = db_session.query(NodeUtterance).get(old_node_utterance_id)
        add_utterance_to_node(corrected_text, old_node_utterance.node, 'correction')
        db_session.add(NodeUtteranceStatus(
            node_utterance_id=old_node_utterance.id,
            status='corrected'
        ))

    # Record the worker's judgements; 'api'-type questions are
    # intentionally ignored for now.
    for extra_question in extra_questions:
        if extra_question['type'] != 'api':
            extra_node_utterance = db_session.query(NodeUtterance).get(extra_question['id'])
            for status in ['suitable', 'equivalent', 'needs_correction']:
                if extra_question[status]:
                    db_session.add(NodeUtteranceStatus(
                        node_utterance_id=extra_node_utterance.id,
                        referenced_node_utterance_id=node_utterance.id,
                        status=status
                    ))

    # TODO: set positive scoring for worker and node_utterances
    db_session.commit()
# Example 16
def merge_nodes(left_node_id, right_node_id, merged=True):
    """Merge the right node into the left node, or record a "do not
    merge" verdict.

    When ``merged`` is True: every node utterance and child of the right
    node is moved onto the left node, visit counts are summed, the right
    node is inactivated, and a Merging row records the merge.  When
    ``merged`` is False: only a Merging(merged=False) row is stored and
    pending PotentialNodeMerge rows involving the right node are
    deleted.  Commits several times along the way.
    """
    if merged:
        left_node = db_session.query(Node).get(left_node_id)
        right_node = db_session.query(Node).get(right_node_id)

        # A merge verdict in either direction means this pair was
        # already handled — bail out.
        merge_1 = (db_session.query(Merging).filter(
            Merging.left_node_id == left_node.id,
            Merging.right_node_id == right_node.id,
        ).first())
        merge_2 = (db_session.query(Merging).filter(
            Merging.left_node_id == right_node.id,
            Merging.right_node_id == left_node.id,
        ).first())

        if merge_1 or merge_2:  # or left_node.id == right_node.id:
            return

        # Move every utterance link over to the left node, marking each
        # with a "merged" status row.  Iterate a copy: re-pointing the
        # relationship mutates the underlying collection.
        for node_utterance in right_node.node_utterances[:]:
            logger.debug(
                "node has utterance %s %s",
                node_utterance.id,
                node_utterance.utterance.utterance_text,
            )
            node_utterance.node = left_node
            node_utterance.node_id = left_node_id
            db_session.add(node_utterance)

            db_session.add(
                NodeUtteranceStatus(node_utterance_id=node_utterance.id,
                                    status="merged"))

        logger.debug("(before) left node children: %s", left_node.children)
        logger.debug("(before) right node children: %s", right_node.children)

        # Re-parent the right node's children onto the left node.
        for child in right_node.children[:]:
            logger.debug("found child %s", child.id)
            set_parent(child.id, left_node.id)

        logger.debug("left node children: %s", left_node.children)
        logger.debug("right node children: %s", right_node.children)

        left_node.visited_count += right_node.visited_count
        db_session.commit()

        # Capture the id before inactivation; the row object may be
        # expired/changed by inactivate_node + commit.
        the_right_node_id = right_node.id
        inactivate_node(right_node.id)
        db_session.commit()
        db_session.add(
            Merging(
                left_node_id=left_node_id,
                right_node_id=the_right_node_id,
                merged=merged,
            ))
    else:
        # Negative verdict: remember it and drop any pending merge
        # suggestions that involve the right node.
        db_session.add(
            Merging(left_node_id=left_node_id,
                    right_node_id=right_node_id,
                    merged=merged))
        db_session.query(PotentialNodeMerge).filter(
            (PotentialNodeMerge.left_node_id == right_node_id)
            | (PotentialNodeMerge.right_node_id == right_node_id)).delete()
    db_session.commit()
# Example 17 (fragment: the enclosing download loop starts outside this excerpt)
    file_from_s3(BUCKET_NAME, file.key,
                 f'{PATH_TO_UTTERANCES}/{file_name}.tmp')
    files_to_process.append(f'{PATH_TO_UTTERANCES}/{file_name}.tmp')

# Collect anonymized utterances from the downloaded .tmp files, skipping
# any line whose normalized form matches the exclusion pattern.
anonymous_utterances = []
for file_path in files_to_process:
    with open(file_path, 'r') as f:
        for line in f.readlines():
            if re.search(EXCLUDED_UTTERANCES, normalize_text(line)):
                print('removed utterance', line.strip())
                continue
            anonymous_utterances.append(line.strip().lower())
# De-duplicate; membership tests below are O(1) on a set.
anonymous_utterances = set(anonymous_utterances)
print('-----', len(anonymous_utterances))
utterances = db_session.query(Utterance).all()

# Flag existing rows as amazon_anonymous; whatever remains in the set
# afterwards is new and must be inserted.
for utterance in utterances:
    if utterance.utterance_text in anonymous_utterances:
        if not utterance.amazon_anonymous:
            print('setting anonymous:', utterance.utterance_text)
            utterance.amazon_anonymous = True
        anonymous_utterances.remove(utterance.utterance_text)

print('-----', len(anonymous_utterances), anonymous_utterances)
for new_utterance in anonymous_utterances:
    db_session.add(
        Utterance(utterance_text=new_utterance, amazon_anonymous=True))

db_session.commit()
# Only after a successful commit: strip the ".tmp" suffix so the files
# are kept as processed.
for file_path in files_to_process:
    os.rename(file_path, file_path[:-4])
# Example 18
def process_conversation(conversation_id, root_node_utterances, automate):
    """Replay one recorded conversation against the dialogue graph.

    The conversation is split into overlapping chunks: a new chunk
    starts at every turn whose user utterance matches a known root node.
    Each chunk is then walked through the graph — visit counts are
    increased for matched nodes and the matching system-response child,
    unmatched turns either create a new node (when a parent exists) or
    are appended to a side file as potential new root nodes.

    Args:
        conversation_id: Id of the conversation to replay.
        root_node_utterances: Lookup of normalized text -> NodeUtterance
            for root nodes (same shape _get_utterance_lookup_table
            returns).
        automate: When False, ask on stdin before committing; when True,
            commit unconditionally.

    Returns:
        None.  Returns early (before any graph walk) when every turn was
        already processed.

    NOTE(review): ``file_name`` (used when logging potential root nodes)
    is not defined in this function — presumably a module-level path;
    confirm it exists wherever this runs.
    """
    conversations = (
        db_session.query(Conversation)
        .filter(Conversation.conversation_id == conversation_id)
        .order_by(Conversation.interaction_timestamp)
        .all()
    )

    logger.debug("processing conversation_id: %s", conversation_id)

    # Build the chunks: every turn is appended to ALL currently open
    # chunks, and a new chunk opens at each root-node match.
    conversation_chunks = []
    processed = 0
    for conversation in conversations:
        if conversation.intent == "LaunchRequestIntent":
            processed += 1
            continue
        if root_node_utterances.get(normalize_text(conversation.user_utterance)):
            conversation_chunks.append([])
        if not conversation_chunks:
            conversation_chunks = [[]]
        for conversation_chunk in conversation_chunks:
            conversation_chunk.append(conversation)
        if conversation.processed:
            processed += 1
    if processed >= len(conversations):
        logger.debug("skipping due to all being processed")
        return None

    logger.debug([[y.user_utterance for y in x] for x in conversation_chunks])
    # One shared timestamp marks everything touched in this run, so a
    # turn can be revisited within the run but not across runs.
    processed_time = datetime.datetime.now()
    for conversations in conversation_chunks:
        parent = None
        # At the top of a chunk we match against root nodes; after the
        # first match, against the current parent's children.
        child_nodes = root_node_utterances

        for idx, conversation in enumerate(conversations):
            text = normalize_text(conversation.user_utterance)

            if conversation.intent == "LaunchRequestIntent":
                logger.debug(
                    "skipping: LaunchRequestIntent: %d %s",
                    idx,
                    conversation.user_utterance,
                )
                continue
            if not text:
                logger.debug(
                    "skipping: user utterance is empty: %d %s",
                    idx,
                    conversation.user_utterance,
                )
                continue

            # An excluded utterance aborts the whole chunk, not just
            # the turn.
            if re.search(EXCLUDED_UTTERANCES, text) or re.search(
                EXCLUDED_UTTERANCES, conversation.user_utterance
            ):
                logger.debug(
                    "breaking:  Detected excluded utterance %s -> %s",
                    conversation.user_utterance,
                    text,
                )
                break

            logger.debug("- %d %s %s", idx, conversation.user_utterance, text)
            if parent:
                child_nodes = _get_utterance_lookup_table(parent)

            show_kids = str(child_nodes.keys()) if len(child_nodes.keys()) < 4 else ""
            logger.debug(
                f"-- Searching among {len(child_nodes.keys())} nodes. %s", show_kids
            )
            node_utterance = child_nodes.get(text)

            if node_utterance:
                node = node_utterance.node
                if not node.active:
                    logger.debug(f"This node ({node.id}) has been marked as inactive.")
                    break
                logger.debug(
                    "--- Found existing node node_id: %s, node_utterance_id: %s",
                    node.id,
                    node_utterance.id,
                )

                # Count the visit unless an earlier run already did.
                if (
                    not conversation.processed
                    or conversation.processed == processed_time
                ):
                    node.visited_count += 1
                    logger.debug(
                        "---- Increase count for node %s (%d)",
                        node.id,
                        node.visited_count,
                    )

                    db_session.add(node)
                    conversation.processed = processed_time
                    db_session.add(conversation)

                # Find which child produced the recorded system response
                # (by matched-utterance id or by normalized text); that
                # child becomes the parent for the next turn.
                parent = None
                for child in node.children:
                    for utterance in child.utterances:
                        if utterance.id == conversation.graphsearch_matched_utterance_id or normalize_text(
                            utterance.utterance_text
                        ) == normalize_text(
                            conversation.system_utterance
                        ):
                            logger.debug(
                                "----- Found system response: %s",
                                utterance.utterance_text,
                            )
                            if (
                                not conversation.processed
                                or conversation.processed == processed_time
                            ):
                                logger.debug(
                                    "----- Increase count for child node %s (%d)",
                                    child.id,
                                    child.visited_count,
                                )
                                child.visited_count += 1
                                db_session.add(child)
                            parent = child
                    if parent:
                        break
                if not parent:
                    logger.debug(
                        "---- No system response found: %s",
                        conversation.system_utterance,
                    )
                    break
            else:
                logger.debug("--- No existing node found")
                if (
                    not conversation.processed
                    or conversation.processed == processed_time
                ):
                    if parent:
                        logger.debug(
                            "--- Adding new node %s", conversation.user_utterance
                        )
                        node = create_new_node(
                            [conversation.user_utterance],
                            parent_id=parent.id if parent else None,
                            source="automatic_population",
                        )
                    else:
                        # No parent: stash the utterance as a potential
                        # new root node instead of creating one.
                        # root_node_utterances[text] = node.node_utterances[0]
                        try:
                            logger.debug(
                                "--- New potential root node: "
                                + conversation.user_utterance
                            )

                            with open(file_name, "a") as storage_file:
                                logger.debug(
                                    "---XXXXXXXXXXXXXXXXXXXXXXX: Adding first utterance to "
                                    + file_name
                                )
                                storage_file.write(conversation.user_utterance + "\n")
                        except:
                            logger.debug("--- File not found")
                    conversation.processed = processed_time
                # Unmatched turn ends the chunk either way.
                break

    if not automate:
        response = input("Populate? N/y\n")
        if response.lower() == "y":
            db_session.commit()
            logger.debug("committing!")
        else:
            db_session.rollback()
    else:
        db_session.commit()