Ejemplo n.º 1
0
def classify_root_nodes():
    """Interactively label active, parentless nodes as root nodes or not.

    Selects active nodes without a parent that have no RootNode row yet,
    most visited first. For each node the classifier score of every
    utterance and their average are printed, then the operator answers:

    * ``q``  - exit the program immediately.
    * ``n``  - deactivate the node and store its utterances with
      ``is_root_node=False``.
    * ``y`` or empty - store its utterances with ``is_root_node=True``.

    Any other answer stores nothing for that node. The session is
    committed after every node.
    """
    from fantom_util.models.rnc_mlp_model import RootNodeClassifierMLP

    unclassified = (
        db_session.query(Node)
        .outerjoin(RootNode, RootNode.node_id == Node.id)
        .filter(
            Node.active.is_(True),
            Node.parent_id.is_(None),
            RootNode.id.is_(None),
        )
        .order_by(Node.visited_count.desc())
        .all()
    )

    classifier = RootNodeClassifierMLP()
    for node in unclassified:
        print("+----------------------+")
        texts = [item.utterance_text for item in node.utterances]
        scores = classifier.predict_list(texts)
        for text, score in zip(texts, scores):
            print(f"{text}: {score[0]}")
        # NOTE(review): a node without utterances would divide by zero here.
        print("\navg:", (sum(scores) / len(scores))[0], "\n")

        choice = input("Root node? Y/n/q ").lower()
        if choice == "q":
            exit()

        if choice == "n":
            node.active = False
            is_root = False
        elif choice in ("", "y"):
            is_root = True
        else:
            # Unrecognised answer: leave the node unclassified.
            is_root = None

        if is_root is not None:
            for text in texts:
                db_session.add(
                    RootNode(node_id=node.id,
                             utterance=text,
                             is_root_node=is_root))
        db_session.commit()
Ejemplo n.º 2
0
def set_parent(node_id, parent_node_id=None, commit=False):
    """Move a node under a new parent and refresh the stored paths.

    Args:
        node_id: primary key of the node to move.
        parent_node_id: primary key of the new parent. A falsy value
            detaches the node (its parent becomes ``None``).
        commit: when True, commit the session once the update is done.

    Raises:
        Exception: if the node does not exist, or if a parent id was given
            but no such node exists (previously this silently detached the
            node instead of reporting the bad id).
    """
    node = db_session.query(Node).get(node_id)
    if not node:
        raise Exception("Could not find node")

    parent_node = None
    if parent_node_id:
        parent_node = db_session.query(Node).get(parent_node_id)
        if not parent_node:
            raise Exception("Could not find parent node")

    old_parent_id = node.parent.id if node.parent else None

    node.parent = parent_node
    node.path = node.recalculate_path()

    def update_children(child_node):
        """Recalculate the path of ``child_node`` and of its whole subtree."""
        db_session.add(child_node)
        child_node.path = child_node.recalculate_path()
        # Iterate over a copy: the relationship list may change underneath us.
        for child in child_node.children[:]:
            logger.debug("found child %s", child.id)
            logger.debug("parent has children %s", child_node.children)
            update_children(child)

    # NOTE(review): descendants are only refreshed when the node previously
    # had a parent; a former root that gains a parent keeps its children's
    # old paths. Confirm whether that is intended.
    if old_parent_id and node.children:
        update_children(node)

    if commit:
        db_session.commit()
        logger.info("committing!")
def set_incoherent(ext_job_id, external_worker_id, incoherent_node_utterance_id, with_audio, assignment_id, hit_id):
    """Record that a worker marked a node utterance as incoherent.

    Adds an IncoherentNodeUtteranceWorkerJob row tying the utterance to the
    worker, job, assignment and HIT, plus a NodeUtteranceStatus row with
    status ``'incoherent'``, then commits the session.
    """
    job = get_job(ext_job_id)
    worker = _create_or_get_worker(external_worker_id)

    flagged = db_session.query(NodeUtterance).get(incoherent_node_utterance_id)
    # Reading ``.id`` off the loaded row fails if the id does not exist.
    flagged_id = flagged.id

    db_session.add(
        IncoherentNodeUtteranceWorkerJob(
            node_utterance_id=flagged_id,
            worker_id=worker.id,
            job_id=job.id,
            assignment_id=assignment_id,
            hit_id=hit_id,
        )
    )
    db_session.add(
        NodeUtteranceStatus(
            with_audio=with_audio,
            node_utterance_id=flagged_id,
            status='incoherent',
        )
    )

    # TODO: set negative scoring for worker and node_utterances

    db_session.commit()
def main(synonym_path):
    """Merge synonyms across all parentless nodes and commit the result.

    Synonym objects are loaded from ``synonym_path``; the parentless nodes
    are fetched with their children and utterances eagerly joined.
    """
    synonym_objects = get_synonym_objects(synonym_path)

    eager_loading = (joinedload(Node.children), joinedload(Node.utterances))
    root_query = db_session.query(Node).options(*eager_loading)
    root_nodes = root_query.filter(Node.parent_id.is_(None)).all()

    merge_synonyms(root_nodes, synonym_objects)
    db_session.commit()
def check_for_worker_eligibilitiy_for_qualification():
    """Grant the "more than 20" qualification to eligible mturk workers.

    Looks at workers with source ``'mturk'`` whose qualification flag is
    still False. Each one with ``job_counts`` above 20 is qualified through
    the mturk helper, flagged, and committed individually.
    """
    unqualified = (
        db_session.query(Worker)
        .filter(
            Worker.has_more_than_20_qualifaction.is_(False),
            Worker.source == 'mturk',
        )
        .all()
    )
    for worker in unqualified:
        if not worker.job_counts > 20:
            continue
        mturk.qualify_worker_for_has_more_than_20_qualification(worker.external_worker_id)
        worker.has_more_than_20_qualifaction = True
        # Commit per worker, right after the remote qualification call.
        db_session.commit()
Ejemplo n.º 6
0
def incoherent_nodes():
    """Interactively review node utterances reported as incoherent.

    Unhandled ``"incoherent"`` statuses are grouped per node utterance and
    walked from most to least reported. For each one the dialogue history
    and the utterance are printed; answering ``y`` (or just enter)
    inactivates the node, ``q`` exits the program, anything else keeps the
    node. Unless the operator quits, all ``"incoherent"`` statuses of that
    node utterance are marked handled and the session is committed.
    """
    reported = (
        db_session.query(
            NodeUtteranceStatus.node_utterance_id,
            func.count(NodeUtteranceStatus.node_utterance_id),
        )
        .filter(
            NodeUtteranceStatus.status == "incoherent",
            NodeUtteranceStatus.handled.is_(False),
        )
        .group_by(NodeUtteranceStatus.node_utterance_id)
        .order_by(func.count(NodeUtteranceStatus.node_utterance_id).desc())
        .all()
    )

    for node_utterance_id, count in reported:
        node_utterance = db_session.query(NodeUtterance).get(node_utterance_id)
        node = node_utterance.node
        path_nodes = (
            db_session.query(Node)
            .filter(Node.id.in_(node.path))
            .order_by(Node.path_length.asc())
            .all()
        )

        print("+----------------------------------+")
        print("node utterance id:", node_utterance_id)
        print("node id:", node.id)
        print("count:", count)
        print("\n")

        # Every path node except the last one; the reported utterance itself
        # is printed separately below.
        for ancestor in path_nodes[:-1]:
            print([item.utterance_text for item in ancestor.utterances])
        print(node_utterance.utterance.utterance_text)
        print("\n")

        answer = input("Inactivate? Y/n/q ").lower()
        if answer == "q":
            exit()
        if answer in ("", "y"):
            inactivate_node(node.id)

        db_session.query(NodeUtteranceStatus).filter(
            NodeUtteranceStatus.node_utterance_id == node_utterance_id,
            NodeUtteranceStatus.status == "incoherent",
        ).update({"handled": True})
        db_session.commit()
Ejemplo n.º 7
0
def activate_node(node_id):
    """Mark a node and every descendant found via its path as active, then commit."""
    target = db_session.query(Node).get(node_id)
    target.active = True

    descendants = (
        db_session.query(Node)
        .filter(Node._path.descendant_of(target._path))
        .all()
    )
    for descendant in descendants:
        descendant.active = True

    db_session.commit()
def mturk_job(ext_job_id):
    """Show the task page for a job opened from Mechanical Turk.

    Reads ``workerId`` and ``turkSubmitTo`` from the request query string.
    Unless the worker id is the ``"NO_WORKER_ID"`` placeholder, the worker
    is created or fetched with source ``"mturk"`` and committed before the
    task is shown.
    """
    external_worker_id = request.args.get("workerId")

    has_worker = external_worker_id != "NO_WORKER_ID"
    if has_worker:
        job_controller._create_or_get_worker(external_worker_id, source="mturk")
        db_session.commit()

    submit_host = request.args.get("turkSubmitTo", "")
    submit_url = f"{submit_host}/mturk/externalSubmit"
    return show_task(submit_url, ext_job_id, external_worker_id, mturk=True)
Ejemplo n.º 9
0
def merge_by_score():
    """Interactively merge potential node pairs, best score first.

    Candidate pairs come from PotentialNodeMerge rows that have no Merging
    row in either orientation. A pair is skipped when it was already shown
    during this run or when either node was absorbed by an earlier merge in
    this run. Pairs are only presented when both nodes are active and both
    have utterances.

    Answers: ``q`` exits the program, ``n`` records the pair as not merged,
    ``y`` or empty merges the pair, keeping the node with more children.
    Any other answer records nothing. The session is committed after every
    presented pair.
    """
    candidates = (db_session.query(
        PotentialNodeMerge.left_node_id,
        PotentialNodeMerge.right_node_id,
        PotentialNodeMerge.score,
        Merging,
    ).outerjoin(
        Merging,
        ((PotentialNodeMerge.left_node_id == Merging.left_node_id)
         & (PotentialNodeMerge.right_node_id == Merging.right_node_id)
         | (PotentialNodeMerge.left_node_id == Merging.right_node_id)
         & (PotentialNodeMerge.right_node_id == Merging.left_node_id)),
    ).filter(Merging.id.is_(None)).order_by(
        PotentialNodeMerge.score.desc()).all())

    # Sets give O(1) membership tests; the lists used before made every
    # candidate check linear in the number of pairs already seen.
    seen_pairs = set()
    absorbed_node_ids = set()

    for left_node_id, right_node_id, score, _ in candidates:
        if ((left_node_id, right_node_id) in seen_pairs
                or left_node_id in absorbed_node_ids
                or right_node_id in absorbed_node_ids):
            continue
        # Remember both orientations so the mirrored pair is not shown again.
        seen_pairs.add((left_node_id, right_node_id))
        seen_pairs.add((right_node_id, left_node_id))

        print("+------------------+")
        left_node = db_session.query(Node).get(left_node_id)
        right_node = db_session.query(Node).get(right_node_id)
        if not (left_node.active and right_node.active
                and left_node.utterances and right_node.utterances):
            continue

        print(left_node_id,
              [x.utterance_text for x in left_node.utterances])
        print("---------------- VS ----------------")
        print(right_node_id,
              [x.utterance_text for x in right_node.utterances])

        print("\nscore", score, "\n")

        answer = input("Merge? Y/n/q ").lower()
        if answer == "q":
            exit()
        elif answer == "n":
            merge_nodes(left_node_id, right_node_id, merged=False)
            print("nope!")
        elif answer in ("", "y"):
            # Keep the node with more children; ties keep the left node.
            if right_node.child_count > left_node.child_count:
                keeper, absorbed = right_node, left_node
            else:
                keeper, absorbed = left_node, right_node
            # Read the ids before merging so nothing is read off a node
            # that the merge may have altered.
            keeper_id, absorbed_id = keeper.id, absorbed.id
            print(keeper_id, "<-", absorbed_id)
            merge_nodes(keeper_id, absorbed_id, merged=True)
            absorbed_node_ids.add(absorbed_id)

        db_session.commit()
Ejemplo n.º 10
0
def inactivate_node(node_id):
    """Deactivate a node and its descendants and drop its merge candidates.

    Sets ``active`` to False on the node and on every descendant found via
    its path, deletes PotentialNodeMerge rows that reference ``node_id`` on
    either side, then commits.
    """
    target = db_session.query(Node).get(node_id)
    target.active = False

    descendants = (
        db_session.query(Node)
        .filter(Node._path.descendant_of(target._path))
        .all()
    )
    for descendant in descendants:
        descendant.active = False

    references_node = (
        (PotentialNodeMerge.left_node_id == node_id)
        | (PotentialNodeMerge.right_node_id == node_id)
    )
    db_session.query(PotentialNodeMerge).filter(references_node).delete()
    db_session.commit()
Ejemplo n.º 11
0
def create_jobs(job_type, amount=1):
    """Create up to ``amount`` jobs of the given type from eligible nodes.

    Candidate nodes are active, have a positive score and match
    ``Node.is_user == (job_type != 'user')``. For the SPECIES_TAG job type
    they must carry that species and have fewer than 3 active children;
    otherwise they must have no active children and a visited count above 1.
    Candidates are tried from highest score down.

    For each candidate the last MAX_DIALOGUE_HISTORY nodes of its path form
    the dialogue history. A job is only created when every history node is
    active and contributes one node utterance accepted by
    ``_check_node_utterance_eligibility`` (picked at random among the
    accepted ones).

    Args:
        job_type: ``'user'``, ``'system'`` or SPECIES_TAG.
        amount: number of jobs after which the search stops.

    Returns:
        The list of created Job objects (committed).

    Raises:
        Exception: if ``job_type`` is not one of the supported values.
    """
    valid_job_types = ['user', 'system', SPECIES_TAG]
    if job_type not in valid_job_types:
        # The original message never filled its "{}" placeholder and
        # suggested values ("system_task"/"user_task") that are not accepted.
        raise Exception(
            'work type: "{}" does not exist. Use one of: {}'.format(job_type, valid_job_types)
        )

    job_filter = [Node.active_child_count == 0, Node.visited_count > 1]
    if job_type == SPECIES_TAG:
        job_filter = [Node.species == SPECIES_TAG, Node.active_child_count < 3]

    nodes = db_session.query(Node) \
        .filter(
            Node.score > 0,
            Node.is_user == (job_type != 'user'),
            Node.active.is_(True),
            *job_filter
        )\
        .order_by(Node.score.desc()) \
        .all()

    created_jobs = []
    for node in nodes:
        history_ids = node.path[-MAX_DIALOGUE_HISTORY:]
        history = db_session\
            .query(Node)\
            .filter(Node.id.in_(history_ids), Node.active.is_(True))\
            .options(joinedload(Node.utterances), joinedload(Node.node_utterances))\
            .order_by(Node.path_length.asc())\
            .all()
        history_length = len(history)
        if len(history_ids) != history_length:
            # Some node on the path is missing or inactive: skip this candidate.
            logger.warning(f'history_ids != history, {history_ids} != {history}')
            continue

        job_node_utterances = []
        for index, history_node in enumerate(history):
            is_last = index == history_length - 1
            eligible = [
                node_utterance
                for node_utterance in history_node.node_utterances
                if _check_node_utterance_eligibility(node_utterance, is_last, job_type)
            ]
            if eligible:
                job_node_utterances.append(random.choice(eligible))

        # Only build the job when every history node supplied an utterance.
        if len(history_ids) == len(job_node_utterances):
            job = Job(job_type=job_type, persona_sample=get_persona_sample())
            db_session.add(job)
            db_session.flush()  # assigns job.id for the association rows
            for position, node_utterance in enumerate(job_node_utterances):
                db_session.add(JobNodeUtterance(job_id=job.id, node_utterance_id=node_utterance.id, position=position))

            created_jobs.append(job)

        if len(created_jobs) == amount:
            break

    db_session.commit()
    print(f'created {len(created_jobs)} jobs')
    return created_jobs
Ejemplo n.º 12
0
def split_nodes(node_id):
    """Interactively move one utterance of a node into a new sibling node.

    Lists the node's utterances and children, asks which utterance to split
    off (empty input aborts and returns None), creates a new node under the
    same parent and re-attaches the chosen node utterance to it. The
    operator may then pick children (comma separated indexes) to move under
    the new node. Everything is committed at the end.
    """
    node = db_session.query(Node).get(node_id)

    print(
        "which utterance for this node would you like to split (currently only one at a time) (enter to quit)"
    )
    utterance_choices = dict(enumerate(node.node_utterances))
    for index, candidate in utterance_choices.items():
        print(f"({index}) {candidate.utterance.utterance_text}")
    print("")
    print("----- CHILDREN -----")
    for child in node.children:
        print("*", ", ".join([item.utterance_text for item in child.utterances]))
    print("-----")
    print("")

    selection = input(">")
    if not selection:
        return None
    chosen_node_utterance = utterance_choices[int(selection)]

    # A Node instance (or None) is passed as parent_id here.
    new_node = create_new_node([], parent_id=node.parent, source="")
    chosen_node_utterance.node = new_node
    db_session.add(chosen_node_utterance)
    db_session.add(node)

    print("new node with utterance",
          [item.utterance_text for item in new_node.utterances])

    print(
        "Which children do you want to bring over to the new node (press enter for none, use comma for multiple)"
    )
    child_choices = dict(enumerate(node.children))
    for index, child in child_choices.items():
        kids = ", ".join([item.utterance_text for item in child.utterances])
        print(f"({index}) {kids}")
    print("")

    raw_indexes = input(">").replace(" ", "")
    if raw_indexes:
        # Parse every index before moving anything.
        chosen_indexes = [int(token) for token in raw_indexes.split(",")]
        for index in chosen_indexes:
            moved_child = child_choices[index]
            moved_child.parent = new_node
            db_session.add(moved_child)
            print("adding", moved_child.id, "to", new_node.id,
                  moved_child.parent.id)
    print("done")
    db_session.commit()
Ejemplo n.º 13
0
def finish_job(ext_job_id, external_worker_id, answer, corrections, extra_questions,
               with_audio, used_text_input, assignment_id, hit_id):
    """Store a worker's answer to a job together with its side annotations.

    A new node holding ``answer`` (source ``'typed'``) is created under the
    node of the job's last node utterance and linked to the worker, job,
    assignment and HIT. Each entry of ``corrections`` (node utterance id ->
    corrected text) adds the corrected text to that utterance's node and a
    ``'corrected'`` status. For every extra question whose type is not
    ``'api'``, a status row is added for each truthy flag among
    ``suitable``, ``equivalent`` and ``needs_correction``. The session is
    committed at the end.
    """
    job = get_job(ext_job_id)
    job_nodes = [job_utterance.node for job_utterance in job.node_utterances]
    last_node = job_nodes[-1] if job_nodes else None
    worker = _create_or_get_worker(external_worker_id)

    # NOTE(review): if the job has no node utterances, last_node is None and
    # the attribute access below raises AttributeError.
    node = create_new_node([answer], parent_id=last_node.id, source='typed')
    answer_node_utterance = node.node_utterances[0]
    answer_node_utterance.with_audio = with_audio
    answer_node_utterance.used_text_input = used_text_input

    db_session.add(
        NodeUtteranceWorkerJob(
            node_utterance_id=answer_node_utterance.id,
            worker_id=worker.id,
            job_id=job.id,
            assignment_id=assignment_id,
            hit_id=hit_id,
        )
    )

    for corrected_id, corrected_text in corrections.items():
        corrected_node_utterance = db_session.query(NodeUtterance).get(corrected_id)
        add_utterance_to_node(corrected_text, corrected_node_utterance.node, 'correction')
        db_session.add(
            NodeUtteranceStatus(
                node_utterance_id=corrected_node_utterance.id,
                status='corrected',
            )
        )

    for extra_question in extra_questions:
        if extra_question['type'] == 'api':
            # API-sourced extra questions are currently not stored.
            continue
        extra_node_utterance = db_session.query(NodeUtterance).get(extra_question['id'])
        for status in ('suitable', 'equivalent', 'needs_correction'):
            if not extra_question[status]:
                continue
            db_session.add(
                NodeUtteranceStatus(
                    node_utterance_id=extra_node_utterance.id,
                    referenced_node_utterance_id=answer_node_utterance.id,
                    status=status,
                )
            )

    # TODO: set positive scoring for worker and node_utterances
    db_session.commit()
Ejemplo n.º 14
0
def linked_nodes(linked_to_node_id, linked_from_node_id):
    """Create a LinkedNodes row between two nodes unless it already exists.

    Nothing is added when either node cannot be found or when a link with
    the same to/from pair is already stored. The session is only committed
    when a new link was added.

    Args:
        linked_to_node_id: primary key of the node being linked to.
        linked_from_node_id: primary key of the node the link starts from.
    """
    linked_to_node = db_session.query(Node).get(linked_to_node_id)
    linked_from_node = db_session.query(Node).get(linked_from_node_id)
    # Check existence first: the original read ``.id`` off these objects
    # before its own None check, so a missing node raised AttributeError.
    if linked_to_node is None or linked_from_node is None:
        return

    link = (db_session.query(LinkedNodes).filter(
        LinkedNodes.linked_to_node_id == linked_to_node.id,
        LinkedNodes.linked_from_node_id == linked_from_node.id,
    ).first())
    if link:
        return

    db_session.add(
        LinkedNodes(
            linked_to_node_id=linked_to_node_id,
            linked_from_node_id=linked_from_node_id,
        ))
    db_session.commit()
Ejemplo n.º 15
0
def correct_spelling_submit() -> str:
    """Apply a submitted spelling fix and drop it from the pending list.

    Reads ``utterance_id`` and ``new_spelling`` (and the optional
    ``corections`` flag) from the posted form, stores the new text on the
    utterance, marks it spellchecked and commits. Entries whose first
    element equals the utterance id are then removed from
    ``new_corrections_v2.json``. When the ``corections`` field is set the
    response redirects to ``admin.fix_spelling_issues``; otherwise a short
    confirmation text is returned.
    """
    form = request.form
    utterance_id = int(form["utterance_id"])
    new_spelling = form["new_spelling"]
    corections = form.get("corections")

    utterance = db_session.query(Utterance).get(utterance_id)
    utterance.utterance_text = new_spelling
    utterance.is_spellchecked = True
    db_session.commit()

    # Serialise the filtered list before reopening the file for writing.
    with open("new_corrections_v2.json", "r") as f:
        pending = json.load(f)
        remaining = json.dumps(
            [entry for entry in pending if entry[0] != utterance_id])

    with open("new_corrections_v2.json", "w") as f:
        f.write(remaining)

    if not corections:
        return "ok, done! Please reload the page to see your spelling fix"
    return redirect(url_for("admin.fix_spelling_issues"))
Ejemplo n.º 16
0
def update_amazon_anonymous():
    """Sync the anonymous utterance files from S3 into the Utterance table.

    Every file listed under ANONYMOUS_UTTERANCE_DIR_ON_S3 is downloaded to
    ANONYMOUS_UTTERANCE_DIR with a ``.tmp`` suffix. Lines whose normalized
    text matches EXCLUDED_UTTERANCES are dropped; the rest are stripped and
    lower-cased. Existing utterances with a matching text are flagged
    ``amazon_anonymous``; texts not yet in the table are inserted with the
    flag set. After the commit the ``.tmp`` suffix is removed from the
    downloaded files.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(ANONYMOUS_UTTERANCE_DIR, exist_ok=True)

    files_to_process = []
    for file in list_files_in_s3_bucket_dir(ALEXA_PRIZE_BUCKET_NAME,
                                            ANONYMOUS_UTTERANCE_DIR_ON_S3):
        # [-1] also copes with a key that contains no "/".
        file_name = file.key.rsplit("/", 1)[-1]
        local_path = f"{ANONYMOUS_UTTERANCE_DIR}/{file_name}.tmp"
        file_from_s3(ALEXA_PRIZE_BUCKET_NAME, file.key, local_path)
        files_to_process.append(local_path)

    anonymous_utterances = set()
    for file_path in files_to_process:
        with open(file_path, "r") as f:
            # Stream the file instead of loading every line at once.
            for line in f:
                stripped = line.strip()
                if not stripped:
                    # Blank lines would otherwise be stored as an utterance
                    # with empty text.
                    continue
                if re.search(EXCLUDED_UTTERANCES, normalize_text(line)):
                    logger.info("removed utterance: %s", stripped)
                    continue
                anonymous_utterances.add(stripped.lower())
    logger.info("anonymous_utterances %d", len(anonymous_utterances))

    utterances = db_session.query(Utterance).all()
    for utterance in utterances:
        if utterance.utterance_text in anonymous_utterances:
            if not utterance.amazon_anonymous:
                logger.info("setting anonymous: %s", utterance.utterance_text)
                utterance.amazon_anonymous = True
            # Whatever remains in the set afterwards is new to the table.
            anonymous_utterances.remove(utterance.utterance_text)

    logger.info("anonymous_utterances left %d", len(anonymous_utterances))
    logger.info("to be added: %s", anonymous_utterances)
    for new_utterance in anonymous_utterances:
        db_session.add(
            Utterance(utterance_text=new_utterance, amazon_anonymous=True))

    db_session.commit()
    for file_path in files_to_process:
        # Drop the ".tmp" suffix now that the content has been stored.
        os.rename(file_path, file_path[:-4])
def find_duplicate_utterances():
    """Report utterance texts stored more than once and prune safe duplicates.

    For every duplicated text the ids and node-utterance counts of the rows
    are printed. When at most one of the rows is attached to node
    utterances, all other rows are deleted: the attached row is kept, or
    the first row if none is attached. Groups with several attached rows
    are left untouched. The session is committed once at the end.
    """
    duplicated_texts = (
        db_session.query(Utterance.utterance_text)
        .group_by(Utterance.utterance_text)
        .having(func.count(Utterance.utterance_text) > 1)
        .all()
    )
    for (utterance_text,) in duplicated_texts:
        rows = db_session.query(Utterance).filter(Utterance.utterance_text == utterance_text).all()
        print(utterance_text)
        print([(row.id, len(row.node_utterances)) for row in rows])

        attached = [row for row in rows if row.node_utterances]
        keeper = attached[-1] if attached else rows[0]

        if len(attached) <= 1:
            print('deleting..')
            for row in rows:
                if row.id == keeper.id:
                    continue
                print('removing..', row.id)
                db_session.delete(row)
        print('-------------------')
    db_session.commit()
Ejemplo n.º 18
0
def delete_node(node_id):
    """Delete a node, its whole subtree and all rows tied to its utterances.

    Children are deleted first (recursively). For every node utterance of
    the node, the JobNodeUtterance, NodeUtteranceStatus (both as subject and
    as referenced utterance) and NodeUtteranceWorkerJob rows pointing at it
    are removed before the node utterance itself. The session is committed
    before and after the node is deleted.
    """
    node = db_session.query(Node).get(node_id)

    for child in node.children:
        delete_node(child.id)

    # (model, column that points at the node utterance), in deletion order.
    referencing_columns = (
        (JobNodeUtterance, JobNodeUtterance.node_utterance_id),
        (NodeUtteranceStatus, NodeUtteranceStatus.node_utterance_id),
        (NodeUtteranceStatus, NodeUtteranceStatus.referenced_node_utterance_id),
        (NodeUtteranceWorkerJob, NodeUtteranceWorkerJob.node_utterance_id),
    )
    for node_utterance in node.node_utterances:
        for model, column in referencing_columns:
            db_session.query(model).filter(
                column == node_utterance.id).delete()
        db_session.flush()
        db_session.delete(node_utterance)

    db_session.commit()
    db_session.delete(node)
    db_session.commit()
Ejemplo n.º 19
0
def submit(external_worker_id, task_id):
    """Record a finished training task for a worker.

    The worker and its Training row are created on first use. A task is
    only recorded when it is the smallest task id the worker has not yet
    completed; any other submission leaves the training unchanged.

    Args:
        external_worker_id: external identifier of the worker.
        task_id: id of the submitted task; must be one of TASK_IDS.

    Returns:
        True when the worker has completed every task in TASK_IDS,
        otherwise False.

    Raises:
        KeyError: if ``task_id`` is not in TASK_IDS.
    """
    if task_id not in TASK_IDS:
        raise KeyError('Task id not recognized')

    worker = db_session.query(Worker) \
        .filter_by(external_worker_id=external_worker_id) \
        .first()
    if not worker:
        worker = Worker(external_worker_id=external_worker_id)
        db_session.add(worker)
        db_session.commit()

    training = db_session.query(Training).filter_by(worker=worker).first()

    if not training:
        training = Training(worker=worker)
        db_session.add(training)
        db_session.commit()

    remaining = set(TASK_IDS) - set(training.tasks)
    # min() raises ValueError on an empty set, which happened whenever a
    # worker who had finished every task submitted again.
    if remaining and task_id == min(remaining):
        training.tasks = training.tasks + [task_id]
        db_session.commit()

    return not (set(TASK_IDS) - set(training.tasks))
Ejemplo n.º 20
0
def create_new_node(utterances,
                    source="manual",
                    parent_id=None,
                    commit=False,
                    species=None):
    """Create a node, attach utterances to it and return it.

    Args:
        utterances: a single utterance string or an iterable of them.
        source: source label forwarded to ``add_utterance_to_node``.
        parent_id: primary key of the parent node, a Node instance, or
            None for a node without parent.
        commit: when True, commit the session before returning.
        species: value stored on the new node's ``species`` attribute.

    Returns:
        The new Node. It has been flushed, so its id and path are set.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of Node and str.
    if isinstance(parent_id, Node):
        parent = parent_id
    elif parent_id is not None:
        parent = db_session.query(Node).get(parent_id)
    else:
        parent = None

    node = Node(parent=parent, species=species)
    db_session.add(node)
    db_session.flush()  # assigns node.id, needed for the path below
    node.path = (parent.path if parent else []) + [node.id]

    if isinstance(utterances, str):
        utterances = [utterances]
    for utterance in utterances:
        add_utterance_to_node(utterance, node, source)

    if commit:
        db_session.commit()
    return node
Ejemplo n.º 21
0
def old_anonymize():
    """Interactively label the most frequent unreviewed user utterances.

    Repeatedly fetches the most frequent ``Conversation.user_utterance``
    that has no AnonymousUtterance row yet and asks whether it is
    appropriate: ``n`` stores it as not appropriate, ``y`` or empty stores
    it as appropriate, ``q`` exits the program, anything else stores
    nothing. The session is committed after every answer. Returns once no
    unreviewed utterance is left.
    """
    while True:
        row = (db_session.query(
            Conversation.user_utterance,
            func.count(Conversation.user_utterance)).group_by(
                Conversation.user_utterance).order_by(
                    func.count(Conversation.user_utterance).desc()).filter(
                        ~exists().where(Conversation.user_utterance ==
                                        AnonymousUtterance.text)).first())
        if row is None:
            # first() yields None when nothing is left; unpacking it
            # directly raised TypeError instead of ending the loop.
            return
        utterance, count = row

        print(count, utterance)
        answer = input("Appropriate? Y/n/q ").lower()
        if answer == "q":
            exit()
        elif answer == "n":
            print("-")
            db_session.add(
                AnonymousUtterance(text=utterance, appropriate=False))
        elif answer in ("", "y"):
            print("+")
            db_session.add(AnonymousUtterance(text=utterance,
                                              appropriate=True))
        db_session.commit()
def fix_utterances_starting_with_alexa():
    """Re-point wake-word-prefixed utterances to their normalized twin.

    Looks at utterances that are attached to at least one node utterance and
    start with ``alexa``, ``amazon``, ``echo`` or ``computer`` followed by a
    space. When an utterance whose text equals ``normalize_text`` of the
    original exists, every node utterance is moved over to it; otherwise the
    missing alternative is only reported. The session is committed once at
    the end.
    """
    utterances = db_session.query(Utterance)\
        .filter(
            Utterance.node_utterances.any(),
            or_(Utterance.utterance_text.like('alexa %'),
                Utterance.utterance_text.like('amazon %'),
                Utterance.utterance_text.like('echo %'),
                Utterance.utterance_text.like('computer %')
            )
    ).all()

    for utterance in utterances:
        nul = len(utterance.node_utterances)
        normalized_text = normalize_text(utterance.utterance_text)

        alternative_utterance = db_session.query(Utterance).filter(Utterance.utterance_text == normalized_text).first()
        if alternative_utterance:
            # Iterate over a snapshot: reassigning ``.utterance`` may remove
            # the item from ``utterance.node_utterances`` (if the relationship
            # is bidirectional), which would make a live loop skip entries.
            for node_utterance in list(utterance.node_utterances):
                print('==', node_utterance.id)
                node_utterance.utterance = alternative_utterance
            print(utterance.utterance_text, '->', alternative_utterance.utterance_text, f'({nul})', utterance.id, alternative_utterance.id)
        else:
            print(normalized_text, utterance.utterance_text, 'HAS NO', f'({nul})')
    db_session.commit()
Ejemplo n.º 23
0
def exact_match():
    """Merge active sibling nodes that share an utterance (case-insensitive).

    Active nodes are grouped by parent. Every ordered pair of siblings that
    has no Merging row yet (in either orientation) and was not already
    compared in this run is checked utterance by utterance. Empty (``""``
    or ``" "``) utterances met on the way are deleted, together with their
    node if it is left without utterances; pairs whose texts are equal
    ignoring case are passed to ``merge_nodes``. The session is committed
    once at the end.

    Raises:
        Exception: if a node holding an empty utterance has children.
    """
    # A set keeps the membership tests in the nested loops O(1).
    used_pairs = {
        f"{merge.left_node_id}--{merge.right_node_id}"
        for merge in db_session.query(Merging).all()
    }

    nodes = (db_session.query(Node).filter(Node.active.is_(True)).order_by(
        Node.parent_id.desc()).all())
    siblings_by_parent = defaultdict(list)
    for node in nodes:
        siblings_by_parent[node.parent_id].append(node)

    def _remove_if_empty(utterance, node):
        """Delete a blank utterance (and its then-empty node); True if removed."""
        if utterance.utterance_text not in ("", " "):
            return False
        print(
            "removing empty utterance",
            utterance.utterance_text,
            utterance.id,
        )
        if node.children:
            raise Exception("empty string has children. WAT?! :S")
        # delete(), as used elsewhere in this file; remove() is not an
        # object-deletion API of the session.
        # NOTE(review): this path previously could not succeed; verify that
        # related rows allow the delete and how node.utterances behaves
        # after the flush.
        db_session.delete(utterance)
        db_session.flush()
        if not node.utterances:
            print("removing node", node.id)
            db_session.delete(node)
        return True

    bar = progressbar.ProgressBar()
    for _parent_id, siblings in bar(siblings_by_parent.items()):
        for i, left_node in enumerate(siblings):
            for j, right_node in enumerate(siblings):
                if i == j:
                    continue
                pair_key = f"{left_node.id}--{right_node.id}"
                if (pair_key in used_pairs
                        or f"{right_node.id}--{left_node.id}" in used_pairs):
                    continue
                used_pairs.add(pair_key)

                for left_utterance in left_node.utterances:
                    for right_utterance in right_node.utterances:
                        # Both sides are always checked, left first.
                        left_removed = _remove_if_empty(left_utterance, left_node)
                        right_removed = _remove_if_empty(right_utterance, right_node)
                        if left_removed or right_removed:
                            continue
                        if (left_utterance.utterance_text.lower()
                                == right_utterance.utterance_text.lower()):
                            merge_nodes(left_node.id, right_node.id, True)

    db_session.commit()
def fix_visited_count():
    """Run ``_check_kids`` over all parentless nodes and commit the result.

    The nodes are loaded with their children and utterances eagerly joined.
    """
    eager_loading = (joinedload(Node.children), joinedload(Node.utterances))
    root_nodes = (
        db_session.query(Node)
        .options(*eager_loading)
        .filter(Node.parent_id.is_(None))
        .all()
    )
    _check_kids(root_nodes, '', 0)
    db_session.commit()
def fix_root_visited_count():
    """Set each parentless node's visited count to the sum of its children's.

    A sum of zero (including a node without children) is stored as 1. The
    session is committed after every node.
    """
    root_nodes = (
        db_session.query(Node)
        .options(joinedload(Node.children))
        .filter(Node.parent_id.is_(None))
        .all()
    )
    for root in tqdm(root_nodes):
        children_total = sum(child.visited_count for child in root.children)
        root.visited_count = children_total if children_total else 1
        db_session.commit()
Ejemplo n.º 26
0
def process_conversation(conversation_id, root_node_utterances, automate):
    """Replay one logged conversation against the dialogue graph.

    The turns of the conversation are loaded in ``interaction_timestamp``
    order and split into chunks: a new chunk is opened every time the
    normalized user utterance is found in ``root_node_utterances``, and every
    turn is appended to all chunks opened so far, so each chunk runs from its
    starting turn to the end of the conversation. ``LaunchRequestIntent``
    turns are left out of the chunks.

    Each chunk is then walked turn by turn:

    * The normalized user utterance is looked up among the current
      candidates (``root_node_utterances`` at first, afterwards the lookup
      table of the previously matched system node).
    * On a hit, the node's ``visited_count`` is increased and the child whose
      utterance matches the system response (by
      ``graphsearch_matched_utterance_id`` or by normalized text) is counted
      and becomes the parent for the next turn.
    * On a miss, a new node is created under the current parent, or, when
      there is no parent, the utterance is appended to the storage file as a
      potential root node. The walk of that chunk then stops.

    A chunk also stops on an excluded utterance, an inactive node, or when no
    matching system response is found. Counts are only changed for turns that
    were not processed before this call.

    Args:
        conversation_id: Value matched against ``Conversation.conversation_id``.
        root_node_utterances: Mapping from normalized utterance text to an
            object exposing ``.node`` and ``.id`` (accessed through ``.get``
            and ``.keys``).
        automate: If truthy, commit without asking; otherwise prompt on stdin
            and commit on ``y`` or roll back on anything else.

    Returns:
        None. Returns early, without committing or rolling back, when every
        turn is a launch request or already processed.
    """
    conversations = (
        db_session.query(Conversation)
        .filter(Conversation.conversation_id == conversation_id)
        .order_by(Conversation.interaction_timestamp)
        .all()
    )

    logger.debug("processing conversation_id: %s", conversation_id)

    conversation_chunks = []
    processed = 0
    for conversation in conversations:
        if conversation.intent == "LaunchRequestIntent":
            processed += 1
            continue
        if root_node_utterances.get(normalize_text(conversation.user_utterance)):
            conversation_chunks.append([])
        if not conversation_chunks:
            # No root utterance seen yet: collect the turns in a single chunk.
            conversation_chunks = [[]]
        for conversation_chunk in conversation_chunks:
            conversation_chunk.append(conversation)
        if conversation.processed:
            processed += 1
    if processed >= len(conversations):
        logger.debug("skipping due to all being processed")
        return None

    logger.debug([[y.user_utterance for y in x] for x in conversation_chunks])
    # Stamp shared by every turn handled in this call. Because a turn can sit
    # in several chunks, "processed == processed_time" means "first touched
    # during this call", and such a turn is still counted in later chunks.
    processed_time = datetime.datetime.now()
    for chunk in conversation_chunks:
        parent = None
        child_nodes = root_node_utterances

        for idx, conversation in enumerate(chunk):
            text = normalize_text(conversation.user_utterance)

            # Launch requests are already left out while chunking; this check
            # is kept as a guard.
            if conversation.intent == "LaunchRequestIntent":
                logger.debug(
                    "skipping: LaunchRequestIntent: %d %s",
                    idx,
                    conversation.user_utterance,
                )
                continue
            if not text:
                logger.debug(
                    "skipping: user utterance is empty: %d %s",
                    idx,
                    conversation.user_utterance,
                )
                continue

            if re.search(EXCLUDED_UTTERANCES, text) or re.search(
                EXCLUDED_UTTERANCES, conversation.user_utterance
            ):
                logger.debug(
                    "breaking:  Detected excluded utterance %s -> %s",
                    conversation.user_utterance,
                    text,
                )
                break

            logger.debug("- %d %s %s", idx, conversation.user_utterance, text)
            if parent:
                child_nodes = _get_utterance_lookup_table(parent)

            show_kids = str(child_nodes.keys()) if len(child_nodes.keys()) < 4 else ""
            logger.debug(
                "-- Searching among %d nodes. %s",
                len(child_nodes.keys()),
                show_kids,
            )
            node_utterance = child_nodes.get(text)

            if node_utterance:
                node = node_utterance.node
                if not node.active:
                    logger.debug(
                        "This node (%s) has been marked as inactive.", node.id
                    )
                    break
                logger.debug(
                    "--- Found existing node node_id: %s, node_utterance_id: %s",
                    node.id,
                    node_utterance.id,
                )

                if (
                    not conversation.processed
                    or conversation.processed == processed_time
                ):
                    node.visited_count += 1
                    logger.debug(
                        "---- Increase count for node %s (%d)",
                        node.id,
                        node.visited_count,
                    )

                    db_session.add(node)
                    conversation.processed = processed_time
                    db_session.add(conversation)

                # Find the child node that holds the system's reply.
                parent = None
                for child in node.children:
                    for utterance in child.utterances:
                        if utterance.id == conversation.graphsearch_matched_utterance_id or normalize_text(
                            utterance.utterance_text
                        ) == normalize_text(
                            conversation.system_utterance
                        ):
                            logger.debug(
                                "----- Found system response: %s",
                                utterance.utterance_text,
                            )
                            if (
                                not conversation.processed
                                or conversation.processed == processed_time
                            ):
                                child.visited_count += 1
                                logger.debug(
                                    "----- Increase count for child node %s (%d)",
                                    child.id,
                                    child.visited_count,
                                )
                                db_session.add(child)
                            parent = child
                            # One match is enough: continuing would count the
                            # same child again for every further utterance
                            # that also matches this turn.
                            break
                    if parent:
                        break
                if not parent:
                    logger.debug(
                        "---- No system response found: %s",
                        conversation.system_utterance,
                    )
                    break
            else:
                logger.debug("--- No existing node found")
                if (
                    not conversation.processed
                    or conversation.processed == processed_time
                ):
                    if parent:
                        logger.debug(
                            "--- Adding new node %s", conversation.user_utterance
                        )
                        create_new_node(
                            [conversation.user_utterance],
                            parent_id=parent.id,
                            source="automatic_population",
                        )
                    else:
                        # ``file_name`` is not defined inside this function; it
                        # has to come from module scope.
                        try:
                            logger.debug(
                                "--- New potential root node: "
                                + conversation.user_utterance
                            )

                            with open(file_name, "a") as storage_file:
                                logger.debug(
                                    "---XXXXXXXXXXXXXXXXXXXXXXX: Adding first utterance to "
                                    + file_name
                                )
                                storage_file.write(conversation.user_utterance + "\n")
                        except Exception:
                            # Best effort: a failure to record the candidate
                            # must not abort population. ``Exception`` (rather
                            # than a bare except) lets KeyboardInterrupt and
                            # SystemExit through; the traceback is logged so
                            # the real cause is visible.
                            logger.debug("--- File not found", exc_info=True)
                    conversation.processed = processed_time
                # An unknown utterance ends the walk of this chunk.
                break

    if not automate:
        response = input("Populate? N/y\n")
        if response.lower() == "y":
            db_session.commit()
            logger.debug("committing!")
        else:
            db_session.rollback()
    else:
        db_session.commit()
Ejemplo n.º 27
0
    file_from_s3(BUCKET_NAME, file.key,
                 f'{PATH_TO_UTTERANCES}/{file_name}.tmp')
    files_to_process.append(f'{PATH_TO_UTTERANCES}/{file_name}.tmp')

# Collect the stripped, lower-cased utterances from the downloaded files,
# dropping any line that matches EXCLUDED_UTTERANCES after normalization.
anonymous_utterances = set()
for file_path in files_to_process:
    with open(file_path, 'r') as f:
        for line in f:
            if re.search(EXCLUDED_UTTERANCES, normalize_text(line)):
                print('removed utterance', line.strip())
                continue
            cleaned = line.strip().lower()
            if not cleaned:
                # A blank line would otherwise be stored as an empty utterance.
                continue
            anonymous_utterances.add(cleaned)
print('-----', len(anonymous_utterances))
utterances = db_session.query(Utterance).all()

# Flag every stored utterance whose text is in the anonymous set. Matches are
# gathered separately and subtracted afterwards, so that several rows sharing
# the same text are all flagged instead of only the first one seen.
already_stored = set()
for utterance in utterances:
    if utterance.utterance_text in anonymous_utterances:
        if not utterance.amazon_anonymous:
            print('setting anonymous:', utterance.utterance_text)
            utterance.amazon_anonymous = True
        already_stored.add(utterance.utterance_text)
anonymous_utterances -= already_stored

# Whatever is left has no row yet: insert it as a new anonymous utterance.
print('-----', len(anonymous_utterances), anonymous_utterances)
for new_utterance in anonymous_utterances:
    db_session.add(
        Utterance(utterance_text=new_utterance, amazon_anonymous=True))

db_session.commit()
# Only after a successful commit: drop the last four characters (the ".tmp"
# suffix the paths in files_to_process carry) from each processed file's name.
for file_path in files_to_process:
    os.rename(file_path, file_path[:-4])
Ejemplo n.º 28
0
def merge_nodes(left_node_id, right_node_id, merged=True):
    """Merge the right node into the left node, or record a rejected merge.

    With ``merged`` falsy, only a ``Merging`` row is stored for the pair and
    every ``PotentialNodeMerge`` row that mentions the right node is deleted.

    With ``merged`` truthy, nothing happens if a ``Merging`` row already
    exists for the pair in either direction. Otherwise the right node's
    node-utterances are moved to the left node (each gets a "merged"
    ``NodeUtteranceStatus``), its children are re-parented with
    ``set_parent``, its ``visited_count`` is added to the left node's, the
    right node is passed to ``inactivate_node``, and a ``Merging`` row is
    stored. The session is committed at several points along the way.

    Args:
        left_node_id: Id of the node that survives the merge.
        right_node_id: Id of the node that is merged away.
        merged: Whether the pair is merged (True) or only recorded as not
            merged (False).
    """
    if not merged:
        db_session.add(
            Merging(left_node_id=left_node_id,
                    right_node_id=right_node_id,
                    merged=merged))
        touches_right_node = (
            (PotentialNodeMerge.left_node_id == right_node_id)
            | (PotentialNodeMerge.right_node_id == right_node_id))
        db_session.query(PotentialNodeMerge).filter(touches_right_node).delete()
        db_session.commit()
        return

    left_node = db_session.query(Node).get(left_node_id)
    right_node = db_session.query(Node).get(right_node_id)

    def recorded_merging(first_id, second_id):
        # First Merging row stored for this ordered pair, if any.
        return (db_session.query(Merging).filter(
            Merging.left_node_id == first_id,
            Merging.right_node_id == second_id,
        ).first())

    forward = recorded_merging(left_node.id, right_node.id)
    backward = recorded_merging(right_node.id, left_node.id)
    if forward or backward:
        # The pair has been handled before, in one direction or the other.
        return

    # Iterate over a copy: re-pointing a link changes the right node's list.
    for link in list(right_node.node_utterances):
        logger.debug(
            "node has utterance %s %s",
            link.id,
            link.utterance.utterance_text,
        )
        link.node = left_node
        link.node_id = left_node_id
        db_session.add(link)

        status = NodeUtteranceStatus(node_utterance_id=link.id,
                                     status="merged")
        db_session.add(status)

    logger.debug("(before) left node children: %s", left_node.children)
    logger.debug("(before) right node children: %s", right_node.children)

    # Same reason for the copy: set_parent moves children off the right node.
    for kid in list(right_node.children):
        logger.debug("found child %s", kid.id)
        set_parent(kid.id, left_node.id)

    logger.debug("left node children: %s", left_node.children)
    logger.debug("right node children: %s", right_node.children)

    left_node.visited_count += right_node.visited_count
    db_session.commit()

    # Read the id before the node is inactivated and the session committed.
    the_right_node_id = right_node.id
    inactivate_node(right_node.id)
    db_session.commit()

    merging_record = Merging(
        left_node_id=left_node_id,
        right_node_id=the_right_node_id,
        merged=merged,
    )
    db_session.add(merging_record)
    db_session.commit()