def submit(external_worker_id, task_id):
    """Record that *external_worker_id* completed training task *task_id*.

    Tasks are accepted only in order: *task_id* must be the lowest id the
    worker has not yet completed.  Creates the Worker / Training rows on
    first contact.  Returns True when all TASK_IDS are done, else False.
    Raises KeyError for an unrecognised task id.
    """
    if task_id not in TASK_IDS:
        raise KeyError('Task id not recognized')
    worker = db_session.query(Worker) \
        .filter_by(external_worker_id=external_worker_id) \
        .first()
    if not worker:
        worker = Worker(external_worker_id=external_worker_id)
        db_session.add(worker)
        db_session.commit()
    training = db_session.query(Training).filter_by(worker=worker).first()
    if not training:
        training = Training(worker=worker)
        db_session.add(training)
        db_session.commit()
    remaining = set(TASK_IDS) - set(training.tasks)
    # BUG FIX: the original called min() on this set unconditionally, which
    # raises ValueError when a worker re-submits after finishing every task.
    if remaining and task_id == min(remaining):
        # reassign (not append) so the ORM detects the column change
        training.tasks = training.tasks + [task_id]
        db_session.commit()
    # True once no task ids are left to complete
    return not (set(TASK_IDS) - set(training.tasks))
def classify_root_nodes():
    """Interactively label candidate root nodes.

    Walks every active, parentless node without RootNode rows (most visited
    first), prints the MLP classifier score per utterance, and asks the
    reviewer: 'y'/enter stores positive RootNode rows, 'n' stores negative
    rows and deactivates the node, 'q' quits.
    """
    from fantom_util.models.rnc_mlp_model import RootNodeClassifierMLP

    candidates = (
        db_session.query(Node)
        .outerjoin(RootNode, RootNode.node_id == Node.id)
        .filter(
            Node.active.is_(True),
            Node.parent_id.is_(None),
            RootNode.id.is_(None),
        )
        .order_by(Node.visited_count.desc())
        .all()
    )
    classifier = RootNodeClassifierMLP()

    for candidate in candidates:
        print("+----------------------+")
        texts = [u.utterance_text for u in candidate.utterances]
        scores = classifier.predict_list(texts)
        for utterance, score in zip(texts, scores):
            print(f"{utterance}: {score[0]}")
        print("\navg:", (sum(scores) / len(scores))[0], "\n")

        answer = input("Root node? Y/n/q ").lower()
        if answer == "q":
            exit()
        if answer == "n":
            candidate.active = False
            for utterance in texts:
                db_session.add(
                    RootNode(node_id=candidate.id,
                             utterance=utterance,
                             is_root_node=False))
        elif answer in ("", "y"):
            for utterance in texts:
                db_session.add(
                    RootNode(node_id=candidate.id,
                             utterance=utterance,
                             is_root_node=True))
        db_session.commit()
def set_incoherent(ext_job_id, external_worker_id, incoherent_node_utterance_id,
                   with_audio, assignment_id, hit_id):
    """Flag a node utterance as incoherent for the given job and worker.

    Records an IncoherentNodeUtteranceWorkerJob linking worker/job/HIT to
    the utterance, and appends an 'incoherent' NodeUtteranceStatus row.
    """
    job = get_job(ext_job_id)
    worker = _create_or_get_worker(external_worker_id)
    flagged = db_session.query(NodeUtterance).get(incoherent_node_utterance_id)

    db_session.add(
        IncoherentNodeUtteranceWorkerJob(
            node_utterance_id=flagged.id,
            worker_id=worker.id,
            job_id=job.id,
            assignment_id=assignment_id,
            hit_id=hit_id,
        ))
    db_session.add(
        NodeUtteranceStatus(
            with_audio=with_audio,
            node_utterance_id=flagged.id,
            status='incoherent',
        ))
    # TODO: set negative scoring for worker and node_utterances
    db_session.commit()
def update_children(child_node):
    """Recompute *child_node*'s path and recursively refresh its subtree."""
    db_session.add(child_node)
    child_node.path = child_node.recalculate_path()
    # iterate over a snapshot: the recursion may touch the children collection
    for descendant in list(child_node.children):
        logger.debug("found child %s", descendant.id)
        logger.debug("parent has children %s", child_node.children)
        update_children(descendant)
def _create_or_get_worker(external_worker_id, source=None):
    """Return the Worker with *external_worker_id*, creating it if absent.

    *source* is only applied to newly created workers; an existing row is
    returned unchanged.
    """
    existing = (
        db_session.query(Worker)
        .filter_by(external_worker_id=external_worker_id)
        .first()
    )
    if existing:
        return existing
    fresh = Worker(external_worker_id=external_worker_id, source=source)
    db_session.add(fresh)
    db_session.flush()
    return fresh
def create_jobs(job_type, amount=1):
    """Create up to *amount* crowd-sourcing jobs of *job_type*.

    *job_type* is 'user', 'system', or SPECIES_TAG; it selects which leaf
    nodes are eligible.  For each candidate node, a dialogue history (the
    last MAX_DIALOGUE_HISTORY nodes on its path) is loaded, one eligible
    node-utterance is sampled per history node, and a Job plus positioned
    JobNodeUtterance rows are stored.  Returns the created Job list.
    """
    if job_type not in ['user', 'system', SPECIES_TAG]:
        # BUG FIX: the original never formatted the "{}" placeholder
        raise Exception(
            f'work type: "{job_type}" does not exist. Use either system_task or user_task')
    # default: unexpanded leaves that have actually been visited
    job_filter = [Node.active_child_count == 0, Node.visited_count > 1]
    if job_type == SPECIES_TAG:
        job_filter = [Node.species == SPECIES_TAG, Node.active_child_count < 3]
    nodes = db_session.query(Node) \
        .filter(
            Node.score > 0,
            Node.is_user == (job_type != 'user'),
            Node.active.is_(True),
            *job_filter
        )\
        .order_by(Node.score.desc()) \
        .all()
    created_jobs = []
    for node in nodes:
        history_ids = node.path[-MAX_DIALOGUE_HISTORY:]
        history = db_session\
            .query(Node)\
            .filter(Node.id.in_(history_ids), Node.active.is_(True))\
            .options(joinedload(Node.utterances), joinedload(Node.node_utterances))\
            .order_by(Node.path_length.asc())\
            .all()
        history_length = len(history)
        # an inactive node in the path makes the history incomplete — skip
        if len(history_ids) != history_length:
            logger.warning(f'history_ids != history, {history_ids} != {history}')
            continue
        job_node_utterances = []
        for index, history_node in enumerate(history):
            # collect eligible utterances, then sample one at random;
            # the last history node is flagged so eligibility can differ
            pool_of_node_utterances = [
                node_utterance
                for node_utterance in history_node.node_utterances
                if _check_node_utterance_eligibility(
                    node_utterance, index == history_length - 1, job_type)
            ]
            if pool_of_node_utterances:
                job_node_utterances.append(random.choice(pool_of_node_utterances))
        # only build the job if every history node contributed an utterance
        if len(history_ids) == len(job_node_utterances):
            job = Job(job_type=job_type, persona_sample=get_persona_sample())
            db_session.add(job)
            db_session.flush()
            for i, node_utterance in enumerate(job_node_utterances):
                db_session.add(JobNodeUtterance(job_id=job.id,
                                                node_utterance_id=node_utterance.id,
                                                position=i))
            created_jobs.append(job)
            if len(created_jobs) == amount:
                break
    db_session.commit()
    print(f'created {len(created_jobs)} jobs')
    return created_jobs
def get_synonym_objects(synonym_path):
    """Return an Utterance row per line of *synonym_path*, creating missing ones."""
    rows = []
    with open(synonym_path, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            row = db_session.query(Utterance).filter_by(
                utterance_text=text).first()
            if row is None:
                row = Utterance(utterance_text=text)
                db_session.add(row)
                db_session.flush()
            rows.append(row)
    return rows
def linked_nodes(linked_to_node_id, linked_from_node_id):
    """Create a LinkedNodes row between two nodes, if absent.

    Silently returns when either node id is unknown or the link already
    exists; otherwise inserts and commits the link.
    """
    linked_to_node = db_session.query(Node).get(linked_to_node_id)
    linked_from_node = db_session.query(Node).get(linked_from_node_id)
    # BUG FIX: the original filtered on linked_to_node.id before verifying
    # the nodes exist, raising AttributeError for an unknown id.
    if not (linked_to_node and linked_from_node):
        return
    link = (db_session.query(LinkedNodes).filter(
        LinkedNodes.linked_to_node_id == linked_to_node.id,
        LinkedNodes.linked_from_node_id == linked_from_node.id,
    ).first())
    if link:
        return
    db_session.add(
        LinkedNodes(
            linked_to_node_id=linked_to_node_id,
            linked_from_node_id=linked_from_node_id,
        ))
    db_session.commit()
def update_amazon_anonymous():
    """Sync Utterance.amazon_anonymous flags from the S3 anonymised export.

    Downloads each export file to a local ``.tmp`` copy, collects the
    lower-cased utterances (skipping excluded patterns), flags matching DB
    rows, inserts rows for unseen utterances, then strips the ``.tmp``
    suffix to mark each file processed.
    """
    if not os.path.exists(ANONYMOUS_UTTERANCE_DIR):
        os.makedirs(ANONYMOUS_UTTERANCE_DIR)

    downloaded = []
    for s3_file in list_files_in_s3_bucket_dir(ALEXA_PRIZE_BUCKET_NAME,
                                               ANONYMOUS_UTTERANCE_DIR_ON_S3):
        base_name = s3_file.key.rsplit("/", 1)[1]
        local_path = f"{ANONYMOUS_UTTERANCE_DIR}/{base_name}.tmp"
        file_from_s3(ALEXA_PRIZE_BUCKET_NAME, s3_file.key, local_path)
        downloaded.append(local_path)

    collected = set()
    for local_path in downloaded:
        with open(local_path, "r") as handle:
            for line in handle:
                if re.search(EXCLUDED_UTTERANCES, normalize_text(line)):
                    logger.info("removed utterance: %s", line.strip())
                else:
                    collected.add(line.strip().lower())
    logger.info("anonymous_utterances %d", len(collected))

    for row in db_session.query(Utterance).all():
        if row.utterance_text in collected:
            if not row.amazon_anonymous:
                logger.info("setting anonymous: %s", row.utterance_text)
                row.amazon_anonymous = True
            # drop matches so only brand-new utterances remain below
            collected.remove(row.utterance_text)

    logger.info("anonymous_utterances left %d", len(collected))
    logger.info("to be added: %s", collected)
    for text in collected:
        db_session.add(
            Utterance(utterance_text=text, amazon_anonymous=True))
    db_session.commit()

    for local_path in downloaded:
        os.rename(local_path, local_path[:-4])
def add_utterance_to_node(utterance_text, node, source):
    """Attach an utterance (raw text or an Utterance row) to *node*.

    Creates the Utterance row if the text is unknown, appends it to the
    node, tags the resulting NodeUtterance association with *source*, and
    returns that association row.
    """
    if isinstance(utterance_text, Utterance):
        utterance = utterance_text
    else:
        utterance = db_session.query(Utterance).filter_by(
            utterance_text=utterance_text).first()
        if utterance is None:
            utterance = Utterance(utterance_text=utterance_text)
            db_session.add(utterance)
            db_session.flush()

    node.utterances.append(utterance)
    db_session.flush()

    association = db_session.query(NodeUtterance).filter_by(
        node_id=node.id, utterance_id=utterance.id).first()
    association.source = source
    db_session.flush()
    return association
def create_new_node(utterances, source="manual", parent_id=None, commit=False, species=None):
    """Create a Node (optionally under a parent) and attach *utterances*.

    *parent_id* may be a Node instance, a node id, or None for a root node.
    *utterances* may be a single string or a list of strings/Utterance rows.
    Returns the new Node; commits only when *commit* is True (otherwise the
    caller controls the transaction).
    """
    # isinstance instead of `type(x) == T`: idiomatic and accepts subclasses
    if isinstance(parent_id, Node):
        parent = parent_id
    elif parent_id is not None:
        parent = db_session.query(Node).get(parent_id)
    else:
        parent = None
    node = Node(parent=parent, species=species)
    db_session.add(node)
    db_session.flush()  # flush so node.id is assigned for the path
    node.path = (parent.path if parent else []) + [node.id]
    if isinstance(utterances, str):
        utterances = [utterances]
    for utterance in utterances:
        add_utterance_to_node(utterance, node, source)
    if commit:
        db_session.commit()
    return node
def old_anonymize():
    """Interactively review the most frequent unclassified user utterances.

    Repeatedly fetches the most common Conversation.user_utterance that has
    no AnonymousUtterance row and asks whether it is appropriate:
    'y'/enter stores appropriate=True, 'n' stores appropriate=False,
    'q' quits.  Stops cleanly when every utterance has been reviewed.
    """
    while True:
        row = (db_session.query(
            Conversation.user_utterance,
            func.count(Conversation.user_utterance)).group_by(
                Conversation.user_utterance).order_by(
                    func.count(Conversation.user_utterance).desc()).filter(
                        ~exists().where(Conversation.user_utterance ==
                                        AnonymousUtterance.text)).first())
        # BUG FIX: .first() returns None once everything is reviewed; the
        # original unpacked the result unconditionally and crashed.
        if row is None:
            break
        utterance, count = row
        print(count, utterance)
        user_input = input("Appropriate? Y/n/q ")
        if user_input.lower() == "q":
            exit()
        elif user_input.lower() == "n":
            print("-")
            db_session.add(
                AnonymousUtterance(text=utterance, appropriate=False))
        elif user_input == "" or user_input.lower() == "y":
            print("+")
            db_session.add(AnonymousUtterance(text=utterance, appropriate=True))
        db_session.commit()
def _get_utterance_lookup_table(parent):
    """Build a {normalized utterance text: NodeUtterance} map over *parent*'s children.

    Side effect: when the same normalized text appears on two different
    child nodes, the node with fewer total descendants is merged into the
    one with more (via merge_nodes), so duplicates are collapsed while the
    table is built.
    """
    nodes = db_session.query(Node).filter(Node.parent == parent).all()
    node_utterances = {}
    for node in nodes:
        for node_utterance in node.node_utterances:
            # re-attach to the session (merge_nodes commits may expire objects)
            db_session.add(node_utterance)
            text = normalize_text(node_utterance.utterance.utterance_text)
            if not node_utterances.get(text):
                # first time we see this text: record it
                node_utterances[text] = node_utterance
            elif node_utterances.get(text).node.id != node.id:
                # same text on a *different* node: merge the smaller subtree
                # into the larger one, keeping the bigger node as survivor
                other_node = node_utterances.get(text).node
                node_child_size = get_full_child_count(node)
                other_node_child_size = get_full_child_count(other_node)
                if node_child_size > other_node_child_size:
                    logger.info(f"merging.. {node.id} <- {other_node.id}")
                    merge_nodes(node.id, other_node.id, True)
                else:
                    logger.info(f"merging.. {other_node.id} <- {node.id}")
                    merge_nodes(other_node.id, node.id, True)
    return node_utterances
def split_nodes(node_id):
    """Interactively split one utterance off *node_id* into a new sibling node.

    Prompts for the utterance to move, creates a new node under the same
    parent, reassigns the chosen NodeUtterance to it, then optionally moves
    selected children across.  Commits at the end; pressing enter at the
    first prompt aborts with None.
    """
    node = db_session.query(Node).get(node_id)
    print(
        "which utterance for this node would you like to split (currently only one at a time) (enter to quit)"
    )
    # index -> NodeUtterance, so the reviewer can pick by number
    split_utterances = {}
    for i, node_utterance in enumerate(node.node_utterances):
        split_utterances[i] = node_utterance
        print(f"({i}) {node_utterance.utterance.utterance_text}")
    print("")
    print("----- CHILDREN -----")
    for child in node.children:
        print("*", ", ".join([x.utterance_text for x in child.utterances]))
    print("-----")
    print("")
    utterance_to_split = input(">")
    if not utterance_to_split:
        return None
    split_node_utterance = split_utterances[int(utterance_to_split)]
    parent = node.parent
    # new sibling under the same parent; utterance is moved, not copied
    new_node = create_new_node([], parent_id=parent, source="")
    split_node_utterance.node = new_node
    db_session.add(split_node_utterance)
    db_session.add(node)
    print("new node with utterance", [x.utterance_text for x in new_node.utterances])
    print(
        "Which children do you want to bring over to the new node (press enter for none, use comma for multiple)"
    )
    # index -> child Node, same pick-by-number pattern as above
    move_kids = {}
    for i, child in enumerate(node.children):
        move_kids[i] = child
        kids = ", ".join([x.utterance_text for x in child.utterances])
        print(f"({i}) {kids}")
    print("")
    kids_to_move = input(">").replace(" ", "")
    if kids_to_move:
        splited_kids = [int(x) for x in kids_to_move.split(",")]
        for kid in splited_kids:
            move_kids[kid].parent = new_node
            db_session.add(move_kids[kid])
            print("adding", move_kids[kid].id, "to", new_node.id, move_kids[kid].parent.id)
    print("done")
    # db_session.rollback()
    db_session.commit()
def finish_job(ext_job_id, external_worker_id, answer, corrections, extra_questions,
               with_audio, used_text_input, assignment_id, hit_id):
    """Store a worker's completed job: their answer, corrections, and reviews.

    Creates a new node for *answer* under the job's last dialogue node,
    links it to the worker/job/HIT, applies *corrections* (a mapping of
    node_utterance id -> corrected text), and records status rows for each
    reviewed extra question.  Commits once at the end.
    """
    job = get_job(ext_job_id)
    nodes = [x.node for x in job.node_utterances]
    # NOTE(review): last_node can be None for a job with no node_utterances,
    # yet last_node.id is dereferenced below — confirm jobs always have at
    # least one node_utterance.
    last_node = nodes[-1] if nodes else None
    worker = _create_or_get_worker(external_worker_id)
    node = create_new_node([answer], parent_id=last_node.id, source='typed')
    node_utterance = node.node_utterances[0]
    node_utterance.with_audio = with_audio
    node_utterance.used_text_input = used_text_input
    node_utterance_worker_job = NodeUtteranceWorkerJob(
        node_utterance_id=node_utterance.id,
        worker_id=worker.id,
        job_id=job.id,
        assignment_id=assignment_id,
        hit_id=hit_id
    )
    db_session.add(node_utterance_worker_job)
    # each correction adds the fixed text to the original node and marks
    # the old utterance 'corrected'
    for old_node_utterance_id, corrected_text in corrections.items():
        old_node_utterance = db_session.query(NodeUtterance).get(old_node_utterance_id)
        add_utterance_to_node(corrected_text, old_node_utterance.node, 'correction')
        node_utterance_status = NodeUtteranceStatus(
            node_utterance_id=old_node_utterance.id,
            status='corrected'
        )
        db_session.add(node_utterance_status)
    # record which review flags the worker ticked for each extra question
    for extra_question in extra_questions:
        if extra_question['type'] != 'api':
            extra_node_utterance = db_session.query(NodeUtterance).get(extra_question['id'])
            for status in ['suitable', 'equivalent', 'needs_correction']:
                if extra_question[status]:
                    db_session.add(NodeUtteranceStatus(
                        node_utterance_id=extra_node_utterance.id,
                        referenced_node_utterance_id=node_utterance.id,
                        status=status
                    ))
        else:
            # 'api' questions are currently ignored (handling was disabled)
            # extra_node_utterance = add_utterance_to_node(
            #     extra_question['text'], node, extra_question['id']
            # )
            pass
    # TODO: set positive scoring for worker and node_utterances
    db_session.commit()
def merge_nodes(left_node_id, right_node_id, merged=True):
    """Merge *right_node_id* into *left_node_id* (or just record the pairing).

    With merged=True: moves every utterance and child of the right node onto
    the left node, sums visited counts, inactivates the right node, and
    records a Merging row.  With merged=False: only records the Merging row
    (a "do not merge" decision).  Either way, pending PotentialNodeMerge
    rows involving the right node are deleted.
    """
    if merged:
        left_node = db_session.query(Node).get(left_node_id)
        right_node = db_session.query(Node).get(right_node_id)
        # skip if this pair was already decided, in either direction
        merge_1 = (db_session.query(Merging).filter(
            Merging.left_node_id == left_node.id,
            Merging.right_node_id == right_node.id,
        ).first())
        merge_2 = (db_session.query(Merging).filter(
            Merging.left_node_id == right_node.id,
            Merging.right_node_id == left_node.id,
        ).first())
        if merge_1 or merge_2:  # or left_node.id == right_node.id:
            return
        # re-point every utterance of the right node at the left node
        # (iterate a copy: reassignment mutates the collection)
        for node_utterance in right_node.node_utterances[:]:
            logger.debug(
                "node has utterance %s %s",
                node_utterance.id,
                node_utterance.utterance.utterance_text,
            )
            node_utterance.node = left_node
            node_utterance.node_id = left_node_id
            db_session.add(node_utterance)
            db_session.add(
                NodeUtteranceStatus(node_utterance_id=node_utterance.id,
                                    status="merged"))
        logger.debug("(before) left node children: %s", left_node.children)
        logger.debug("(before) right node children: %s", right_node.children)
        # move children across (copy again: set_parent mutates children)
        for child in right_node.children[:]:
            logger.debug("found child %s", child.id)
            set_parent(child.id, left_node.id)
        logger.debug("left node children: %s", left_node.children)
        logger.debug("right node children: %s", right_node.children)
        left_node.visited_count += right_node.visited_count
        db_session.commit()
        # capture the id before inactivation; the commit may expire the object
        the_right_node_id = right_node.id
        inactivate_node(right_node.id)
        db_session.commit()
        db_session.add(
            Merging(
                left_node_id=left_node_id,
                right_node_id=the_right_node_id,
                merged=merged,
            ))
    else:
        db_session.add(
            Merging(left_node_id=left_node_id,
                    right_node_id=right_node_id,
                    merged=merged))
    # the right node is resolved either way: clear its pending suggestions
    db_session.query(PotentialNodeMerge).filter(
        (PotentialNodeMerge.left_node_id == right_node_id)
        | (PotentialNodeMerge.right_node_id == right_node_id)).delete()
    db_session.commit()
# NOTE(review): this chunk looks like an older, print-based duplicate of
# update_amazon_anonymous() above (different constants: BUCKET_NAME /
# PATH_TO_UTTERANCES).  The `for ... in ...:` loop header that should
# precede the first two statements appears to have been truncated —
# `file` and `file_name` are otherwise undefined here.  Confirm against VCS.
file_from_s3(BUCKET_NAME, file.key, f'{PATH_TO_UTTERANCES}/{file_name}.tmp')
files_to_process.append(f'{PATH_TO_UTTERANCES}/{file_name}.tmp')

anonymous_utterances = []
for file_path in files_to_process:
    with open(file_path, 'r') as f:
        for line in f.readlines():
            # skip anything matching the exclusion pattern (on normalised text)
            if re.search(EXCLUDED_UTTERANCES, normalize_text(line)):
                print('removed utterance', line.strip())
                continue
            anonymous_utterances.append(line.strip().lower())

# de-duplicate before comparing against the database
anonymous_utterances = set(anonymous_utterances)
print('-----', len(anonymous_utterances))

utterances = db_session.query(Utterance).all()
for utterance in utterances:
    if utterance.utterance_text in anonymous_utterances:
        if not utterance.amazon_anonymous:
            print('setting anonymous:', utterance.utterance_text)
            utterance.amazon_anonymous = True
        # remove matches so only brand-new utterances remain below
        anonymous_utterances.remove(utterance.utterance_text)

print('-----', len(anonymous_utterances), anonymous_utterances)
for new_utterance in anonymous_utterances:
    db_session.add(
        Utterance(utterance_text=new_utterance, amazon_anonymous=True))
db_session.commit()

# drop the '.tmp' suffix to mark each file as processed
for file_path in files_to_process:
    os.rename(file_path, file_path[:-4])
def process_conversation(conversation_id, root_node_utterances, automate):
    """Fold one logged conversation into the dialogue graph.

    Replays the conversation's turns: where a user utterance matches an
    existing child node the visit counters are bumped and the matching
    system-response child becomes the new parent; unmatched utterances
    either create a new node (under a known parent) or are appended to a
    candidate-root-node file.  With automate=False the operator is asked
    before committing; otherwise changes are committed directly.

    root_node_utterances: {normalized text: NodeUtterance} for root nodes.
    Returns None early when every row was already processed.
    """
    conversations = (
        db_session.query(Conversation)
        .filter(Conversation.conversation_id == conversation_id)
        .order_by(Conversation.interaction_timestamp)
        .all()
    )
    logger.debug("processing conversation_id: %s", conversation_id)
    conversation_chunks = []
    processed = 0
    for conversation in conversations:
        if conversation.intent == "LaunchRequestIntent":
            processed += 1
            continue
        # a turn matching a root-node utterance starts a new chunk
        if root_node_utterances.get(normalize_text(conversation.user_utterance)):
            conversation_chunks.append([])
        if not conversation_chunks:
            conversation_chunks = [[]]
        # NOTE(review): this appends the turn to EVERY chunk, not just the
        # latest one — looks like it was meant to be
        # conversation_chunks[-1].append(conversation); confirm intent.
        for conversation_chunk in conversation_chunks:
            conversation_chunk.append(conversation)
        if conversation.processed:
            processed += 1
    if processed >= len(conversations):
        logger.debug("skipping due to all being processed")
        return None
    logger.debug([[y.user_utterance for y in x] for x in conversation_chunks])
    # single timestamp marks everything touched in this run
    processed_time = datetime.datetime.now()
    for conversations in conversation_chunks:
        parent = None
        # at the root level, look up against the supplied root-node table
        child_nodes = root_node_utterances
        for idx, conversation in enumerate(conversations):
            text = normalize_text(conversation.user_utterance)
            if conversation.intent == "LaunchRequestIntent":
                logger.debug(
                    "skipping: LaunchRequestIntent: %d %s",
                    idx,
                    conversation.user_utterance,
                )
                continue
            if not text:
                logger.debug(
                    "skipping: user utterance is empty: %d %s",
                    idx,
                    conversation.user_utterance,
                )
                continue
            if re.search(EXCLUDED_UTTERANCES, text) or re.search(
                EXCLUDED_UTTERANCES, conversation.user_utterance
            ):
                logger.debug(
                    "breaking: Detected excluded utterance %s -> %s",
                    conversation.user_utterance,
                    text,
                )
                break
            logger.debug("- %d %s %s", idx, conversation.user_utterance, text)
            if parent:
                # deeper in the dialogue: look up among the parent's children
                child_nodes = _get_utterance_lookup_table(parent)
            show_kids = str(child_nodes.keys()) if len(child_nodes.keys()) < 4 else ""
            logger.debug(
                f"-- Searching among {len(child_nodes.keys())} nodes. %s", show_kids
            )
            node_utterance = child_nodes.get(text)
            if node_utterance:
                node = node_utterance.node
                if not node.active:
                    logger.debug(f"This node ({node.id}) has been marked as inactive.")
                    break
                logger.debug(
                    "--- Found existing node node_id: %s, node_utterance_id: %s",
                    node.id,
                    node_utterance.id,
                )
                # only count turns not yet processed (or processed this run)
                if (
                    not conversation.processed
                    or conversation.processed == processed_time
                ):
                    node.visited_count += 1
                    logger.debug(
                        "---- Increase count for node %s (%d)",
                        node.id,
                        node.visited_count,
                    )
                    db_session.add(node)
                    conversation.processed = processed_time
                    db_session.add(conversation)
                # find which child matches the logged system response;
                # it becomes the parent for the next user turn
                parent = None
                for child in node.children:
                    for utterance in child.utterances:
                        if utterance.id == conversation.graphsearch_matched_utterance_id or normalize_text(
                            utterance.utterance_text
                        ) == normalize_text(
                            conversation.system_utterance
                        ):
                            logger.debug(
                                "----- Found system response: %s",
                                utterance.utterance_text,
                            )
                            if (
                                not conversation.processed
                                or conversation.processed == processed_time
                            ):
                                logger.debug(
                                    "----- Increase count for child node %s (%d)",
                                    child.id,
                                    child.visited_count,
                                )
                                child.visited_count += 1
                                db_session.add(child)
                            parent = child
                    if parent:
                        break
                if not parent:
                    logger.debug(
                        "---- No system response found: %s",
                        conversation.system_utterance,
                    )
                    break
            else:
                logger.debug("--- No existing node found")
                if (
                    not conversation.processed
                    or conversation.processed == processed_time
                ):
                    if parent:
                        logger.debug(
                            "--- Adding new node %s", conversation.user_utterance
                        )
                        node = create_new_node(
                            [conversation.user_utterance],
                            parent_id=parent.id if parent else None,
                            source="automatic_population",
                        )
                    else:
                        # no parent: stash the utterance as a potential root
                        # node in a side file instead of creating a node.
                        # NOTE(review): `file_name` is not defined in this
                        # function — presumably a module-level constant;
                        # confirm it exists.
                        # root_node_utterances[text] = node.node_utterances[0]
                        try:
                            logger.debug(
                                "--- New potential root node: "
                                + conversation.user_utterance
                            )
                            with open(file_name, "a") as storage_file:
                                logger.debug(
                                    "---XXXXXXXXXXXXXXXXXXXXXXX: Adding first utterance to "
                                    + file_name
                                )
                                storage_file.write(conversation.user_utterance + "\n")
                        # NOTE(review): bare except silently swallows all
                        # errors here, not just missing files — consider
                        # narrowing to OSError.
                        except:
                            logger.debug("--- File not found")
                    conversation.processed = processed_time
                # stop replaying this chunk after the first unmatched turn
                break
    if not automate:
        response = input("Populate? N/y\n")
        if response.lower() == "y":
            db_session.commit()
            logger.debug("committing!")
        else:
            db_session.rollback()
    else:
        db_session.commit()