def serialize(self):
    """Build the dictionary representation of this four-choice question.

    Returns:
        A dict holding, in this order: ``id`` and ``prompt``; the four
        ``choice_N`` texts; the four ``choice_N_correct`` flags; the four
        ``choice_N_pinyin`` values (``pinyin()`` applied to each choice);
        ``followed_by`` and ``preceded_by``; ``words`` (the output of
        ``segment()`` for choices 1 through 4, concatenated in that order);
        and the ``created_at`` / ``updated_at`` values.
    """
    slots = ("choice_1", "choice_2", "choice_3", "choice_4")

    # Segment every choice up front, keeping the choices in order
    words = []
    for slot in slots:
        words.extend(segment(getattr(self, slot)))

    # Keys are inserted group by group so the resulting key order stays
    # id, prompt, choices, correctness flags, pinyin, then the rest
    payload = {"id": self.id, "prompt": self.prompt}
    for slot in slots:
        payload[slot] = getattr(self, slot)
    for slot in slots:
        payload[slot + "_correct"] = getattr(self, slot + "_correct")
    for slot in slots:
        payload[slot + "_pinyin"] = pinyin(getattr(self, slot))

    payload["followed_by"] = self.followed_by
    payload["preceded_by"] = self.preceded_by
    payload["words"] = words
    payload["created_at"] = self.created_at
    payload["updated_at"] = self.updated_at
    return payload
def update(self, key, value):
    """Set one field on this question, keeping derived fields in sync.

    Args:
        key: Name of the field being updated.
        value: New value for the field.

    Behavior by key:
        * ``"chinese"``: stores the text and recomputes ``pinyin``,
          ``words`` (JSON list of segmented words) and ``words_pinyin``
          (JSON list with ``pinyin()`` of each segmented word).
        * ``"other_english_answers"``: stores the value JSON-encoded.
        * anything else: delegated to the parent class ``update``.
    """
    if key == "chinese":
        # Segment once and reuse the result for both derived fields.
        # list() materializes the result so it can be iterated twice and
        # JSON-encoded even if segment() hands back a lazy iterable.
        segmented = list(segment(value))

        # Compute every derived value before touching the instance, so a
        # failure in pinyin()/segment() cannot leave it half updated.
        sentence_pinyin = pinyin(value)
        words_json = json.dumps(segmented)
        words_pinyin_json = json.dumps([pinyin(word) for word in segmented])

        self.chinese = value
        self.pinyin = sentence_pinyin
        self.words = words_json
        self.words_pinyin = words_pinyin_json
    elif key == "other_english_answers":
        self.other_english_answers = json.dumps(value)
    else:
        super(ScribeQuestion, self).update(key, value)
def update(self, key, value):
    """Store the ``parts`` of this question and refresh its word list.

    Args:
        key: Name of the field being updated.
        value: New value; for ``"parts"`` it is iterated, and each item is
            checked for a ``"prompt"`` key.

    When ``key`` is ``"parts"``, ``self.parts`` receives the JSON-encoded
    value and ``self.words`` receives the JSON-encoded list of words
    obtained by running ``segment()`` over every part's ``"prompt"``, in
    order. Parts without a ``"prompt"`` key contribute no words.

    NOTE(review): no other key is handled in the code visible here --
    confirm that this is intended.
    """
    if key == "parts":
        self.parts = json.dumps(value)

        # Only parts that actually carry a prompt are segmented
        prompts = [part["prompt"] for part in value if "prompt" in part]

        prompt_words = []
        for prompt in prompts:
            prompt_words.extend(segment(prompt))

        self.words = json.dumps(prompt_words)
def update_word_lists():
    """Updates the list of new words for all passages, in story order.

    Stories are read in ascending ``position`` and their JSON-encoded
    ``passage_ids`` lists are concatenated to obtain the global passage
    order. The passages are then walked in that order; for each one, the
    words of its ``"text"`` components that did not occur in any earlier
    passage (punctuation excluded) are stored JSON-encoded in
    ``passage.new_words``. Everything is committed once at the end.

    The per-passage list comes from ``np.setdiff1d``, so it holds each new
    word once, in sorted order rather than in the order the words occur in
    the text.
    """
    # Retrieve all stories first in order to order the passages correctly
    stories = Story.query.order_by(Story.position.asc()).all()

    # Create a correctly ordered list of passage ids
    passage_ids = []
    for story in stories:
        passage_ids.extend(json.loads(story.passage_ids))

    # Get all of the passages by the ids that are in the list
    passages = Passage.query.filter(Passage.id.in_(passage_ids)).all()

    # The database query doesn't preserve ordering, so sort the passages by
    # the first position of their id in passage_ids. A lookup table avoids
    # a linear list.index() scan per passage.
    position = {}
    for index, passage_id in enumerate(passage_ids):
        position.setdefault(passage_id, index)
    sorted_passages = sorted(passages, key=lambda p: position[p.id])

    # Punctuation tokens never count as new words
    punctuation_words = ("。", ",", "!", "?")

    # Keep track of all words seen in the passages processed so far
    words = []

    # Iterate in story order: walking the unsorted query result would make
    # "first appearance" depend on whatever order the database returned.
    for passage in sorted_passages:
        # Get the components of each passage
        components = json.loads(passage.data)["components"]

        # Collect the segmented words of every text component
        passage_words = []
        for component in components:
            if component["type"] == "text":
                passage_words.extend(segment(component["text"]))

        # Use numpy to figure out which words have appeared for the first time
        new_words = np.setdiff1d(passage_words, words).tolist()

        # Remove punctuation from new words list
        new_words = [word for word in new_words
                     if word not in punctuation_words]
        passage.new_words = json.dumps(new_words)

        # Add the new words to the general words list
        words.extend(new_words)

    # Save all changes in a single commit
    db.session.commit()
def get_scribe_stats():
    """Summarize dictionary coverage for the Scribe questions.

    Every ``ScribeQuestion`` has its ``chinese`` text segmented into words;
    the distinct words are then compared with the ``Entry`` rows whose
    ``chinese`` value is among them.

    Returns:
        A dict with:
            ``name``: the literal ``"Scribe"``.
            ``completed_entries``: number of distinct words that have a
                matching entry.
            ``total_entries``: number of distinct words overall.
            ``completed_questions``: number of Scribe questions.
            ``total_questions``: the fixed value ``200``.
            ``needed_entries``: list of the words with no entry yet.
    """
    questions = ScribeQuestion.query.all()

    # Distinct words across all questions
    words = set()
    for question in questions:
        words.update(segment(question.chinese))
    total_entries = len(words)

    entries = Entry.query.filter(Entry.chinese.in_(words)).all()

    # A set difference tolerates several entries sharing one `chinese`
    # value, and rows whose value the database matched without being
    # exactly equal; set.remove() would raise KeyError in both cases.
    covered = {entry.chinese for entry in entries}
    needed = words - covered

    return {
        "name": "Scribe",
        "completed_entries": total_entries - len(needed),
        "total_entries": total_entries,
        "completed_questions": len(questions),
        "total_questions": 200,
        "needed_entries": list(needed)
    }
def serialize(self):
    """Build the dictionary representation of this brace-marked question.

    The prompt may contain a ``{`` and a ``}``. ``start_range`` is the
    index of the first ``{`` in the raw prompt and ``end_range`` is the
    index of the first ``}`` minus one; both are ``0`` when the prompt has
    no ``{``. ``words`` is the segmentation of the prompt with every brace
    removed.

    NOTE(review): presumably the two values delimit the marked span inside
    the brace-stripped prompt -- confirm with the consumer. When ``{`` is
    present but ``}`` is missing, ``start_range`` keeps the brace position
    while ``end_range`` stays ``0``.

    Returns:
        A dict with ``id``, ``prompt``, ``explanation``,
        ``correct_sentence``, ``start_range``, ``end_range``, ``words``,
        ``created_at`` and ``updated_at``.
    """
    raw_prompt = self.prompt

    opening = raw_prompt.find("{")
    if opening < 0:
        # No opening brace: neither bound is set
        start_range, end_range = 0, 0
    else:
        start_range = opening
        closing = raw_prompt.find("}")
        if closing < 0:
            end_range = 0
        else:
            end_range = closing - 1

    # Segment the prompt without its brace markers
    stripped_prompt = raw_prompt.replace("{", "")
    stripped_prompt = stripped_prompt.replace("}", "")

    result = {
        "id": self.id,
        "prompt": self.prompt,
        "explanation": self.explanation,
        "correct_sentence": self.correct_sentence,
        "start_range": start_range,
        "end_range": end_range,
        "words": segment(stripped_prompt),
        "created_at": self.created_at,
        "updated_at": self.updated_at
    }
    return result
def get_passage(passage_id):
    """Retrieves a passage with the provided passage id.

    Every ``"text"`` component of the passage data gains a ``"words"``
    list. Each item in it holds the segmented ``chinese`` word, its joined
    ``pinyin`` (capitalized at the start of the component and after
    sentence-ending punctuation) and a ``punctuation`` flag.

    Args:
        passage_id: The id of the passage being retrieved.

    Returns:
        The JSON data for this passage, or the error response for a
        missing passage / a passage the user has not reached yet.
    """
    passage = Passage.query.filter_by(id=passage_id).first()
    if not passage:
        # Return 404 if this passage doesn't exist
        return errors.passage_not_found()

    passage_data = passage.serialize()

    # Admins see every passage as completed; other users get their status
    if current_user.is_admin:
        passage_data["status"] = "completed"
    else:
        passage_data["status"] = passage_status_for_user(
            int(passage_id), current_user.id)

    # If the user hasn't reached this passage yet, return 403
    if passage_data["status"] == "locked":
        return errors.passage_not_reached()

    # Punctuation token -> (replacement, whether the next word starts a
    # new sentence and therefore gets capitalized).
    # NOTE(review): three of these tokens look identical to their
    # replacements as displayed; presumably the keys were meant to be the
    # full-width forms -- confirm.
    punctuation_map = {
        "。": (".", True),
        ",": (",", False),
        "!": ("!", True),
        "?": ("?", True),
    }

    # Extract the words in each text component and add to JSON response
    for idx, component in enumerate(passage_data["data"]["components"]):
        if component["type"] != "text":
            continue

        # Separate Chinese sentences into separate words
        tokens = [{"chinese": chunk, "punctuation": False}
                  for chunk in segment(component["text"])]

        # One joined pinyin string per word: flatten the nested result of
        # pinyin() and concatenate the pieces
        readings = []
        for token in tokens:
            syllable_groups = pinyin(token["chinese"])
            readings.append(
                "".join(piece for group in syllable_groups for piece in group))

        # True if the next word in the loop needs to be capitalized
        capitalize_next_word = True
        for token, reading in zip(tokens, readings):
            if capitalize_next_word:
                # Capitalize this word, don't capitalize the next one
                reading = reading.capitalize()
                capitalize_next_word = False

            # Replace punctuation tokens and flag the word as punctuation
            if reading in punctuation_map:
                reading, ends_sentence = punctuation_map[reading]
                if ends_sentence:
                    capitalize_next_word = True
                token["punctuation"] = True

            token["pinyin"] = reading

        # Add the words to the passage data
        passage_data["data"]["components"][idx]["words"] = tokens

    return jsonify(passage_data)