Python segment 예제들, app.chinese.segment Python 예제들

예제 #1

0

파일 보기

파일: models.py 프로젝트: storytimeworks/backend

    def serialize(self):
        """Build a plain-dict representation of this multiple-choice question.

        Besides the stored columns, the result carries the pinyin of each of
        the four choices and a single ``words`` list holding the segmented
        words of all four choices, in choice order.

        Returns:
            A dict with the question's fields, ready to be serialized.
        """
        slots = ("choice_1", "choice_2", "choice_3", "choice_4")

        # Segment every choice and collect the words into one flat list
        words = [
            word
            for slot in slots
            for word in segment(getattr(self, slot))
        ]

        payload = {"id": self.id, "prompt": self.prompt}

        # The three groups are filled one after another so the key order of
        # the resulting dict stays: texts, correctness flags, then pinyin
        for slot in slots:
            payload[slot] = getattr(self, slot)

        for slot in slots:
            payload[slot + "_correct"] = getattr(self, slot + "_correct")

        for slot in slots:
            payload[slot + "_pinyin"] = pinyin(getattr(self, slot))

        payload["followed_by"] = self.followed_by
        payload["preceded_by"] = self.preceded_by
        payload["words"] = words
        payload["created_at"] = self.created_at
        payload["updated_at"] = self.updated_at

        return payload

예제 #2

0

파일 보기

파일: models.py 프로젝트: storytimeworks/backend

 def update(self, key, value):
     """Set one field of this question, refreshing any derived fields.

     Args:
         key: Name of the field being updated.
         value: The new value. For "chinese" this is the sentence text; for
             "other_english_answers" it is stored JSON-encoded.

     Setting "chinese" also recomputes ``pinyin``, ``words`` and
     ``words_pinyin`` from the new text. Any other key is delegated to the
     parent class's ``update``.
     """
     if key == "chinese":
         # Segment once and reuse the result for both derived columns
         words = segment(value)

         self.chinese = value
         self.pinyin = pinyin(value)
         self.words = json.dumps(words)
         self.words_pinyin = json.dumps([pinyin(word) for word in words])
     elif key == "other_english_answers":
         self.other_english_answers = json.dumps(value)
     else:
         super(ScribeQuestion, self).update(key, value)

예제 #3

0

파일 보기

파일: models.py 프로젝트: storytimeworks/backend

    def update(self, key, value):
        """Store new ``parts`` and rebuild the ``words`` list derived from them.

        Args:
            key: Name of the field being updated; only "parts" is handled in
                the code shown here.
            value: An iterable of parts. Parts containing a "prompt" key have
                that prompt segmented into words.
        """
        if key == "parts":
            self.parts = json.dumps(value)

            # Flatten the segmented prompt of every part that has one
            prompt_words = [
                word
                for part in value
                if "prompt" in part
                for word in segment(part["prompt"])
            ]

            self.words = json.dumps(prompt_words)

예제 #4

0

파일 보기

파일: controllers.py 프로젝트: storytimeworks/backend

def update_word_lists():
    """Updates the list of new words for all passages, in order of appearance.

    Passages are visited in story order (stories sorted by position, then each
    story's passage ids in their stored order), so a word is recorded as "new"
    only for the first passage in which it appears. Every visited passage gets
    its ``new_words`` attribute overwritten with a JSON list, and all changes
    are committed in a single transaction at the end.
    """

    # Retrieve all stories first in order to order the passages correctly
    stories = Story.query.order_by(Story.position.asc()).all()
    passage_ids = []

    # Create a correctly ordered list of passage ids
    for story in stories:
        passage_ids.extend(json.loads(story.passage_ids))

    # Get all of the passages by the ids that are in the list
    passages = Passage.query.filter(Passage.id.in_(passage_ids)).all()

    # Map each passage id to the first position at which it occurs, so that
    # sorting doesn't need a linear list.index() scan per passage
    position_of = {}
    for position, passage_id in enumerate(passage_ids):
        position_of.setdefault(passage_id, position)

    # Sort the retrieved passages according to the order of passage_ids, since
    # the database query doesn't preserve ordering
    sorted_passages = sorted(passages, key=lambda passage: position_of[passage.id])

    # Punctuation tokens that should never be reported as new words
    punctuation_words = ["。", "，", "！", "？"]

    # Keep track of all words in all passages
    words = []

    # Iterate in story order; iterating the unsorted query result would mark
    # words as new in whichever passage the database happened to return first
    for passage in sorted_passages:
        # Get the components of each passage
        components = json.loads(passage.data)["components"]
        passage_words = []

        # Use jieba to find the words in each text component
        for component in components:
            if component["type"] == "text":
                passage_words.extend(segment(component["text"]))

        # Use numpy to figure out which words have appeared for the first time
        # (setdiff1d yields the sorted, de-duplicated difference)
        new_words = np.setdiff1d(passage_words, words).tolist()

        # Remove punctuation from new words list
        new_words = [word for word in new_words if word not in punctuation_words]

        passage.new_words = json.dumps(new_words)

        # Add the new words to the general words array
        words.extend(new_words)

    # Save all changes in MySQL
    db.session.commit()

예제 #5

0

파일 보기

def get_scribe_stats():
    """Summarize dictionary coverage for the words used by Scribe questions.

    Collects the distinct segmented words of every ScribeQuestion, looks up
    which of them already have an Entry, and reports the counts together with
    the words that still lack an entry.

    Returns:
        A dict with the game name, completed/total entry counts,
        completed/total question counts and the list of words without an
        entry.
    """
    questions = ScribeQuestion.query.all()

    # Distinct words across all questions
    words = set()

    for question in questions:
        words.update(segment(question.chinese))

    total_entries = len(words)

    entries = Entry.query.filter(Entry.chinese.in_(words)).all()

    # Whatever is left in the set afterwards still needs an entry. discard()
    # is used instead of remove() so that several entries sharing the same
    # chinese text don't raise KeyError on the second occurrence.
    for entry in entries:
        words.discard(entry.chinese)

    return {
        "name": "Scribe",
        "completed_entries": total_entries - len(words),
        "total_entries": total_entries,
        "completed_questions": len(questions),
        # NOTE(review): hard-coded; presumably the target number of questions
        "total_questions": 200,
        "needed_entries": list(words)
    }

예제 #6

0

파일 보기

파일: models.py 프로젝트: storytimeworks/backend

    def serialize(self):
        """Build a plain-dict representation of this question.

        The prompt marks a span with ``{`` and ``}``. ``start_range`` and
        ``end_range`` are the positions of those markers with the brace
        characters discounted (the closing index is reduced by one because the
        opening brace is removed from the text). If either marker is missing,
        both values are 0. ``words`` holds the segmented prompt with the
        braces stripped.

        Returns:
            A dict with the question's fields, ready to be serialized.
        """
        start_range = 0
        end_range = 0

        try:
            opening = self.prompt.index("{")
            closing = self.prompt.index("}")
        except ValueError:
            # Leave both at 0 rather than reporting a start without an end
            pass
        else:
            start_range = opening
            end_range = closing - 1

        prompt_text = self.prompt.replace("{", "").replace("}", "")
        words = segment(prompt_text)

        return {
            "id": self.id,
            "prompt": self.prompt,
            "explanation": self.explanation,
            "correct_sentence": self.correct_sentence,
            "start_range": start_range,
            "end_range": end_range,
            "words": words,
            "created_at": self.created_at,
            "updated_at": self.updated_at
        }

예제 #7

0

파일 보기

파일: controllers.py 프로젝트: storytimeworks/backend

def get_passage(passage_id):
    """Retrieves a passage with the provided passage id.

    Every text component of the passage is augmented with a ``words`` list in
    which each item has the Chinese word, its pinyin and a punctuation flag.

    Args:
        passage_id: The id of the passage being retrieved.

    Returns:
        The JSON data for this passage, or an error response when the passage
        doesn't exist or the current user hasn't reached it yet.
    """

    passage = Passage.query.filter_by(id=passage_id).first()

    # Return 404 if this passage doesn't exist
    if not passage:
        return errors.passage_not_found()

    passage_data = passage.serialize()

    # Admins always see the passage as completed; everyone else gets the
    # status computed for their account
    if current_user.is_admin:
        passage_data["status"] = "completed"
    else:
        passage_data["status"] = passage_status_for_user(int(passage_id), current_user.id)

        # If the user hasn't reached this passage yet, return 403
        if passage_data["status"] == "locked":
            return errors.passage_not_reached()

    # Chinese punctuation -> (regular punctuation, whether it ends a sentence)
    punctuation = {
        "。": (".", True),
        "，": (",", False),
        "！": ("!", True),
        "？": ("?", True),
    }

    for component in passage_data["data"]["components"]:
        if component["type"] != "text":
            continue

        # Separate the Chinese sentence into word objects
        entries = [
            {"chinese": token, "punctuation": False}
            for token in segment(component["text"])
        ]

        # Flatten the nested pinyin result of each word into a single string
        romanized = [
            "".join(piece for group in pinyin(entry["chinese"]) for piece in group)
            for entry in entries
        ]

        # True while the next word starts a sentence and must be capitalized
        sentence_start = True

        for entry, text in zip(entries, romanized):
            if sentence_start:
                text = text.capitalize()
                sentence_start = False

            # Replace Chinese punctuation with regular punctuation; sentence
            # enders make the following word capitalized
            if text in punctuation:
                text, sentence_start = punctuation[text]
                entry["punctuation"] = True

            entry["pinyin"] = text

        # Add the words to the passage data
        component["words"] = entries

    return jsonify(passage_data)