Beispiel #1
0
    def parse(self):
        """Index the words of every file found in ``self.data_dir``.

        For each file, the first non-blank line is taken as the title of the
        work and converted to title case word by word. Then every non-blank
        line of the file (the title line included) is split into distinct
        lowercase words, and a ``Mention`` holding the stripped line and the
        work title is appended to the ``Word`` entity of each of those words.
        ``Word`` entities that do not exist yet are created under
        ``self.parent``; every touched entity is stored with ``put()``.

        Files that are empty or contain only blank lines produce no mentions.
        """
        for file_name in os.listdir(self.data_dir):
            file_path = os.path.join(self.data_dir, file_name)
            # The context manager closes the handle even when a datastore call
            # raises part-way through the file.
            with open(file_path, "r") as data_file:
                # readline() returns "" only at EOF, so using it as the
                # sentinel stops the title search instead of spinning forever
                # on an empty or all-blank file.
                title = ""
                for raw_line in iter(data_file.readline, ""):
                    title = " ".join(word[0].upper() + word[1:].lower()
                                     for word in raw_line.split())
                    if title:
                        break

                # Rewind so the title line is indexed like any other line.
                data_file.seek(0)
                for line in data_file:
                    line = line.strip()
                    if not line:
                        continue

                    # Non-word characters become separators; the set keeps a
                    # word from being mentioned twice for the same line.
                    words = {word.lower()
                             for word in re.sub(r"[^\w]", " ", line).split()}
                    for word in words:
                        word_data = Word.get_by_id(word, parent=self.parent)
                        if not word_data:
                            word_data = Word(parent=self.parent, id=word,
                                             name=word)

                        new_mention = Mention(line=line, work=title)
                        if word_data.mentions:
                            word_data.mentions.append(new_mention)
                        else:
                            word_data.mentions = [new_mention]
                        word_data.put()
def _get_word_mentions_by_char(word_name, work_title, char_name):
    """Get the mentions of a word spoken by one character of one work.

    Args:
        word_name: the string of the word being searched (lowercase).
        work_title: the title of the work in which the character appears
            (titlecase).
        char_name: the name of the character (titlecase).

    Returns:
        A tuple (mentions, count). mentions is a dictionary indexed first by
        the work title and then by the character name, holding the result of
        _bold_mentions for that character; the redundant nesting is kept in
        order to comply with the data pattern. count is the character's
        count attribute. When the word, the work or the character is not
        found, the tuple is ({}, 0).
    """

    # Walk down the Word -> Work -> Character ancestry, stopping the lookups
    # at the first level that is missing.
    word_entity = Word.get_by_id(word_name)
    work_entity = None
    if word_entity:
        work_entity = Work.get_by_id(work_title, parent=word_entity.key)
    char_entity = None
    if work_entity:
        char_entity = Character.get_by_id(char_name, parent=work_entity.key)
    if not char_entity:
        return {}, 0

    highlighted = _bold_mentions(word_name, char_entity.get_string_mentions())
    return {work_title: {char_name: highlighted}}, char_entity.count
def get_suggestion(word):
    '''Gets the most used suggestion for a misspelled word.

    The candidates are the words produced by _words_edit_distance_one for the
    input, filtered by _select_valid_words.

    When there is more than one candidate, the one with the highest count is
    chosen; on a tie the first one found is kept. Candidates whose count is
    not greater than zero are never suggested.

    Args:
        word: misspelled word.

    Returns:
        The name of the suggested word, or None if the input word itself
        exists in the database or no suitable candidate is found.
    '''
    if Word.get_by_id(word):
        return None

    winner = None
    highest = 0
    for entry in _select_valid_words(_words_edit_distance_one(word)):
        if not entry.count > highest:
            continue
        highest = entry.count
        winner = entry
    return winner.name if winner else None
def _get_word_mentions_in_work(word_name, work_title):
    """Get the mentions of a word within a single work, for every character.

    Args:
        word_name: the string of the word being searched (lowercase).
        work_title: the title of the work (titlecase).

    Returns:
        A tuple (mentions, count). mentions is a dictionary indexed first by
        the work title and second by character name, holding the result of
        _bold_mentions for each character; the work level is inserted to
        comply with the data pattern. count is the work's count attribute.
        When the word or the work is not found, the tuple is ({}, 0).
    """

    word_entity = Word.get_by_id(word_name)
    work_entity = None
    if word_entity:
        work_entity = Work.get_by_id(work_title, parent=word_entity.key)
    if not work_entity:
        return {}, 0

    per_character = {}
    for character in Character.query(ancestor=work_entity.key).fetch():
        per_character[character.name] = _bold_mentions(
            word_name, character.get_string_mentions())
    return {work_title: per_character}, work_entity.count
    def test_filter_entities_using_query_works(self):
        '''Starting from a word we can reach its works, characters and lines.'''
        # The word itself is fetched by id.
        found_word = Word.get_by_id("death")
        self.assertEqual('death', found_word.name)
        self.assertEqual(2, found_word.count)

        # Exactly one work hangs from the fixture word.
        works_under_word = Work.query(ancestor=self.word.key).fetch()
        self.assertEqual(len(works_under_word), 1)
        only_work = works_under_word[0]

        # Exactly one character hangs from that work, with a single mention.
        chars_under_work = Character.query(ancestor=only_work.key).fetch()
        self.assertEqual(len(chars_under_work), 1)
        only_char = chars_under_work[0]
        self.assertEqual(1, len(only_char.mentions))

        mentioned_line = only_char.mentions[0].get().line
        self.assertEqual("Though yet of Hamlet our dear brother's death",
            mentioned_line)
def _get_word_works(word_name):
    """Lists the titles of the works stored under a word.

    Args:
        word_name: the word (lowercase).

    Returns:
        A list with the titles of the works, or an empty list when the word
        is not found.
    """

    word_entity = Word.get_by_id(word_name)
    if not word_entity:
        return []

    titles = []
    for work_entity in Work.query(ancestor=word_entity.key).fetch():
        titles.append(work_entity.title)
    return titles
def _get_work_characters(word_name, work_title):
    """Lists the names of the characters stored under a word and a work.

    Args:
        word_name: the string of the word which the characters mention
            (lowercase).
        work_title: the title of the work of interest (titlecase).

    Returns:
        A list with the names of the characters, or an empty list when the
        word or the work is not found.
    """

    word_entity = Word.get_by_id(word_name)
    work_entity = None
    if word_entity:
        work_entity = Work.get_by_id(work_title, parent=word_entity.key)
    if not work_entity:
        return []

    names = []
    for char_entity in Character.query(ancestor=work_entity.key).fetch():
        names.append(char_entity.name)
    return names
    def get(self):
        """Writes the data for the treemap visualization of a searched word.

           The word is read from the 'searched_word' request parameter,
           lowercased and escaped. Nothing is written when the parameter is
           empty or when _get_all_word_mentions reports a zero count.

           The response is a JSON object whose "array" entry is a list of
           rows. The first row is the header, the second is the root node
           with the total count, and then come rows of the following type:

           [name, parent's name, value]

           For a work, name is its title, the parent is the root node and
           value is the work's count. For a character, name is a dictionary
           with 'v' set to "<work>+<character>" and 'f' set to the character
           name, the parent is the work title and value is the character's
           count. Characters with an empty name are skipped.

           It is called the function _get_all_word_mentions to obtain a
           dictionary that maps from work and character to mentions.
        """
        searched_value = cgi.escape(self.request.get('searched_word').lower())
        if not searched_value:
            return

        all_mentions, count = _get_all_word_mentions(searched_value)
        if not count:
            return

        root_label = 'Shakespeare\'s Corpus'
        rows = [['Location', 'Parent', 'Word Occurrences']]
        rows.append([root_label, None, count])

        word_db = Word.get_by_id(searched_value)
        for work_title, char_names in all_mentions.items():
            work_db = Work.get_by_id(work_title, parent=word_db.key)
            rows.append([work_title, root_label, work_db.count])
            for char_name in char_names:
                if char_name:
                    char_db = Character.get_by_id(char_name,
                        parent=work_db.key)
                    node = {'v': work_title + '+' + char_name, 'f': char_name}
                    rows.append([node, work_title, char_db.count])

        self.response.headers['Content-Type'] = 'text/json'
        self.response.out.write(json.encode({"array": rows}))
def index_reduce(key, values):
    """Index reduce function.
    Args:
        key: a string in the format <word>_SEP<work>_SEP<character>
        values: the pickled lines in which <word> appears in <work> in a
            speech of <character>

    The word is either added to the database or has its count increased by
    the number of values. The same is done for the work stored under the
    word. A character entity is then built under the work with that same
    count, and each distinct value is unpickled and appended to the
    character's mentions. Finally the word, the work and the character are
    stored.
    """
    word_value, work_value, char_value = key.split(_SEP)
    occurrences = len(values)

    word = Word.get_by_id(word_value)
    work_title = titlecase(work_value)
    work = None
    if word:
        # NOTE(review): the counts are read, incremented and written back
        # without a visible transaction - confirm reducers for the same word
        # cannot run concurrently.
        word.count += occurrences
        work = Work.get_by_id(work_title, parent=word.key)
        if work:
            work.count += occurrences
    else:
        word = Word(id=word_value, name=word_value, count=occurrences)
    if not work:
        work = Work(parent=word.key, id=work_title, title=work_title,
            count=occurrences)

    char_name = titlecase(char_value)
    char = Character(parent=work.key, id=char_name, name=char_name,
        count=occurrences)
    # Duplicated values are collapsed so each one yields a single mention.
    # NOTE(review): pickle.loads must only ever see data produced by this
    # pipeline's own map step - never feed it untrusted input.
    for pickled_line in set(values):
        char.mentions.append(pickle.loads(pickled_line))

    for entity in (word, work, char):
        entity.put()
def _get_all_word_mentions(word_name):
    """Get all the mentions of a word, accessed first by work and then by
       character.

    Args:
        word_name: the string representation of the word.

    Returns:
        A tuple (mentions, count). mentions is a dictionary of dictionaries,
        being the first key the work title and the second, the character
        name; each value is the result of _bold_mentions for that character.
        count is the word's count attribute. When the word is not found, the
        tuple is ({}, 0).
    """
    word_entity = Word.get_by_id(word_name)
    if not word_entity:
        return {}, 0

    by_work = {}
    for work_entity in Work.query(ancestor=word_entity.key):
        characters = Character.query(ancestor=work_entity.key)
        by_character = by_work[work_entity.title] = {}
        for char_entity in characters:
            by_character[char_entity.name] = _bold_mentions(
                word_entity.name, char_entity.get_string_mentions())
    return by_work, word_entity.count
 def test_searching_a_non_existing_word(self):
     '''Looking up a word that was never stored yields None, not an error.'''
     self.assertEqual(Word.get_by_id("sdfgfdgdgf"), None)