# Example #1
# 0
    def parse(self):
        """Parse every file in ``self.data_dir`` into Word/Mention entities.

        The first non-empty line of each file is taken as the work's title
        (normalized to Title Case).  Every other non-empty line is split into
        unique lowercase words; each word gets a Mention recorded on its Word
        entity (created under ``self.parent`` on first sight).
        """
        for file_name in os.listdir(self.data_dir):
            # `with` guarantees the handle is closed even on error — the
            # original opened the file and never closed it.  os.path.join
            # works whether or not data_dir has a trailing separator.
            with open(os.path.join(self.data_dir, file_name), "r") as data_file:
                # Skip leading blank lines; the first non-blank one is the title.
                title = ""
                while not title:
                    title = data_file.readline().strip()
                title = " ".join(
                    word[0].upper() + word[1:].lower() for word in title.split())

                data_file.seek(0)
                for line in data_file:
                    line = line.strip()
                    if not line:
                        continue
                    # Non-word characters become separators; lowercase the
                    # whole line instead of map(string.lower, ...) — the
                    # `string.lower` function was removed in Python 3.
                    words = set(re.sub(r"[^\w]", " ", line).lower().split())
                    for word in words:
                        word_data = Word.get_by_id(word, parent=self.parent)

                        if not word_data:
                            word_data = Word(parent=self.parent, id=word, name=word)
                        new_mention = Mention(line=line, work=title)
                        if word_data.mentions:
                            word_data.mentions.append(new_mention)
                        else:
                            word_data.mentions = [new_mention]
                        word_data.put()
def index_reduce(key, values):
    """Index reduce function.

    Args:
        key: a string in the format <word>_SEP<work>_SEP<character>
        values: the lines in which <word> appears in <work> in a speak of
            <character>

    The word is either added to the database or updated with its new
    occurrence, adding info about the work in which it was found, which
    character pronounced it (if applicable), a count of occurrences and a
    reference to the line in which it was found.
    """
    word_value, work_value, char_value = key.split(_SEP)
    occurrences = len(values)

    # Word: create on first sight, otherwise bump the running count.
    word = Word.get_by_id(word_value)
    if not word:
        word = Word(id=word_value, name=word_value, count=occurrences)
    else:
        word.count += occurrences

    # Work: same create-or-update pattern, keyed under the word.
    # (For a brand-new word, get_by_id simply returns None.)
    work_titlecase = titlecase(work_value)
    work = Work.get_by_id(work_titlecase, parent=word.key)
    if work:
        work.count += occurrences
    else:
        work = Work(parent=word.key, id=work_titlecase,
                    title=work_titlecase, count=occurrences)

    # Character: previously recreated unconditionally, which overwrote an
    # existing entity's accumulated count and mentions on put(); use the
    # same create-or-update pattern as Word and Work.
    character_titlecase = titlecase(char_value)
    char = Character.get_by_id(character_titlecase, parent=work.key)
    if char:
        char.count += occurrences
    else:
        char = Character(parent=work.key, id=character_titlecase,
                         name=character_titlecase, count=occurrences)

    # NOTE(security): pickle.loads executes arbitrary code if the payload is
    # untrusted — only safe because values come from our own map phase.
    for line in set(values):
        char.mentions.append(pickle.loads(line))

    word.put()
    work.put()
    char.put()
# Example #3
# 0
def add_word(request):
    """Persist the word submitted in *request* and return its text.

    A fresh numeric id is allocated for the entity before it is stored.
    """
    allocated_id = Word.allocate_ids(1)[0]
    entity = Word(word=request.new_word, word_id=allocated_id)
    entity.put()
    return entity.word
class DatastoreTest(unittest.TestCase):
    """Exercises the Word -> Work -> Character ancestor hierarchy against
    the App Engine datastore stub."""

    def setUp(self):
        '''Creates an instance of Testbed class and initializes it with the
        datastore stub.

        Also creates the entities and stores them in the database.'''
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_datastore_v3_stub()

        # Build the word -> work -> character ancestor chain.
        self.word = Word(id="death", name="death", count=2)
        self.work = Work(
            parent=self.word.key, id="Hamlet", title="Hamlet", count=1)
        self.character = Character(
            parent=self.work.key, id="Claudius", name="Claudius", count=1)
        line = Line(line='Though yet of Hamlet our dear brother\'s death').put()

        self.character.mentions = [line]

        self.word_key = self.word.put()
        self.work_key = self.work.put()
        self.character_key = self.character.put()

    def tearDown(self):
        '''Deactivate the testbed.

        This restores the original stubs so that tests do not interfere with
        each other.'''
        self.word_key.delete()
        self.work_key.delete()
        self.character_key.delete()

        self.testbed.deactivate()

    def test_insert_entities(self):
        '''Ensures that the entities are saved in the database.

        If we can retrieve them, they are correctly stored.'''
        retrieved_word = self.word_key.get()
        self.assertEqual(2, retrieved_word.count)
        # The second assertion duplicated the count check; verify the name
        # as presumably intended.
        self.assertEqual('death', retrieved_word.name)

        retrieved_work = self.work_key.get()
        self.assertEqual('Hamlet', retrieved_work.title)

        retrieved_character = self.character_key.get()
        self.assertEqual('Claudius', retrieved_character.name)
        self.assertEqual(1, len(retrieved_character.mentions))
        self.assertEqual('Though yet of Hamlet our dear brother\'s death',
            retrieved_character.mentions[0].get().line)

    def test_searching_a_non_existing_word(self):
        '''Ensure nothing fails if we search a word that doesn't exist.'''
        retrieved_word = Word.get_by_id("sdfgfdgdgf")
        self.assertEqual(retrieved_word, None)

    def test_filter_entities_using_query_works(self):
        '''We can search for all the entities starting from a word.'''
        retrieved_word = Word.get_by_id("death")
        self.assertEqual('death', retrieved_word.name)
        self.assertEqual(2, retrieved_word.count)

        retrieved_works = Work.query(ancestor=self.word.key).fetch()
        self.assertEqual(len(retrieved_works), 1)
        work = retrieved_works[0]

        retrieved_character = Character.query(ancestor=work.key).fetch()
        self.assertEqual(len(retrieved_character), 1)
        char = retrieved_character[0]
        self.assertEqual(1, len(char.mentions))
        self.assertEqual("Though yet of Hamlet our dear brother's death",
            char.mentions[0].get().line)