def parse(self):
    """Index every word of every file found in ``self.data_dir``.

    The first non-blank line of each file is used as the title of the work
    and is title-cased word by word. Every non-blank line of the file (the
    title line included) is then split into lower-cased words; for each
    distinct word on a line, a ``Mention`` of that line is appended to the
    matching ``Word`` entity, which is created under ``self.parent`` when it
    does not exist yet.

    Files that contain no non-blank line are skipped.
    """
    for file_name in os.listdir(self.data_dir):
        # os.path.join works whether or not data_dir ends with a separator.
        path = os.path.join(self.data_dir, file_name)
        # The context manager closes the file even if a datastore call raises.
        with open(path, "r") as data_file:
            title = ""
            while not title:
                raw_title = data_file.readline()
                if not raw_title:
                    # EOF: readline() keeps returning "" from here on, so
                    # looping any further would never terminate.
                    break
                title = raw_title.strip()
            if not title:
                # Nothing but blank lines: there is nothing to index.
                continue
            title = " ".join(
                word[0].upper() + word[1:].lower() for word in title.split())

            # Rewind so the title line itself is indexed like any other line.
            data_file.seek(0)
            for line in data_file:
                line = line.strip()
                if not line:
                    continue
                # Replace every non-word character with a space, then keep
                # each distinct lower-cased word of the line once.
                words = {
                    word.lower()
                    for word in re.sub(r"[^\w]", " ", line).split()
                }
                for word in words:
                    word_data = Word.get_by_id(word, parent=self.parent)
                    if not word_data:
                        word_data = Word(
                            parent=self.parent, id=word, name=word)
                    new_mention = Mention(line=line, work=title)
                    if word_data.mentions:
                        word_data.mentions.append(new_mention)
                    else:
                        word_data.mentions = [new_mention]
                    word_data.put()
def index_reduce(key, values):
    """Reduce step of the index: persist one (word, work, character) group.

    Args:
        key: a string made of three parts joined by _SEP, in the order
            <word>, <work>, <character>.
        values: pickled mention objects, one per occurrence of <word> in
            <work> inside a speech of <character>.

    The Word entity is created, or its count increased, by the number of
    values. The same is done for the Work entity stored under that word.
    A Character entity is then built under the work, holding the number of
    values as its count and the unpickled distinct values as its mentions.
    All three entities are written to the datastore.
    """
    word_value, work_value, char_value = key.split(_SEP)
    occurrences = len(values)

    word = Word.get_by_id(word_value)
    work_title = titlecase(work_value)

    # A work can only already exist when its parent word does.
    work = None
    if word:
        word.count += occurrences
        work = Work.get_by_id(work_title, parent=word.key)
    else:
        word = Word(id=word_value, name=word_value, count=occurrences)

    if work:
        work.count += occurrences
    else:
        work = Work(parent=word.key, id=work_title, title=work_title,
                    count=occurrences)

    char_name = titlecase(char_value)
    char = Character(parent=work.key, id=char_name, name=char_name,
                     count=occurrences)
    # Identical pickled values collapse to a single mention.
    char.mentions.extend(pickle.loads(raw) for raw in set(values))

    word.put()
    work.put()
    char.put()
def add_word(request):
    """Store ``request.new_word`` as a new Word entity and return its text.

    The entity's ``word_id`` is the first element of the result of
    ``Word.allocate_ids(1)``.
    """
    text = request.new_word
    new_id = Word.allocate_ids(1)[0]
    entity = Word(word=text, word_id=new_id)
    entity.put()
    return entity.word
class DatastoreTest(unittest.TestCase):
    '''Datastore tests for the Word -> Work -> Character entity hierarchy.'''

    # Text of the single line stored as a mention by setUp.
    _LINE_TEXT = "Though yet of Hamlet our dear brother's death"

    def setUp(self):
        '''Creates an instance of Testbed class and initializes it with the
        datastore stub. Also creates the entities and stores them in the
        database.'''
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        # Registered as a cleanup so the original stubs are restored even
        # when the rest of setUp, the test, or tearDown raises.
        self.addCleanup(self.testbed.deactivate)
        self.testbed.init_datastore_v3_stub()

        self.word = Word(id="death", name="death", count=2)
        self.work = Work(
            parent=self.word.key, id="Hamlet", title="Hamlet", count=1)
        self.character = Character(
            parent=self.work.key, id="Claudius", name="Claudius", count=1)
        # The character's mentions hold keys of Line entities.
        self.line_key = Line(line=self._LINE_TEXT).put()
        self.character.mentions = [self.line_key]

        self.word_key = self.word.put()
        self.work_key = self.work.put()
        self.character_key = self.character.put()

    def tearDown(self):
        '''Deletes every entity created by setUp.

        The testbed itself is deactivated afterwards by the cleanup
        registered in setUp, so that tests do not interfere with each
        other.'''
        self.word_key.delete()
        self.work_key.delete()
        self.character_key.delete()
        self.line_key.delete()

    def test_insert_entities(self):
        '''Ensures that the entities are saved in the database.

        If we can retrieve them, they are correctly stored.'''
        retrieved_word = self.word_key.get()
        self.assertEqual('death', retrieved_word.name)
        self.assertEqual(2, retrieved_word.count)

        retrieved_work = self.work_key.get()
        self.assertEqual('Hamlet', retrieved_work.title)

        retrieved_character = self.character_key.get()
        self.assertEqual('Claudius', retrieved_character.name)
        self.assertEqual(1, len(retrieved_character.mentions))
        self.assertEqual(self._LINE_TEXT,
                         retrieved_character.mentions[0].get().line)

    def test_searching_a_non_existing_word(self):
        '''Ensure nothing fails if we search a word that doesn't exist.'''
        retrieved_word = Word.get_by_id("sdfgfdgdgf")
        self.assertIsNone(retrieved_word)

    def test_filter_entities_using_query_works(self):
        '''We can search for all the entities starting from a word.'''
        retrieved_word = Word.get_by_id("death")
        self.assertEqual('death', retrieved_word.name)
        self.assertEqual(2, retrieved_word.count)

        retrieved_works = Work.query(ancestor=self.word.key).fetch()
        self.assertEqual(len(retrieved_works), 1)
        work = retrieved_works[0]

        retrieved_character = Character.query(ancestor=work.key).fetch()
        self.assertEqual(len(retrieved_character), 1)
        char = retrieved_character[0]
        self.assertEqual(1, len(char.mentions))
        self.assertEqual(self._LINE_TEXT, char.mentions[0].get().line)