def test_get_epithet_of_author(self):
    """Verify get_epithet_of_author() returns 'Historici/-ae' for id '0016'."""
    expected = 'Historici/-ae'
    # Look up the epithet first so a lookup failure surfaces before the
    # comparison is attempted.
    actual = get_epithet_of_author('0016')
    self.assertEqual(actual, expected)
for file in files: filepath = os.path.join(user_dir, file) with open(filepath) as fo: #TODO rm words less the 3 chars long yield file[3:-4], fo.read() t0 = dt.datetime.utcnow() map_id_author = get_id_author() df = pandas.DataFrame(columns=['id', 'author' 'text', 'epithet']) for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'): author = map_id_author[_id] epithet = get_epithet_of_author(_id) df = df.append({'id': _id, 'author': author, 'text': text, 'epithet': epithet}, ignore_index=True) print(df.shape) print('... finished in {}'.format(dt.datetime.utcnow() - t0)) print('Number of texts:', len(df)) text_list = df['text'].tolist() # make a list of short texts to drop # For pres, get distributions of words per doc short_text_drop_index = [index if len(text) > 500 else None for index, text in enumerate(text_list) ] # ~100 words t0 = dt.datetime.utcnow()
with open(filepath) as fo: #TODO rm words less the 3 chars long yield file[3:-4], fo.read() # In[3]: t0 = dt.datetime.utcnow() map_id_author = get_id_author() df = pandas.DataFrame(columns=['id', 'author' 'text', 'epithet']) for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'): author = map_id_author[_id] epithet = get_epithet_of_author(_id) df = df.append( { 'id': _id, 'author': author, 'text': text, 'epithet': epithet }, ignore_index=True) print(df.shape) print('... finished in {}'.format(dt.datetime.utcnow() - t0)) print('Number of texts:', len(df)) # In[4]:
def test_get_epithet_of_author(self):
    """Verify get_epithet_of_author() returns "Historici/-ae" for id "0016"."""
    expected = "Historici/-ae"
    # Look up the epithet first so a lookup failure surfaces before the
    # comparison is attempted.
    actual = get_epithet_of_author("0016")
    self.assertEqual(actual, expected)