def test_get_id_author(self):
    """Test that get_id_author() returns a dict (TLG id -> author name)."""
    # assertIsInstance replaces the type(x)==dict anti-pattern and gives a
    # clearer failure message; it also accepts dict subclasses.
    self.assertIsInstance(get_id_author(), dict)
def test_get_id_author(self):
    """Test that get_id_author() returns a dict (TLG id -> author name)."""
    # NOTE(review): this is a byte-identical duplicate of an earlier
    # test_get_id_author definition; the later one silently shadows the
    # earlier at class-creation time. One of the two should be removed
    # or renamed -- confirm which with the file owner.
    self.assertIsInstance(get_id_author(), dict)
def stream_lemmatized_files(corpus_dir):
    """Yield (doc_id, text) pairs for every file in a user_data corpus dir.

    The doc id is derived from the filename by dropping the first three
    characters and the four-character extension (``file[3:-4]``).

    :param corpus_dir: directory name under ``~/cltk_data/user_data/``.
    :yields: tuple of (doc id, full file contents).
    """
    user_dir = os.path.expanduser('~/cltk_data/user_data/' + corpus_dir)
    files = os.listdir(user_dir)
    for file in files:
        filepath = os.path.join(user_dir, file)
        with open(filepath) as fo:
            # TODO: rm words less than 3 chars long
            yield file[3:-4], fo.read()


t0 = dt.datetime.utcnow()
map_id_author = get_id_author()
# Collect rows first and build the DataFrame once: calling DataFrame.append
# per row copies the whole frame each time (O(n^2)), and .append was removed
# in pandas 2.0.
rows = []
for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'):
    rows.append({'id': _id,
                 'author': map_id_author[_id],
                 'text': text,
                 'epithet': get_epithet_of_author(_id)})
# BUG FIX: the original columns list was ['id', 'author' 'text', 'epithet'] --
# the missing comma made Python concatenate the adjacent string literals into
# a single bogus 'authortext' column.
df = pandas.DataFrame(rows, columns=['id', 'author', 'text', 'epithet'])
print(df.shape)
print('... finished in {}'.format(dt.datetime.utcnow() - t0))
print('Number of texts:', len(df))
text_list = df['text'].tolist()
# NOTE(review): this span is a mangled duplicate of the generator and the
# loading script defined just above (a notebook export artifact -- the
# original text even contained a dangling function body with `yield` at top
# level, which is a SyntaxError). Reconstructed here as valid Python with
# the `def` header restored; consider deleting one of the two copies.
def stream_lemmatized_files(corpus_dir):
    """Yield (doc_id, text) pairs for every file in a user_data corpus dir."""
    user_dir = os.path.expanduser('~/cltk_data/user_data/' + corpus_dir)
    files = os.listdir(user_dir)
    for file in files:
        filepath = os.path.join(user_dir, file)
        with open(filepath) as fo:
            # TODO: rm words less than 3 chars long
            yield file[3:-4], fo.read()


# In[3]:
t0 = dt.datetime.utcnow()
map_id_author = get_id_author()
# Build rows first, then construct the frame once: per-row DataFrame.append
# is O(n^2) and was removed in pandas 2.0.
rows = []
for _id, text in stream_lemmatized_files('tlg_lemmatized_no_accents_no_stops'):
    rows.append({'id': _id,
                 'author': map_id_author[_id],
                 'text': text,
                 'epithet': get_epithet_of_author(_id)})
# BUG FIX: original columns list ['id', 'author' 'text', 'epithet'] was
# missing a comma, concatenating the literals into one 'authortext' column.
df = pandas.DataFrame(rows, columns=['id', 'author', 'text', 'epithet'])