def test_textframe():
    """Check that ``word_tokens`` splits the sample sentence into tokens."""
    docx = TextFrame()
    # The address has to be a real e-mail literal: the expected token
    # 'examplegmailcom' is that address with '@' and '.' removed.
    docx.text = "This is the mail example@gmail.com ,our WEBSITE is https://example.com ๐."
    result = docx.word_tokens()
    assert result == [
        'This', 'is', 'the', 'mail', 'examplegmailcom', 'our',
        'WEBSITE', 'is', 'httpsexamplecom', '๐',
    ]
def read_txt(filename, encoding=None):
    """
    Read a Text File and Create A TextFrame From it

    Parameters
    ----------
    filename : file with text to read
    encoding : text encoding handed to ``open``; the default ``None``
        leaves the choice to ``open`` (platform default encoding)

    Returns
    ----------
    Returns a TextFrame built from the text read from the file
    """
    # The file is closed by the context manager before the frame is built.
    with open(filename, "r", encoding=encoding) as f:
        text_read = f.read()
    return TextFrame(text_read)
def test_textframe_remove_shortwords():
    """Check that ``remove_shortwords(length=3)`` drops the short words."""
    docx = TextFrame()
    # The address has to be a real e-mail literal: the expected text keeps
    # its parts "example" and "gmail" and loses the three-letter "com".
    docx.text = "This is the mail example@gmail.com ,our WEBSITE is https://example.com ๐."
    result = docx.remove_shortwords(length=3)
    assert result.text == "This mail example gmail WEBSITE https example"
def test_textframe_remove_userhandles():
    """Check that ``remove_userhandles`` strips the ``@jesuslives`` handle."""
    expected = "This is the tag use wisely "
    frame = TextFrame()
    frame.text = "This is the tag @jesuslives use wisely "
    cleaned = frame.remove_userhandles()
    assert cleaned.text == expected
def test_textframe_remove_puncts():
    """Check the text left by ``remove_puncts`` on the sample sentence."""
    docx = TextFrame()
    # The address has to be a real e-mail literal: the expected text below
    # contains "example@gmailcom", i.e. the address minus its dot.
    docx.text = "This is the mail example@gmail.com ,our WEBSITE is https://example.com ๐."
    result = docx.remove_puncts()
    assert result.text == "This is the mail example@gmailcom our WEBSITE is https://examplecom ๐"
def test_textframe_remove_stopwords():
    """Check that ``remove_stopwords(lang='en')`` drops "This", "is", "the"."""
    docx = TextFrame()
    # The e-mail address is written out literally in both the input and the
    # expected text; it is expected to come through unchanged, like the URL.
    docx.text = "This is the mail example@gmail.com ,our WEBSITE is https://example.com ๐."
    result = docx.remove_stopwords(lang='en')
    assert result.text == "mail example@gmail.com ,our WEBSITE https://example.com ๐."
def test_textframe_remove_html():
    """Check that ``remove_html_tags`` strips the ``<h2>`` markup."""
    expected = "This is the example for html tags"
    frame = TextFrame()
    frame.text = "This is the <h2>example for html tags</h2>"
    stripped = frame.remove_html_tags()
    assert stripped.text == expected