def test_preprocessing(self):
    """Test end-to-end preprocessing."""
    for test_case in getattr(TestCasesGenerator, 'preprocessing'):
        tweet = TwitterPreprocessor(test_case['case'])
        tweet.preprocessing()
        result = tweet.text
        expected_result = test_case['expected']
        self.assertEqual(result, expected_result)

def test_replace_token_with_index(self):
    """Test token-to-index replacement."""
    for test_case in getattr(TestCasesGenerator, 'index'):
        tweet = TwitterPreprocessor(test_case['case'])
        tweet.replace_token_with_index()
        result = tweet.text
        expected_result = test_case['expected']
        self.assertEqual(result, expected_result)

def test_pad_sequence(self):
    """Test sequence padding."""
    for test_case in getattr(TestCasesGenerator, 'padding'):
        tweet = TwitterPreprocessor(test_case['case'])
        tweet.pad_sequence()
        result = tweet.text
        expected_result = test_case['expected']
        self.assertEqual(result, expected_result)

def test_tokenize_text(self):
    """Test tokenization."""
    for test_case in getattr(TestCasesGenerator, 'tokenize'):
        tweet = TwitterPreprocessor(test_case['case'])
        tweet.tokenize_text()
        result = tweet.text
        expected_result = test_case['expected']
        self.assertEqual(result, expected_result)
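# For reference, a minimal sketch of the fixture shape these tests assume:
# TestCasesGenerator exposes one class attribute per stage ('preprocessing',
# 'index', 'padding', 'tokenize'), each a list of case/expected pairs. The
# sample values here are illustrative, not the project's real fixtures.
class TestCasesGeneratorSketch:
    tokenize = [
        {'case': 'Hello there!!', 'expected': ['hello', 'there', '!', '!']},
    ]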
def TextPreprocessing(data, text_processor, sentence_segmentation=False,
                      pre_lang_check=True, mode=0):
    # Normalize unicode quirks first, then optionally drop non-English tweets.
    sents = [fix_text(sent) for sent in data]
    if pre_lang_check:
        sents = [sent for sent in sents if is_english(sent)]
    if sentence_segmentation:
        sents = get_sentences(sents)
    # Expand slang terms and strip emoticons before running the text processor.
    sents = [remove_emoticon(" ".join(remove_slang(sent.split()))) for sent in sents]
    sents = [TwitterPreprocessor(" ".join(sent)).remove_blank_spaces().text
             for sent in text_processor.pre_process_docs(sents)]
    # Drop the annotation tokens ekphrasis inserts.
    for tag in ("<censored>", "<emphasis>", "<elongated>", "<repeated>"):
        sents = [sent.replace(tag, "") for sent in sents]
    # Use a local threshold; mutating the global here would keep doubling it
    # on every call with mode set.
    min_tokens = MIN_TOKENS * 2 if mode else MIN_TOKENS
    return [sent.strip() for sent in sents
            if len(sent.split()) > min_tokens and check_en_lang(sent)]
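# A minimal usage sketch for TextPreprocessing, assuming the module's helpers
# (fix_text, is_english, remove_slang, TwitterPreprocessor, ...) are in scope.
# The ekphrasis configuration and sample tweet below are illustrative only.
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

example_processor = TextPreProcessor(
    segmenter="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
)
raw_tweets = ["I caaan't wait for #MachineLearning tonight!!!"]
print(TextPreprocessing(raw_tweets, example_processor))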
import glob

import pandas as pd

extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

# Combine all files in the list and export the merged frame to csv.
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
combined_csv.to_csv("combined_csv.csv", index=False, encoding='utf-8-sig')
print("finished merging csv files")

df = pd.read_csv("combined_csv.csv")
print("total number of tweets:", len(df))

txt = []
for tweet in df.text:
    # Blank cells come back from pandas as NaN (a float), so skip non-strings
    # explicitly instead of swallowing every error with a bare except.
    if not isinstance(tweet, str):
        print("skip blank")
        continue
    p = TwitterPreprocessor(tweet)
    p.fully_preprocess()
    txt.append(p.text)

with open('training_data.txt', 'w') as f:
    for item in txt:
        f.write("%s\n" % item)
print("finished preprocessing text data (training_data.txt)")
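# A quick sanity check on the artifact written above (illustrative):
with open('training_data.txt') as f:
    processed = [line.strip() for line in f if line.strip()]
print("preprocessed tweets written:", len(processed))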
def _generate_tests(self, module: str, method: str):
    """Run every fixture in TestCasesGenerator.<module> through <method>."""
    for test_case in getattr(TestCasesGenerator, module):
        text = getattr(TwitterPreprocessor(text=test_case['case']), method)().text
        self.assertEqual(test_case['expected'], text)
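# With this helper, the per-stage tests above collapse to one-line delegations
# on the same TestCase class, e.g. (a sketch using the fixture and method
# names from the tests in this file):
def test_pad_sequence(self):
    self._generate_tests('padding', 'pad_sequence')

def test_tokenize_text(self):
    self._generate_tests('tokenize', 'tokenize_text')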
corrector="twitter", unpack_hashtags=True, # perform word segmentation on hashtags unpack_contractions=True, # Unpack contractions (can't -> can not) spell_correct_elong=False, # spell correction for elongated words spell_correction=False, # select a tokenizer. You can use SocialTokenizer, or pass your own # the tokenizer, should take as input a string and return a list of tokens tokenizer=SocialTokenizer(lowercase=True).tokenize, # list of dictionaries, for replacing tokens extracted from the text, # with other expressions. You can pass more than one dictionaries. ) def remove_tags(doc): """ Remove tags from sentence """ doc = ' '.join(word for word in doc.split() if word[0]!='<') return doc def remove_slang(sent): return [slangdict[w.strip()] if w.strip() in slangdict else w for w in sent] sents = [remove_emoticon(" ".join(remove_slang(sent.split()))) for sent in sentences] tokenized_sentences = list(text_processor.pre_process_docs(sents)) for x in range(len(tokenized_sentences)): sent = " ".join(tokenized_sentences[x]) sent = TwitterPreprocessor(sent).remove_blank_spaces().text print(sentences[x]) print(sent,"\n\n")