def test_preprocessing(self):
    """Test end-to-end preprocessing."""
    for test_case in getattr(TestCasesGenerator, 'preprocessing'):
        tweet = TwitterPreprocessor(test_case['case'])
        tweet.preprocessing()
        result = tweet.text
        expected_result = test_case['expected']
        self.assertEqual(result, expected_result)

def test_replace_token_with_index(self):
    """Test token-to-index replacement."""
    for test_case in getattr(TestCasesGenerator, 'index'):
        tweet = TwitterPreprocessor(test_case['case'])
        tweet.replace_token_with_index()
        result = tweet.text
        expected_result = test_case['expected']
        self.assertEqual(result, expected_result)

def test_pad_sequence(self):
    """Test sequence padding."""
    for test_case in getattr(TestCasesGenerator, 'padding'):
        tweet = TwitterPreprocessor(test_case['case'])
        tweet.pad_sequence()
        result = tweet.text
        expected_result = test_case['expected']
        self.assertEqual(result, expected_result)

def test_tokenize_text(self):
    """Test tokenization."""
    for test_case in getattr(TestCasesGenerator, 'tokenize'):
        tweet = TwitterPreprocessor(test_case['case'])
        tweet.tokenize_text()
        result = tweet.text
        expected_result = test_case['expected']
        self.assertEqual(result, expected_result)
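
The four tests above assume a TestCasesGenerator fixture whose class attributes ('preprocessing', 'index', 'padding', 'tokenize') are lists of dicts mapping a 'case' input to its 'expected' output. A hedged sketch of that shape, with purely illustrative placeholder values:

class TestCasesGenerator:
    """Illustrative fixture shape only; the real cases live in the project's test data."""
    preprocessing = [
        {'case': '<raw tweet text>', 'expected': '<fully preprocessed text>'},
    ]
    tokenize = [
        {'case': '<raw tweet text>', 'expected': '<tokenized text>'},
    ]
    index = [
        {'case': '<tokenized text>', 'expected': '<token indices>'},
    ]
    padding = [
        {'case': '<token indices>', 'expected': '<indices padded to a fixed length>'},
    ]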
def TextPreprocessing(data, text_processor, sentence_segmentation=False, pre_lang_check=True, mode=0):
    """Clean a list of raw tweets and return the sentences that survive filtering."""
    # Repair broken unicode/mojibake first.
    sents = [fix_text(sent) for sent in data]
    # Optionally drop non-English sentences before the heavier processing.
    if pre_lang_check:
        sents = [sent for sent in sents if is_english(sent)]
    if sentence_segmentation:
        sents = get_sentences(sents)
    # Expand slang terms and strip emoticons.
    sents = [remove_emoticon(" ".join(remove_slang(sent.split()))) for sent in sents]
    # Run the ekphrasis pipeline, then collapse redundant whitespace.
    sents = [TwitterPreprocessor(" ".join(tokens)).remove_blank_spaces().text
             for tokens in text_processor.pre_process_docs(sents)]
    # Drop annotation tags added by the text processor.
    sents = [sent.replace("<censored>", "").replace("<emphasis>", "")
                 .replace("<elongated>", "").replace("<repeated>", "")
             for sent in sents]
    # Use a stricter length threshold in mode 1 without mutating the module-level constant.
    min_tokens = MIN_TOKENS * 2 if mode else MIN_TOKENS
    sents = [sent.strip() for sent in sents if len(sent.split()) > min_tokens and check_en_lang(sent)]
    return sents
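
For context, TextPreprocessing takes the raw tweets plus an ekphrasis-style text_processor exposing pre_process_docs. A minimal usage sketch, assuming MIN_TOKENS and the helper functions it calls are defined in this module and that text_processor is configured as in Example #8 below; the sample tweets are illustrative:

raw_tweets = [
    "OMG!!! cant believe this happened lol https://t.co/xyz",
    "too short",
]
clean_sents = TextPreprocessing(
    raw_tweets,
    text_processor,
    sentence_segmentation=False,
    pre_lang_check=True,
    mode=0,  # mode=1 doubles the minimum-token threshold
)
print(clean_sents)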
Example #6
import glob

import pandas as pd

extension = 'csv'
all_filenames = glob.glob('*.{}'.format(extension))

# combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])

# export to csv
combined_csv.to_csv("combined_csv.csv", index=False, encoding='utf-8-sig')
print("finished merging csv files")

df = pd.read_csv("combined_csv.csv")

print("total number of tweets:", len(df))
txt = []

for tweet in df.text:
    try:
        p = TwitterPreprocessor(tweet)
        p.fully_preprocess()
        txt.append(p.text)
    except Exception:
        # skip rows that are empty or otherwise cannot be preprocessed
        print("skipping blank or invalid tweet")

with open('training_data.txt', 'w') as f:
    for item in txt:
        f.write("%s\n" % item)

print("finished preprocessing text data (training_data.txt)")
def _generate_tests(self, module: str, method: str):
    for test_case in getattr(TestCasesGenerator, module):
        text = getattr(TwitterPreprocessor(text=test_case['case']), method)().text
        self.assertEqual(test_case['expected'], text)
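
With this helper, the per-feature tests shown earlier reduce to one-line methods. A sketch of how it might be wired up, reusing the attribute and method names from the tests above:

def test_tokenize_text(self):
    self._generate_tests('tokenize', 'tokenize_text')

def test_replace_token_with_index(self):
    self._generate_tests('index', 'replace_token_with_index')

def test_pad_sequence(self):
    self._generate_tests('padding', 'pad_sequence')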
Example #8
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer

# NOTE: only part of the configuration is shown; additional options
# (e.g. normalize/annotate lists, segmenter) may precede these arguments.
text_processor = TextPreProcessor(
    corrector="twitter",

    unpack_hashtags=True,         # perform word segmentation on hashtags
    unpack_contractions=True,     # unpack contractions (can't -> can not)
    spell_correct_elong=False,    # spell correction for elongated words
    spell_correction=False,

    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
)
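
Once configured, the processor can be sanity-checked on a single string with pre_process_doc (the batch variant pre_process_docs is used further below); the sample tweet is illustrative:

tokens = text_processor.pre_process_doc("CANT WAIT for the new #GameOfThrones season!!! :D")
print(tokens)  # a list of lower-cased tokens, with hashtags segmented into words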
def remove_tags(doc):
	"""Remove annotation tags (tokens starting with '<') from a sentence."""
	return ' '.join(word for word in doc.split() if word[0] != '<')

def remove_slang(sent):
	"""Replace known slang tokens with their expansions from slangdict."""
	return [slangdict.get(w.strip(), w) for w in sent]

# Expand slang and strip emoticons, then run the sentences through ekphrasis.
sents = [remove_emoticon(" ".join(remove_slang(sent.split()))) for sent in sentences]
tokenized_sentences = list(text_processor.pre_process_docs(sents))

# Print each original sentence next to its cleaned version for inspection.
for x in range(len(tokenized_sentences)):
	sent = " ".join(tokenized_sentences[x])
	sent = TwitterPreprocessor(sent).remove_blank_spaces().text
	print(sentences[x])
	print(sent, "\n\n")