Example #1
0
# Names of every entry in ./processed. os.listdir already returns a fresh
# list (in arbitrary order), so wrapping it in an identity comprehension
# only made a redundant copy.
files = os.listdir("./processed")

for x in range(0,len(files)):
	if x > 1:
		print("completed: ", round((x * 100) / len(files),1), "%           ", end='\r')
	file_name =  os.path.join("./processed", files[x])
	try:
		df = pd.read_csv(filepath_or_buffer = file_name, index_col=0, dtype = str, na_filter=False)
		# df = df.drop(["id_df"], axis=1)

		n_starting_triplets += len(df)
		cleaner = Cleaner(df, t5_tokenizer, stopwords, english_cache)

		cleaner.remove_non_marked()
		cleaner.clean_df()

		# final cleaning : remove methods which has more than one review
		cleaner.remove_multiple_method_comments()

		n_irrelevant_comments += cleaner.irrelevant_comments
		n_not_marked += cleaner.not_marked
		n_non_latin += cleaner.non_latin
		n_before_equals_after += cleaner.before_equals_after
		n_non_english += cleaner.non_english
		n_too_long += cleaner.too_long
		n_too_long_after += cleaner.too_long_after
		n_multiple_rev += cleaner.multiple_reviews

		n_comment_empty += cleaner.comment_empty
		n_code_before_empty += cleaner.code_before_empty