def clean_play_store_data(file_path):
    """Read a Play Store CSV and split each app's text into sentences.

    Args:
        file_path: Path to a CSV file whose first column is an app id and
            whose second column is free text. The first row is assumed to
            be a header and is skipped.

    Returns:
        dict mapping app id -> list of sentences, as produced by
        ``NLPUtils.sentence_tokenization`` on that app's text.
    """
    data = {}
    with open(file_path) as stream:
        reader = csv.reader(stream)
        next(reader)  # skip the header row
        for row in reader:
            app_id, text = row[0], row[1]
            # NOTE(review): NLPUtils is assumed to be imported at module
            # level; the tokenizer's exact splitting rules live there.
            data[app_id] = list(NLPUtils.sentence_tokenization(text))
    return data
def _clean_sentence(sentence):
    """Normalize one sentence for the processed dataset.

    Strips hyperlinks, lowercases, tokenizes, removes punctuation from each
    token, drops stopwords and non-alphabetic tokens, and rejoins the
    survivors with single spaces.

    Returns:
        The cleaned sentence string, or None if nothing survives cleaning.
    """
    sentence = NLPUtils.remove_hyperlinks(sentence).lower()
    if not sentence:
        return None
    tokens = NLPUtils.word_tokenization(sentence)
    tokens = [NLPUtils.punctuation_removal(token) for token in tokens]
    tokens = NLPUtils.stopword_elimination(tokens)
    tokens = NLPUtils.nonalpha_removal(tokens)
    if not tokens:
        return None
    cleaned = " ".join(tokens).rstrip()
    return cleaned if cleaned else None


def process_raw_dataset(file_path, out_file):
    """Clean the raw Play Store dataset and write it out as a new CSV.

    Reads rows of (app_id, text, categories, extra) from ``file_path``,
    keeps only rows whose text ``langdetect`` classifies as English, cleans
    every sentence via ``_clean_sentence``, and writes rows of
    (cleaned app id, "%%"-joined sentences, "%%"-joined categories, extra)
    to ``out_file``. Rows that are non-English, fail language detection, or
    yield no sentences after cleaning are skipped. Progress is printed
    every 100 input rows.

    Args:
        file_path: Path of the raw input CSV (first row is a header).
        out_file: Path of the cleaned output CSV; the header is copied
            through unchanged.
    """
    number_of_apps = 0
    with open(file_path) as stream, \
            open(out_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        reader = csv.reader(stream)
        writer.writerow(next(reader))  # copy the header through unchanged
        start_time = time.time()
        for row in reader:
            if number_of_apps % 100 == 0:
                print("Number of apps processed is {}".format(number_of_apps))
                print("Elapsed time up to now is {}".format(
                    time.time() - start_time))
            number_of_apps += 1
            text = row[1]
            try:
                # langdetect raises on text it cannot classify; treat any
                # failure during detection or cleaning as "skip this row",
                # but report it instead of swallowing it silently.
                if langdetect.detect(text) != u'en':
                    continue
                sentences = []
                for sentence in NLPUtils.sentence_tokenization(text):
                    cleaned = _clean_sentence(sentence)
                    if cleaned is not None:
                        sentences.append(cleaned)
                if sentences:
                    writer.writerow([NLPUtils.punctuation_removal(row[0]),
                                     "%%".join(sentences),
                                     "%%".join(row[2].split(",")),
                                     row[3]])
            except Exception as exc:
                print("Skipping row for app {!r}: {}".format(row[0], exc))