def phrase_detection(bi_gram, file_name):
    """Apply a trained bi-gram phrase model to the tokens of one file.

    Reads ``file_name`` via ``StoreHelper``, segments and lemmatizes each
    line, drops empty tokens, and feeds the flat token list through the
    phrase model.

    :param bi_gram: trained phrase model supporting ``model[token_list]``
        (presumably a gensim ``Phrases``/``Phraser`` — TODO confirm).
    :param file_name: path of the text file to process.
    :return: the phrase-merged token sequence produced by ``bi_gram``.
    """
    tokens = []
    # No intermediate copy of splitlines(); extend with only non-empty tokens.
    for line in StoreHelper.read_file(file_name).splitlines():
        tokens.extend(
            token
            for token in SegmentHelper.lemmatization(SegmentHelper.segment_text(line))
            if len(token) > 0
        )
    return bi_gram[tokens]
def generate_sentence_stream(file_count=8535,
                             path_template="../data/clean_post_lemmatize/%04d.dat",
                             output_file='sentence_stream.dat'):
    """Build a sentence stream (list of token lists) from numbered data files.

    Iterates files ``0000.dat`` .. ``(file_count-1)`` rendered from
    ``path_template``, segments and lemmatizes each line of each existing
    file, persists the accumulated stream via ``StoreHelper.store_data``,
    and returns it.

    :param file_count: number of sequential files to scan (default 8535,
        the original hard-coded corpus size).
    :param path_template: ``%``-style template producing each file path.
    :param output_file: where the serialized stream is stored.
    :return: list of token lists, one entry per input line.
    """
    sentence_stream = []
    for i in range(file_count):
        text_file = path_template % i
        if StoreHelper.is_file_exist(text_file):
            print("Working on %s" % text_file)
            file_content = StoreHelper.read_file(text_file)
            sentence_stream.extend(
                SegmentHelper.lemmatization(SegmentHelper.segment_text(line))
                for line in file_content.splitlines()
            )
    StoreHelper.store_data(sentence_stream, output_file)
    return sentence_stream
def run_lemmatize(src_folder, dst_folder, file_count=8535):
    """Normalize every numbered data file from one folder into another.

    For each index ``i`` in ``range(file_count)``, reads
    ``src_folder/%04d.dat``, applies ``SegmentHelper.normalize`` per line,
    and writes the joined result to the same-named file in ``dst_folder``.
    Missing inputs are reported and skipped.

    :param src_folder: directory containing the input ``NNNN.dat`` files.
    :param dst_folder: directory receiving the normalized output files.
    :param file_count: number of sequential files to process (default 8535,
        the original hard-coded corpus size).
    """
    for i in range(file_count):
        input_file = path.join(src_folder, "%04d.dat" % i)
        output_file = path.join(dst_folder, "%04d.dat" % i)
        if StoreHelper.is_file_exist(input_file):
            file_content = StoreHelper.read_file(input_file)
            new_content = [
                SegmentHelper.normalize(line)
                for line in file_content.splitlines()
            ]
            StoreHelper.save_file(os.linesep.join(new_content), output_file)
        else:
            print("%s not exist!" % input_file)