def phrase_detection(bi_gram, file_name):
    """Run a trained bi-gram phrase model over the tokens of a file.

    Each line of *file_name* is segmented and lemmatized via SegmentHelper;
    empty tokens are dropped. The flattened token list is then passed through
    ``bi_gram[...]`` (presumably a gensim ``Phrases``/``Phraser`` model —
    TODO confirm) to merge detected phrases.

    :param bi_gram: phrase model supporting ``model[token_list]``
    :param file_name: path readable by ``StoreHelper.read_file``
    :return: the phrase-merged token sequence produced by the model
    """
    tokens = []
    for line in StoreHelper.read_file(file_name).splitlines():
        # keep only non-empty tokens (truthiness replaces len(y) > 0)
        tokens.extend(
            token
            for token in SegmentHelper.lemmatization(SegmentHelper.segment_text(line))
            if token
        )
    return bi_gram[tokens]
def generate_sentence_stream(num_files=8535):
    """Build a sentence stream from the cleaned, lemmatized post corpus.

    Scans ``../data/clean_post_lemmatize/0000.dat`` .. ``%04d.dat`` for
    indices ``0..num_files-1``, skipping missing files. Every line of every
    existing file is segmented and lemmatized into one token list ("sentence").
    The full stream is persisted to ``sentence_stream.dat`` as a side effect.

    :param num_files: number of file indices to probe (default 8535, the
        historical corpus size — kept for backward compatibility)
    :return: list of token lists, one per input line
    """
    sentence_stream = []
    for i in range(num_files):
        text_file = "../data/clean_post_lemmatize/%04d.dat" % i
        # the corpus has gaps, so probe each candidate path before reading
        if StoreHelper.is_file_exist(text_file):
            print ("Working on %s" % text_file)
            for line in StoreHelper.read_file(text_file).splitlines():
                sentence_stream.append(
                    SegmentHelper.lemmatization(SegmentHelper.segment_text(line)))
    StoreHelper.store_data(sentence_stream, 'sentence_stream.dat')
    return sentence_stream
def get_frequency_dict(content):
    """Count word occurrences in *content*.

    Splits the text into lines, segments and lemmatizes each line via
    SegmentHelper, and folds all resulting tokens into a frequency mapping
    produced by ``DictHelper.dict_from_count_list``.

    :param content: raw multi-line text
    :return: word -> count mapping
    """
    tokens = [
        token
        for line in content.splitlines()
        for token in SegmentHelper.lemmatization(SegmentHelper.segment_text(line))
    ]
    return DictHelper.dict_from_count_list(tokens)
def generate_word_list(self):
    """Tokenize this position's raw text into a flat word list.

    Each line of ``self.raw_position`` is segmented and lemmatized via
    SegmentHelper; the per-line token lists are flattened into one list.

    :return: list of lemmatized tokens, in document order
    """
    return [
        word
        for line in self.raw_position.splitlines()
        for word in SegmentHelper.lemmatization(SegmentHelper.segment_text(line))
    ]