def jieba_seg(filepath):
    """Segment every row of a one-column TSV file and write the result out.

    Each row of ``filepath`` (tab-delimited) must hold exactly one field.
    That field is decoded from UTF-8, passed through
    ``string_process.remove_characters``, segmented with
    ``segment().jieba_segment`` and then passed through
    ``string_process.remove_invalid_string``.  The resulting sequence is
    written as one tab-delimited, UTF-8 encoded row to ``filepath + '.seg'``.

    Args:
        filepath: Path of the input file.  The output file is this path with
            ``'.seg'`` appended; an existing file at that path is overwritten.

    Raises:
        AssertionError: If a row does not contain exactly one field.
    """
    line_count = 0
    segmentor = segment()
    with open(filepath + '.seg', 'wb') as outputfile:
        writer = unicodecsv.writer(outputfile, delimiter='\t', encoding='utf-8')
        # The Python 2 csv module expects file objects opened with the 'b'
        # flag; text mode can mangle line endings on some platforms.
        with open(filepath, 'rb') as inputfile:
            for transaction in csv.reader(inputfile, delimiter='\t'):
                assert len(transaction) == 1, "\n%s" % (str(transaction))
                cleanedstr = string_process.remove_characters(
                    unicode(transaction[0], 'utf-8'))
                segmentres = segmentor.jieba_segment(cleanedstr)
                cleanedres = string_process.remove_invalid_string(segmentres)
                writer.writerow(cleanedres)
                line_count += 1
    # Report the number of rows actually written, once processing is done.
    logging.info('line count: %d', line_count)
def convert_to_str(string):
    """Return the segmented tokens of *string* as a sorted, '|'-joined string.

    The input is segmented with ``segmentor.jieba_segment``, the result is
    passed through ``string_process.remove_invalid_string``, and the
    remaining items are sorted and joined with ``u'|'``.

    NOTE(review): ``segmentor`` is not bound inside this function, so it must
    be resolvable from an enclosing/module scope -- confirm it is defined.
    """
    tokens = string_process.remove_invalid_string(
        segmentor.jieba_segment(string)
    )
    return u'|'.join(sorted(tokens))