fout.write('%s %s %s\n' % (did, len(docs[inum]), ' '.join(map(str, docs[inum])))) fout.close() print('Preprocess finished ...') # dssm_corp_input = dstdir + 'corpus_preprocessed.txt' # dssm_corp_output = dstdir + 'corpus_preprocessed_dssm.txt' word_dict_input = dstdir + 'word_dict.txt' triletter_dict_output = dstdir + 'triletter_dict.txt' word_triletter_output = dstdir + 'word_triletter_map.txt' word_dict = read_dict(word_dict_input) word_triletter_map = {} triletter_stats = {} for wid, word in word_dict.items(): nword = '#' + word + '#' ngrams = NgramUtil.ngrams(list(nword), 3, '') word_triletter_map[wid] = [] for tric in ngrams: if tric not in triletter_stats: triletter_stats[tric] = 0 triletter_stats[tric] += 1 word_triletter_map[wid].append(tric) triletter_dict = filter_triletter(triletter_stats, 5, 10000) with open(triletter_dict_output, 'w') as f: for tri_id, tric in triletter_dict.items(): print(tri_id, tric, file=f) with open(word_triletter_output, 'w') as f: for wid, trics in word_triletter_map.items(): print(wid, ' '.join([ str(triletter_dict[k]) for k in trics
# Dump the preprocessed corpus: one line per document holding its id, the
# number of entries in the document, and the entries joined by spaces.
for inum, did in enumerate(dids):
    doc = docs[inum]
    fout.write('%s %s %s\n' % (did, len(doc), ' '.join(map(str, doc))))
fout.close()
print('Preprocess finished ...')

# dssm_corp_input = dstdir + 'corpus_preprocessed.txt'
# dssm_corp_output = dstdir + 'corpus_preprocessed_dssm.txt'
word_dict_input = dstdir + 'word_dict.txt'
triletter_dict_output = dstdir + 'triletter_dict.txt'
word_triletter_output = dstdir + 'word_triletter_map.txt'

word_dict = read_dict(word_dict_input)

# word id -> list of tri-letters produced for that word.
word_triletter_map = {}
# tri-letter -> number of times it was produced over the whole word dict.
triletter_stats = {}
for wid, word in word_dict.items():
    # Pad the word with '#' on both sides so that the leading and trailing
    # characters get tri-letters of their own.
    nword = '#' + word + '#'
    # NOTE(review): presumably returns the character 3-grams of the padded
    # word, joined with the empty string -- confirm against NgramUtil.ngrams.
    ngrams = NgramUtil.ngrams(list(nword), 3, '')
    trics = []
    for tric in ngrams:
        triletter_stats[tric] = triletter_stats.get(tric, 0) + 1
        trics.append(tric)
    word_triletter_map[wid] = trics

# NOTE(review): presumably keeps only tri-letters whose count falls between
# 5 and 10000 -- confirm against filter_triletter. The result is indexed by
# tri-letter string below, so its keys are tri-letters.
triletter_dict = filter_triletter(triletter_stats, 5, 10000)

with open(triletter_dict_output, 'w') as f:
    # Each line is "<key> <value>" of triletter_dict. The file handle must
    # only be passed through ``file=``; passing it positionally as well
    # would write its repr at the start of every line.
    for tric, tri_id in triletter_dict.items():
        print(tric, tri_id, file=f)

with open(word_triletter_output, 'w') as f:
    # Each line is the word id followed by the mapped values of its
    # tri-letters; tri-letters missing from triletter_dict are skipped.
    for wid, trics in word_triletter_map.items():
        kept = [str(triletter_dict[k]) for k in trics if k in triletter_dict]
        print(wid, ' '.join(kept), file=f)

print('Triletter Processing finished ...')