# tail of build_IDF_vocabulary(): finish counting document frequencies, then
# convert them into IDF scores (assumes math and operator are imported above)
                    voc[w] = voc[w] + 1.0
        # body_wlist = word_tokenize(q.body.strip())
        # for w in body_wlist:
        #     if w not in cur_word_set:
        #         cur_word_set.add(w)
        #         if w not in voc.keys():
        #             voc[w] = 1.0
        #         else:
        #             voc[w] = voc[w] + 1.0
        count += 1
        if count % 10000 == 0:
            print('processing %s unit...' % count, get_current_time())
    # convert raw document frequencies into IDF scores
    for key in voc.keys():
        idf = math.log(total_num / (voc[key] + 1.0))
        voc[key] = idf
    # sort by IDF ascending, so the most common words come first
    sorted_voc = sorted(voc.items(), key=operator.itemgetter(1))
    return sorted_voc


if __name__ == '__main__':
    fpath = 'idf_vocab.csv'
    header = ['word', 'idf']
    vocab = build_IDF_vocabulary()
    write_list_to_csv(vocab, fpath, header)
    print('Done.')
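# A minimal, self-contained sketch of the IDF weighting computed above. The
# counts below are made-up illustration values, not taken from the real corpus;
# total_num and doc_freq only mimic the variables used in build_IDF_vocabulary().
import math

total_num = 1000.0                            # hypothetical number of questions scanned
doc_freq = {'java': 800.0, 'deadlock': 12.0}  # hypothetical document frequencies

idf = {w: math.log(total_num / (df + 1.0)) for w, df in doc_freq.items()}
# idf['java']     ~= 0.22 -> very common word, nearly uninformative
# idf['deadlock'] ~= 4.34 -> rare word, highly informative

# sorting by IDF ascending mirrors the script: the most common words land first
# in idf_vocab.csv
sorted_voc = sorted(idf.items(), key=lambda kv: kv[1])
print(sorted_voc)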
# tail of get_summary(): return the selected sentences and their rank list
            # processed_sentence.append(sentence1)
    # summary = '\n'.join([x.capitalize() for x in selected_sentence])
    return selected_sentence, rank_list


def load_ss_result(ss_fpath):
    # load (query, selected-sentence list) pairs from the sentence-selection result CSV
    import pandas as pd
    ss_res = list()
    df = pd.read_csv(ss_fpath)
    for idx, row in df.iterrows():
        # the second column stores a Python-literal list, so parse it with eval()
        ss_res.append((row[0], eval(row[1])))
    return ss_res


if __name__ == '__main__':
    ss_fpath = os.path.join(res_dir, 'ss_res.csv')
    topk = 5
    res = list()
    # print(load_ss_result(ss_fpath))
    for query, ss in load_ss_result(ss_fpath):
        query = ' '.join(preprocessing_for_query(query))
        summary = get_summary(query, ss, topk)
        res.append([query, summary])
        print("summary\n%s" % summary)
    res_fpath = os.path.join(res_dir, 'summary_res.csv')
    header = ["query", "summary"]
    write_list_to_csv(res, res_fpath, header)
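# Illustration only: the row shape that load_ss_result() appears to expect. The
# second CSV column is parsed with eval(), so it should hold a Python-literal
# list, assumed here to be the ranked candidate sentences from the sentence-
# selection step. The query and sentences below are hypothetical, not real data.
example_row = (
    "how to convert string to int in java",
    "['Use Integer.parseInt(s) for a primitive int.', "
    "'Integer.valueOf(s) returns an Integer object instead.', "
    "'A NumberFormatException is thrown for non-numeric input.']",
)
query, ss = example_row[0], eval(example_row[1])
# get_summary(query, ss, topk) would then pick the top-k sentences as the summary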
# body of extract_java_relevant_ids_from_postlink(): collect question ids whose
# post links connect two known Java questions
    try:
        cur.execute(sql)
        results = cur.fetchall()
        for row in results:
            postId = row[2]
            related_postId = row[3]
            # keep the pair only when both ends are known Java question ids
            if postId in java_id_set and related_postId in java_id_set:
                if postId not in id_dict:
                    id_dict[postId] = True
                if related_postId not in id_dict:
                    id_dict[related_postId] = True
            cnt += 1
            if cnt % 10000 == 0:
                print('Processing %s...' % cnt, get_current_time())
    except Exception as e:
        print(e)
    cur.close()
    con.close()
    print("# relevant qid = %s" % len(id_dict), get_current_time())
    return sorted(list(id_dict.keys()))


if __name__ == '__main__':
    java_qid_set_fpath = 'java_qid_list.csv'
    java_id_set = load_java_qid_set(java_qid_set_fpath)
    related_id_list = extract_java_relevant_ids_from_postlink(java_id_set)  # post id list
    related_id_list_fpath = 'related_qid_list.txt'
    header = ['Id']
    write_list_to_csv(related_id_list, related_id_list_fpath, header)
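# Illustration only: a PostLinks query consistent with the row indexing above
# (row[2] -> PostId, row[3] -> RelatedPostId). The actual `sql` string is defined
# elsewhere in the original script; this assumed form follows the Stack Exchange
# data-dump PostLinks column order (Id, CreationDate, PostId, RelatedPostId,
# LinkTypeId) and is not the author's exact statement.
example_sql = "SELECT Id, CreationDate, PostId, RelatedPostId, LinkTypeId FROM postlinks"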