def save_labels(self): review_txt = "" content_list = self.fu.get_content_list() print 'get content list' grams_list = [] for content in content_list: grams_list.append(get_2_grams(content)) print 'get grams list' label_list = [] content_len = len(content_list) for x in xrange(0,content_len): label_list.append(0) print 'start labeling' for i in xrange(0,content_len): grams_a = grams_list[i] for j in xrange(i+1,content_len): grams_b = grams_list[j] sim = jaccard_distance(grams_a, grams_b) if sim >= 0.9: print "sim is : " , sim label_list[i] = 1 label_list[j] = 1 with open(self.old_file + '36') as fp: lines = fp.readlines() for index, line in enumerate(lines): product_id = product_list[index] review_txt += lines[index].replace('\n', '') + '\t' + str(label_list[index]) +'\n' with open(self.new_file + '37', 'w') as fp: fp.write(review_txt)
def write_review_distance_to_file(q, l, name, dirname='jaccard_distance'): """ 多进程方法 获取一个grams_pair的数组 计算数组中每个元素的distance 然后存入自己的dis_list中 在任务完成之后,写入自己的进程对应的文件中 """ print 'starting process %s' % name # dis_list = [] if not path.exists(dirname): makedirs(dirname) count = 0 while True: dis_list = [] l.acquire() if q.empty(): l.release() time.sleep(0.01) continue else: grams_pair_list = q.get() if grams_pair_list == 'STOP': print 'process', name, ' exit' l.release() break l.release() print 'process', name, 'have got ', len(grams_pair_list), 'reviews' # dis_list = [] with open(dirname + '/jd.' + str(name) + '_' + str(count), 'w') as fp: # fp_list = ast.literal_eval(fp_list) for grams in grams_pair_list: jaccard_distance = summary_plot.jaccard_distance( grams[0], grams[1]) dis_list.append(jaccard_distance) # print name, len(total_list) fp.write(str(dis_list)) count += 1
def write_review_distance_to_file(q, l, name, dirname="jaccard_distance"): """ 多进程方法 获取一个grams_pair的数组 计算数组中每个元素的distance 然后存入自己的dis_list中 在任务完成之后,写入自己的进程对应的文件中 """ print "starting process %s" % name # dis_list = [] if not path.exists(dirname): makedirs(dirname) count = 0 while True: dis_list = [] l.acquire() if q.empty(): l.release() time.sleep(0.01) continue else: grams_pair_list = q.get() if grams_pair_list == "STOP": print "process", name, " exit" l.release() break l.release() print "process", name, "have got ", len(grams_pair_list), "reviews" # dis_list = [] with open(dirname + "/jd." + str(name) + "_" + str(count), "w") as fp: # fp_list = ast.literal_eval(fp_list) for grams in grams_pair_list: jaccard_distance = summary_plot.jaccard_distance(grams[0], grams[1]) dis_list.append(jaccard_distance) # print name, len(total_list) fp.write(str(dis_list)) count += 1