def singleprocess_datanum(self, test_dict):
    """Time the single-process review-similarity pipeline for several input sizes.

    For each key ``data_num`` in *test_dict*, builds 2-gram lists for the
    first ``data_num`` reviews, then measures only the Jaccard-distance /
    relation-building phase and stores the elapsed seconds back into the dict.

    :param test_dict: dict mapping data_num (int) -> placeholder value;
        each value is overwritten with the measured elapsed seconds.
    :return: the same dict with timings filled in.
    """
    fu = file_util.FileUtil('../AmazonDataBackup/reviewsNew/reviewsNew.mP')
    fu.get_structure()
    # Hoisted out of the loop: the full content list does not depend on
    # data_num, and the original re-read it on every iteration.
    full_content_list = fu.get_content_list()
    for data_num in test_dict.keys():
        print(data_num)
        content_list = full_content_list[0:data_num]
        content_list_2_grams = summary_plot.get_2_grams_list(content_list)
        # Timing deliberately starts AFTER 2-gram construction: only the
        # distance/relation phase is benchmarked (as in the original).
        start = time.time()
        jd_list = summary_plot.get_jd_list(content_list_2_grams)
        summary_plot.get_reviews_similarity_relation(jd_list)
        finish_time = time.time() - start
        test_dict[data_num] = finish_time
    return test_dict
def draw_review_distance_multiprocess(list_num=-1, put_num=10000):
    """Fan pairwise 2-gram pairs out to worker processes via a shared queue.

    Spawns ``cpu_count() / 2`` worker processes running
    ``write_review_distance_to_file``, then enqueues all (i, j) pairs of
    review 2-gram lists in batches of *put_num*, followed by one 'STOP'
    sentinel per worker.  Returns the total elapsed seconds.

    :param list_num: slice bound for the reviews to process.
        NOTE(review): the default -1 makes ``[0:list_num]`` drop the LAST
        review rather than take all of them — confirm this is intended.
    :param put_num: number of pairs per queue batch.
    """
    q = Queue()
    l = Lock()
    fu = file_util.FileUtil()
    fu.open_file('../AmazonDataBackup/reviewsNew/reviewsNew.mP')
    # fu.open_file('../AmazonDataBackup/reviewsNew/reviewsNew103.mP')
    fu.get_structure()
    content_list = fu.get_content_list()[0:list_num]
    content_list_2_grams = summary_plot.get_2_grams_list(content_list)
    start = time.time()
    process_list = []
    # Half the logical CPUs; Python 2 integer division.
    cpu_num = cpu_count() / 2
    for i in range(0, cpu_num):
        p = Process(target=write_review_distance_to_file, args=(q, l, i),
                    kwargs={'dirname': 'jaccard_distance_220000'})
        p.start()
        process_list.append(p)
    reviews_len = len(content_list_2_grams)
    print reviews_len
    count = 0
    grams_pair_list = []
    # Enumerate every unordered pair (i, j), i < j, batching them onto the
    # queue in chunks of put_num.
    for i in range(0, reviews_len):
        for j in range(i + 1, reviews_len):
            if count != 0:
                if count % put_num == 0:
                    # Flush the current batch and restart the counter.
                    q.put(grams_pair_list)
                    grams_pair_list = []
                    count = 0
                if not q.empty() and count > put_num:
                    # NOTE(review): count is reset to 0 on every flush above,
                    # so `count > put_num` never holds and this back-pressure
                    # sleep appears unreachable — confirm intent.
                    print 'queue waiting', count
                    time.sleep(0.1)
            grams_pair = [content_list_2_grams[i], content_list_2_grams[j]]
            grams_pair_list.append(grams_pair)
            count += 1
    # Flush the final (possibly partial) batch.
    q.put(grams_pair_list)
    # One 'STOP' sentinel per worker so each consumer terminates.
    for i in range(0, cpu_num):
        q.put('STOP')
    for p in process_list:
        p.join()
    finish_time = time.time() - start
    print 'exit main with %s s' % finish_time
    return finish_time
def draw_reviewer_similarity_multiprocess():
    """Group review texts by reviewer and fan the groups out to workers.

    Spawns ``cpu_count() // 2`` processes running
    ``write_reviewer_similarity_to_file``, then walks ``fu.structure``
    (rows whose first field is the reviewer id and last field is the review
    text — presumed from usage; confirm against FileUtil), enqueuing a dict
    of {reviewer: [texts]} roughly every 1000 distinct reviewers, followed
    by one 'STOP' sentinel per worker.  Prints timing checkpoints.
    """
    q = Queue()
    l = Lock()
    start = time.time()
    fu = file_util.FileUtil()
    fu.open_file('../AmazonDataBackup/reviewsNew.txt')
    fu.get_structure()
    print('finish get_structure() with %s s' % (time.time() - start))
    process_list = []
    # Hoisted: the worker count was computed twice in the original.
    # `//` is identical to py2 int `/` here and is py3-safe.
    worker_num = cpu_count() // 2
    for i in range(0, worker_num):
        p = Process(target=write_reviewer_similarity_to_file, args=(q, l, i))
        p.start()
        process_list.append(p)
    count = 0
    reviewer_content_dict = {}
    for line in fu.structure:
        reviewer = line[0]
        # O(1) dict membership; the original's `in ...keys()` scanned a
        # freshly-built list on every row (quadratic in Python 2).
        if reviewer not in reviewer_content_dict:
            # Every 1000 NEW reviewers, flush the batch to the workers.
            if count % 1000 == 0 and count > 0:
                q.put(reviewer_content_dict)
                reviewer_content_dict = {}
                if not q.empty():
                    # Crude back-pressure: let consumers drain a little.
                    time.sleep(0.01)
            reviewer_content_dict[reviewer] = []
            count += 1
        reviewer_content_dict[reviewer].append(line[-1])
    # Flush the final (possibly partial) batch.
    q.put(reviewer_content_dict)
    # One 'STOP' sentinel per worker so each consumer terminates.
    for i in range(0, worker_num):
        q.put('STOP')
    print('finish puting with %s s' % (time.time() - start))
    for p in process_list:
        p.join()
    finish_time = time.time() - start
    print('exit main with %s s' % finish_time)
global content_2_grams_list while not exitFlag: if not workQueue.empty(): # data = q.get() content_list = q.get() content_2_grams_list = content_2_grams_list + [summary_plot.get_2_grams(content) for content in content_list] print "%s processing %s" % (threadName, len(content_2_grams_list)) queueLock.release() else: queueLock.release() def compute_jaccard_distance(threadName, q): while not exitFlag: fu = file_util.FileUtil() fu.open_file('../AmazonDataBackup/reviewsNew/reviewsNew.mP') fu.get_structure() content_list = fu.get_content_list()[0:100] threadList = ["Thread-1", "Thread-2", "Thread-3"] # nameList = ["One", "Two", "Three", "Four", "Five"] queueLock = threading.Lock() workQueue = Queue.Queue(10) threads = [] threadID = 1 # 创建新线程 for tName in threadList: thread = myThread(threadID, tName, workQueue) thread.start()