def sentences_producer(queue, data):
    """
    Put (uid, sentences) tuples extracted from research papers onto the queue.

    :param queue: queue object receiving (uid, sentences) tuples
    :param data: data generator object exposing get_datum()
    :return: None
    """
    processed = 0
    started_at = time()
    proc = current_process()
    logger.info('Running process: {} with pid: {}'.format(proc.name, proc.pid))
    for datum in data.get_datum():
        processed += 1
        if processed % 1000 == 0:
            logger.info(
                'Approx: {} files have been processed so far'.format(processed))
        paper = ResearchPaper(r_paper=datum)
        title_sentences = paper.get_r_paper_title()
        paper_uid = paper.get_r_paper_id()
        # tuple with first term as uid and second term as sentences
        if title_sentences:
            queue.put((paper_uid, title_sentences))
    finished_at = time()
    logger.info('Exiting process {} with pid: {}'.format(proc.name, proc.pid))
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(
        proc.name, proc.pid, finished_at - started_at))
def generate_interim_vocabulary(word_queue, result_queue):
    """
    Consume word lists from word_queue, batch them through a bounded buffer,
    and push deduplicated interim vocabularies (lists of unique words) onto
    result_queue whenever the buffer overflows and once more when the queue
    drains.

    :param word_queue: queue producing lists of words
    :param result_queue: queue receiving deduplicated word lists
    :return: None
    """
    in_queue = word_queue
    buffer = Buffer(max_size=1000)
    p = current_process()
    start_time = time()
    logger.info('Running process: {} with pid: {}'.format(p.name, p.pid))
    try:
        while True:
            words = in_queue.get(timeout=100)
            if words is None or len(words) == 0:
                continue
            words_set = set(words)
            try:
                buffer.add(words_set)
            except OverflowError:
                # Buffer is full: flatten and dedupe everything collected so
                # far, then ship it to the results queue.
                tmp_array = list()
                for chunk in buffer.buffer_data:
                    tmp_array.extend(chunk)
                buffer.clear()
                results = list(set(tmp_array))
                logger.info(
                    'Writing {} results to results queue for Process: {} with pid: {}'
                    .format(len(results), p.name, p.pid))
                result_queue.put(results)
                # Bug fix: keep the set that triggered the overflow — it was
                # silently dropped before. Re-adding is harmless even if
                # Buffer.add inserted before raising, because each flush is
                # deduplicated through set() above.
                buffer.add(words_set)
    # this block of code is executed if in_queue is empty for 100 seconds
    except Empty:
        tmp_array = list()
        if buffer.size() > 0:
            for chunk in buffer.buffer_data:
                tmp_array.extend(chunk)
            buffer.clear()
        logger.info(
            "No data found in the words_queue for last 100 seconds")
        results = list(set(tmp_array))
        logger.info(
            "Emptying left over results in the buffer of size {}".format(
                len(results)))
        result_queue.put(results)
    end_time = time()
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(
        p.name, p.pid, end_time - start_time))
def corpus_to_words_producer(corp_queue, word_queue):
    """
    Read corpora from corp_queue, normalize each one into a stemmed word
    list, and forward the result to word_queue. Terminates once corp_queue
    has been empty for 60 seconds.

    :param corp_queue: queue producing corpus documents
    :param word_queue: queue receiving stemmed word lists
    :return: None
    """
    source = corp_queue
    sink = word_queue
    proc = current_process()
    started = time()
    logger.info('Running process: {} with pid: {}'.format(proc.name, proc.pid))
    try:
        while True:
            document = source.get(timeout=60)
            # Normalization pipeline: unique words -> strip hyperlinks,
            # special chars and numbers -> drop short words and stop words
            # -> stem.
            words = DataPreProcessing.unique_words(document=document)
            words = DataPreProcessing.remove_hyperlinks(words)
            words = DataPreProcessing.remove_special_chars(words)
            words = DataPreProcessing.remove_numbers(words)
            words = DataPreProcessing.remove_by_length(words, length=2)
            words = DataPreProcessing.remove_stop_words(words=words)
            sink.put(DataPreProcessing.stemmer(words))
            # logger.info('Placed words of size: {} in words queue'.format(len(words)))
    except Empty:
        logger.info(
            'For Process: {} with pid: {} No data found in the corpus queue for the last 60 seconds, '
            'preparing to terminate'.format(proc.name, proc.pid))
    finished = time()
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(
        proc.name, proc.pid, finished - started))
def corpus_to_words_producer(corp_queue, word_queue):
    """
    Read (uid, corpus) pairs from corp_queue, compute per-document word
    counts, and place (uid, word_count_dict) pairs onto word_queue.
    Terminates once corp_queue has been empty for 60 seconds.

    NOTE(review): this duplicates the name of the list-producing variant; a
    __main__ section elsewhere references corpus_to_words_count_producer —
    confirm the intended function name before merging.

    :param corp_queue: queue producing (uid, corpus) pairs
    :param word_queue: queue receiving (uid, word_count_dict) pairs
    :return: None
    """
    source = corp_queue
    sink = word_queue
    proc = current_process()
    started = time()
    logger.info('Running process: {} with pid: {}'.format(proc.name, proc.pid))
    try:
        while True:
            pair = source.get(timeout=60)
            uid, corpus = pair[0], pair[1]
            counts = DataPreProcessing.word_counts(
                document=corpus,
                remove_hyperlinks=True,
                remove_special_chars=True,
                remove_numbers=True,
                remove_chars_by_length=True,
                remove_char_length=2,
                remove_stop_words=True)
            sink.put((uid, counts))
            # logger.info('Placed words of size: {} in words queue'.format(len(words)))
    except Empty:
        logger.info(
            'For Process: {} with pid: {} No data found in the corpus queue for the last 60 seconds, '
            'preparing to terminate'.format(proc.name, proc.pid))
    finished = time()
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(
        proc.name, proc.pid, finished - started))
            words_queue,
        ),
        name='corpus to word process') for i in range(10)
]
# 10 consumers that collapse word lists into interim vocabularies
generate_vocabulary_process = [
    Process(target=generate_interim_vocabulary,
            args=(
                words_queue,
                results_queue,
            ),
            name='generate_interim_vocabulary process') for i in range(10)
]

# Starting all Process
# starting data generation process
# NOTE(review): pid is logged before start(), so process.pid is still None
# in these 'Starting ...' messages — confirm whether logging after start()
# was intended.
logger.info('Starting {} process having pid {}'.format(
    data_gen_process.name, data_gen_process.pid))
data_gen_process.start()

# starting corpus to word process
for process in corpus_to_words_process:
    process.daemon = True
    logger.info('Starting {} process having pid {}'.format(
        process.name, process.pid))
    process.start()

# starting generate vocabulary process
for process in generate_vocabulary_process:
    process.daemon = True
    logger.info('Starting {} process having pid {}'.format(
        process.name, process.pid))
    process.start()
def generate_interim_inverted_index(word_queue, result_queue):
    """
    Consume (uid, word_counts) pairs from word_queue, batch them through a
    bounded buffer, and fold each batch into the shared InvertedIndex. The
    interim index is pushed onto result_queue (and reset) whenever the buffer
    overflows, and once more when the queue drains.

    :param word_queue: queue producing (uid, word_counts) pairs
    :param result_queue: queue receiving interim inverted-index dicts
    :return: None
    """
    in_queue = word_queue
    buffer = Buffer(max_size=100)
    p = current_process()
    start_time = time()
    logger.info('Running process: {} with pid: {}'.format(p.name, p.pid))

    def _flush_buffer():
        """Fold all buffered (uid, word_counts) pairs into the inverted
        index, clear the buffer, and return the {uid: word_counts} batch."""
        items_dict = dict()
        for buffered in buffer.buffer_data:
            # store the uid as key and words count as values
            items_dict[buffered[0]] = buffered[1]
        InvertedIndex.update_inverted_index(items=items_dict, field='title')
        buffer.clear()
        return items_dict

    try:
        while True:
            item = in_queue.get(timeout=100)
            if len(item) == 0:
                continue
            try:
                buffer.add(item)
            except OverflowError:
                items_dict = _flush_buffer()
                logger.info(
                    'Created inverted_index of size {} in the Process: {} with pid: {}'
                    .format(len(items_dict), p.name, p.pid))
                logger.info(
                    'Current size of Intermediate Inverted index is {} in Process {} with pid:{}'
                    .format(InvertedIndex.get_length(), p.name, p.pid))
                logger.info(
                    'Placing the inverted index of size {} to results queue'.
                    format(InvertedIndex.get_length()))
                result_queue.put(InvertedIndex.inverted_index)
                InvertedIndex.reset_inverted_index()
                # Bug fix: keep the item that triggered the overflow — it was
                # silently dropped before (the handler also shadowed the loop
                # variable). A possible duplicate add is harmless: the batch
                # is folded into a dict keyed by uid on flush.
                buffer.add(item)
    # this block of code is executed if in_queue is empty for 100 seconds
    except Empty:
        items_dict = dict()
        if buffer.size() > 0:
            items_dict = _flush_buffer()
        logger.info(
            "No data found in the words_queue for last 100 seconds")
        logger.info(
            "Emptying left over results in the buffer of size {} in Process with pid: {}"
            .format(len(items_dict), p.name, p.pid))
        logger.info(
            'Inverted Index is updated with the left over items in the buffer'
        )
        logger.info(
            'Inverted Index of size {} is placed in results queue'.format(
                InvertedIndex.get_length()))
        result_queue.put(InvertedIndex.inverted_index)
    end_time = time()
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(
        p.name, p.pid, end_time - start_time))
            words_queue,
        ),
        name='corpus to word process') for i in range(5)
]
# 5 consumers that fold word counts into the interim inverted index
generate_interim_index_process = [
    Process(target=generate_interim_inverted_index,
            args=(
                words_queue,
                results_queue,
            ),
            name='generate_interim_vocabulary process') for i in range(5)
]

# Starting all Process
# starting data generation process
# NOTE(review): pid is logged before start(), so process.pid is still None
# in these 'Starting ...' messages — confirm whether logging after start()
# was intended.
logger.info('Starting {} process having pid {}'.format(
    data_gen_process.name, data_gen_process.pid))
data_gen_process.start()

# starting corpus to word process
for process in corpus_to_words_process:
    process.daemon = True
    logger.info('Starting {} process having pid {}'.format(
        process.name, process.pid))
    process.start()

# starting generate vocabulary process
for process in generate_interim_index_process:
    process.daemon = True
    logger.info('Starting {} process having pid {}'.format(
        process.name, process.pid))
    process.start()
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(p.name, p.pid, end_time - start_time))


if __name__ == "__main__":
    corpus_queue = Queue(maxsize=100)
    words_count_queue = Queue()
    # accumulator merging per-document word counts from all workers
    global_word_count_dict = dict()
    data_gen_process = Process(target=sentences_producer, args=(corpus_queue, LoadData(), ), name='sentence producer')
    # NOTE(review): corpus_to_words_count_producer is referenced here, but the
    # visible worker definitions are named corpus_to_words_producer — confirm
    # the name matches the actual definition in this module.
    corpus_to_words_count_process = [Process(target=corpus_to_words_count_producer, args=(corpus_queue, words_count_queue, ), name='corpus to words count process') for i in range(10)]

    # Starting all Process
    # starting data generation process
    # NOTE(review): pid is logged before start(), so it is still None here
    logger.info('Starting {} process having pid {}'.format(data_gen_process.name, data_gen_process.pid))
    data_gen_process.start()

    # starting corpus to words count process
    for process in corpus_to_words_count_process:
        process.daemon = True
        logger.info('Starting {} process having pid {}'.format(process.name, process.pid))
        process.start()

    corpus_vocabulary = set()
    # Drain worker results while any worker is alive or results remain queued
    while any([process.is_alive() for process in corpus_to_words_count_process]) or words_count_queue.empty() is False:
        try:
            words_count_dict = words_count_queue.get(timeout=300)
            for word, word_count in words_count_dict.items():
                if word not in global_word_count_dict: