Example #1
0
def sentences_producer(queue, data):
    """Produce (uid, title-sentences) tuples from research papers onto a queue.

    Iterates the data generator, wraps each raw record in a ResearchPaper,
    and enqueues the paper's id together with its title sentences whenever
    the title is present and non-empty.

    :param queue: multiprocessing queue receiving (uid, sentences) tuples
    :param data: object exposing a ``get_datum()`` generator of raw papers
    :return: None
    """
    worker = current_process()
    started = time()
    logger.info('Running process: {} with pid: {}'.format(worker.name, worker.pid))
    for processed, raw_paper in enumerate(data.get_datum(), start=1):
        # progress heartbeat every 1000 records
        if processed % 1000 == 0:
            logger.info(
                'Approx: {} files have been processed so far'.format(processed))
        paper = ResearchPaper(r_paper=raw_paper)
        title_sentences = paper.get_r_paper_title()
        paper_id = paper.get_r_paper_id()
        if title_sentences is not None and len(title_sentences) > 0:
            # tuple with first term as uid and second term as sentences
            queue.put((paper_id, title_sentences))
    finished = time()
    logger.info('Exiting process {} with pid: {}'.format(worker.name, worker.pid))
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(
        worker.name, worker.pid, finished - started))
def generate_interim_vocabulary(word_queue, result_queue):
    """Consume word lists from ``word_queue`` and emit deduplicated chunks.

    Each incoming word list is converted to a set and staged in a bounded
    Buffer. When the buffer overflows, its contents are flattened,
    deduplicated, and pushed onto ``result_queue``. The process ends once
    ``word_queue`` stays empty for 100 seconds, flushing any leftovers.

    :param word_queue: queue of word lists
    :param result_queue: queue receiving deduplicated vocabulary chunks
    :return: None
    """
    source = word_queue
    staging = Buffer(max_size=1000)
    worker = current_process()
    started = time()
    logger.info('Running process: {} with pid: {}'.format(worker.name, worker.pid))

    def _drain_staging():
        # Flatten every buffered word-set, clear the buffer, and dedupe.
        flattened = list()
        for chunk in staging.buffer_data:
            flattened.extend(chunk)
        staging.clear()
        return list(set(flattened))

    try:
        while True:
            incoming = source.get(timeout=100)
            if incoming is None or len(incoming) == 0:
                continue
            unique_incoming = set(incoming)
            try:
                staging.add(unique_incoming)
            except OverflowError:
                # Buffer is full: consolidate and ship what is staged.
                # NOTE(review): the set that triggered the overflow is not
                # retried — confirm Buffer.add semantics (whether it stores
                # the item before raising).
                results = _drain_staging()
                logger.info(
                    'Writing {} results to results queue for Process: {} with pid: {}'
                    .format(len(results), worker.name, worker.pid))
                result_queue.put(results)
    # executed once the queue has been empty for the full 100-second timeout
    except Empty:
        if staging.size() > 0:
            results = _drain_staging()
            logger.info(
                "No data found in the words_queue for last 100 seconds")
            logger.info(
                "Emptying left over results in the buffer of size {}".format(
                    len(results)))
            result_queue.put(results)
    finished = time()
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(
        worker.name, worker.pid, finished - started))
def corpus_to_words_producer(corp_queue, word_queue):
    """Transform raw corpus documents into stemmed word lists.

    Pulls documents from ``corp_queue``, runs each through the
    DataPreProcessing cleanup pipeline, and pushes the stemmed words onto
    ``word_queue``. Terminates after 60 seconds without input.

    :param corp_queue: queue of raw document strings
    :param word_queue: queue receiving cleaned, stemmed word lists
    :return: None
    """
    worker = current_process()
    started = time()
    logger.info('Running process: {} with pid: {}'.format(worker.name, worker.pid))
    try:
        while True:
            document = corp_queue.get(timeout=60)
            # Cleanup pipeline: each step narrows the word list further.
            words = DataPreProcessing.unique_words(document=document)
            words = DataPreProcessing.remove_hyperlinks(words)
            words = DataPreProcessing.remove_special_chars(words)
            words = DataPreProcessing.remove_numbers(words)
            words = DataPreProcessing.remove_by_length(words, length=2)
            words = DataPreProcessing.remove_stop_words(words=words)
            word_queue.put(DataPreProcessing.stemmer(words))
    except Empty:
        logger.info(
            'For Process: {} with pid: {} No data found in the corpus queue for the last 60 seconds, '
            'preparing to terminate'.format(worker.name, worker.pid))
    finished = time()
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(
        worker.name, worker.pid, finished - started))
Example #4
0
def corpus_to_words_producer(corp_queue, word_queue):
    """Turn (uid, corpus) items into (uid, word-count dict) items.

    Reads tagged documents from ``corp_queue``, computes cleaned word
    counts via ``DataPreProcessing.word_counts``, and forwards the result
    paired with its uid onto ``word_queue``. Terminates after 60 seconds
    without input.

    :param corp_queue: queue of (uid, corpus) items
    :param word_queue: queue receiving (uid, word_count_dict) tuples
    :return: None
    """
    worker = current_process()
    started = time()
    logger.info('Running process: {} with pid: {}'.format(worker.name, worker.pid))
    try:
        while True:
            tagged_doc = corp_queue.get(timeout=60)
            doc_id, document = tagged_doc[0], tagged_doc[1]
            # Single-call pipeline: all cleanup steps handled by word_counts.
            counts = DataPreProcessing.word_counts(
                document=document,
                remove_hyperlinks=True,
                remove_special_chars=True,
                remove_numbers=True,
                remove_chars_by_length=True,
                remove_char_length=2,
                remove_stop_words=True)
            word_queue.put((doc_id, counts))
    except Empty:
        logger.info(
            'For Process: {} with pid: {} No data found in the corpus queue for the last 60 seconds, '
            'preparing to terminate'.format(worker.name, worker.pid))
    finished = time()
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(
        worker.name, worker.pid, finished - started))
                    words_queue,
                ),
                name='corpus to word process') for i in range(10)
    ]
    generate_vocabulary_process = [
        Process(target=generate_interim_vocabulary,
                args=(
                    words_queue,
                    results_queue,
                ),
                name='generate_interim_vocabulary process') for i in range(10)
    ]

    # Starting all Process
    # starting data generation process
    logger.info('Starting {} process having pid {}'.format(
        data_gen_process.name, data_gen_process.pid))
    data_gen_process.start()
    # starting corpus to word process
    for process in corpus_to_words_process:
        process.daemon = True
        logger.info('Starting {} process having pid {}'.format(
            process.name, process.pid))
        process.start()

    # starting generate vocabulary process
    for process in generate_vocabulary_process:
        process.daemon = True
        logger.info('Starting {} process having pid {}'.format(
            process.name, process.pid))
        process.start()
Example #6
0
def generate_interim_inverted_index(word_queue, result_queue):
    """Accumulate (uid, word-count) items into an interim inverted index.

    Items are staged in a bounded Buffer; when the buffer overflows, the
    staged items are folded into InvertedIndex and the resulting index is
    pushed onto ``result_queue`` (then the index is reset). When
    ``word_queue`` stays empty for 100 seconds, any leftover staged items
    are flushed the same way and the process exits.

    :param word_queue: queue of (uid, word_count_dict) tuples
    :param result_queue: queue receiving interim inverted-index dicts
    :return: None
    """
    in_queue = word_queue
    buffer = Buffer(max_size=100)
    p = current_process()
    start_time = time()
    logger.info('Running process: {} with pid: {}'.format(p.name, p.pid))
    # store the uid as key and words count as values

    try:
        while True:
            item = in_queue.get(timeout=100)
            try:
                if len(item) != 0:
                    buffer.add(item)
            except OverflowError:
                # Buffer is full: fold staged (uid, counts) pairs into the
                # shared inverted index and ship it to the results queue.
                # NOTE(review): the item that triggered the overflow is not
                # re-added — confirm Buffer.add semantics.
                items_dict = dict()
                for staged in buffer.buffer_data:  # renamed from `item` to avoid shadowing
                    items_dict[staged[0]] = staged[1]

                InvertedIndex.update_inverted_index(items=items_dict,
                                                    field='title')
                buffer.clear()
                logger.info(
                    'Created inverted_index of size {} in the Process: {} with pid: {}'
                    .format(len(items_dict), p.name, p.pid))
                logger.info(
                    'Current size of Intermediate Inverted index is {} in Process {} with pid:{}'
                    .format(InvertedIndex.get_length(), p.name, p.pid))
                logger.info(
                    'Placing the inverted index of size {} to results queue'.
                    format(InvertedIndex.get_length()))
                result_queue.put(InvertedIndex.inverted_index)
                InvertedIndex.reset_inverted_index()

    # this block of code is executed if in_queue is empty for 100 seconds
    except Empty:
        items_dict = dict()
        if buffer.size() > 0:
            for staged in buffer.buffer_data:
                items_dict[staged[0]] = staged[1]

            InvertedIndex.update_inverted_index(items=items_dict,
                                                field='title')
            buffer.clear()
            logger.info(
                "No data found in the words_queue for last 100 seconds")
            # BUG FIX: the original format string had only two placeholders
            # for three arguments, so the process *name* was logged in the
            # pid slot and the real pid was silently dropped.
            logger.info(
                "Emptying left over results in the buffer of size {} in Process: {} with pid: {}"
                .format(len(items_dict), p.name, p.pid))
            logger.info(
                'Inverted Index is updated with the left over items in the buffer'
            )
            logger.info(
                'Inverted Index of size {} is placed in results queue'.format(
                    InvertedIndex.get_length()))
            result_queue.put(InvertedIndex.inverted_index)
    end_time = time()
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(
        p.name, p.pid, end_time - start_time))
Example #7
0
                    words_queue,
                ),
                name='corpus to word process') for i in range(5)
    ]
    generate_interim_index_process = [
        Process(target=generate_interim_inverted_index,
                args=(
                    words_queue,
                    results_queue,
                ),
                name='generate_interim_vocabulary process') for i in range(5)
    ]

    # Starting all Process
    # starting data generation process
    logger.info('Starting {} process having pid {}'.format(
        data_gen_process.name, data_gen_process.pid))
    data_gen_process.start()
    # starting corpus to word process
    for process in corpus_to_words_process:
        process.daemon = True
        logger.info('Starting {} process having pid {}'.format(
            process.name, process.pid))
        process.start()

    # starting generate vocabulary process
    for process in generate_interim_index_process:
        process.daemon = True
        logger.info('Starting {} process having pid {}'.format(
            process.name, process.pid))
        process.start()
Example #8
0
    logger.info('Process: {} with pid: {} ran for {} seconds'.format(p.name, p.pid, end_time - start_time))


if __name__ == "__main__":
    corpus_queue = Queue(maxsize=100)
    words_count_queue = Queue()
    global_word_count_dict = dict()

    data_gen_process = Process(target=sentences_producer, args=(corpus_queue, LoadData(), ), name='sentence producer')

    corpus_to_words_count_process = [Process(target=corpus_to_words_count_producer, args=(corpus_queue, words_count_queue, ),
                                       name='corpus to words count process') for i in range(10)]

    # Starting all Process
    # starting data generation process
    logger.info('Starting {} process having pid {}'.format(data_gen_process.name, data_gen_process.pid))
    data_gen_process.start()

    # starting corpus to words count process
    for process in corpus_to_words_count_process:
        process.daemon = True
        logger.info('Starting {} process having pid {}'.format(process.name, process.pid))
        process.start()

    corpus_vocabulary = set()

    while any([process.is_alive() for process in corpus_to_words_count_process]) or words_count_queue.empty() is False:
        try:
            words_count_dict = words_count_queue.get(timeout=300)
            for word, word_count in words_count_dict.items():
                if word not in global_word_count_dict: