Example #1
def missingNumber_4(filename):
    '''
    This implementation uses multiprocessing to evaluate chunks of input
    and distribute the tally operation across the cores.
    '''

    DEFAULT_MAX_CHUNK_SIZE = 10000

    mapper = SimpleMapReduce(map_linenumbers, reducer)

    with open(filename, 'r') as fh:
        results = mapper(fh, chunksize=DEFAULT_MAX_CHUNK_SIZE)
        print(results)
    # count lines
    # get sum of all numbers
    # get Gaussian sum: n * (n + 1) / 2
    # subtract the actual sum from the Gaussian sum to get the missing number

    return results
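All four snippets on this page call into a SimpleMapReduce helper that is not shown. A minimal sketch, modeled on the well-known multiprocessing map/reduce wrapper from Doug Hellmann's PyMOTW and consistent with how the class is called here (each author's actual copy may differ slightly):

import collections
import itertools
import multiprocessing


class SimpleMapReduce:
    """Run map_func over the inputs in a process pool, group the
    emitted (key, value) pairs by key, then reduce each group."""

    def __init__(self, map_func, reduce_func, num_workers=None):
        self.map_func = map_func
        self.reduce_func = reduce_func
        self.pool = multiprocessing.Pool(num_workers)

    def partition(self, mapped_values):
        # Collect every value emitted for a key: ('word', [1, 1, ...])
        partitioned_data = collections.defaultdict(list)
        for key, value in mapped_values:
            partitioned_data[key].append(value)
        return partitioned_data.items()

    def __call__(self, inputs, chunksize=1):
        # Map in parallel, flatten the per-item lists, reduce in parallel.
        map_responses = self.pool.map(self.map_func, inputs, chunksize=chunksize)
        partitioned_data = self.partition(itertools.chain(*map_responses))
        return self.pool.map(self.reduce_func, partitioned_data)

The map_linenumbers and reducer helpers used above are also not shown. One plausible pairing, assuming the file holds the integers 1..n one per line with exactly one missing (the names come from the snippet; the bodies are guesses based on the Gaussian-sum comments):

def map_linenumbers(line):
    # Emit every parsed integer under a single key so the reducer
    # sees the whole collection at once.
    return [('all', int(line))]


def reducer(item):
    # With one value missing, the complete range is 1..len(values) + 1;
    # the Gaussian sum of that range minus the observed total is the gap.
    _, values = item
    n = len(values) + 1
    return n * (n + 1) // 2 - sum(values)

With this pairing, results is a one-element list holding the missing number.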
Example #2
                    output.append((word, 1))
    return output


def count_words(item):
    """Convert the partitioned data for a word to a
    tuple containing the word and the number of occurences.
    """
    word, occurences = item
    return (word, sum(occurences))


if __name__ == '__main__':
    import operator
    import glob
    import os
    import pathlib

    # input_files = glob.glob(os.path.join(os.path.dirname(__file__), '*.rst'))
    input_files = list(pathlib.Path(__file__).parent.glob('*.rst'))
    mapper = SimpleMapReduce(file_to_words, count_words)
    word_counts = mapper(input_files)
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()
    print('\nTOP 20 WORDS BY FREQUENCY\n')
    top20 = word_counts[:20]
    longest = max(len(word) for word, count in top20)
    for word, count in top20:
        print('{word:<{len}}: {count:5}'.format(len=longest + 1,
                                                word=word,
                                                count=count))
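This listing opens mid-function: output.append((word, 1)) is the tail of the map side. A minimal sketch of a compatible file_to_words, assuming reStructuredText inputs whose comment lines start with '..' (the stop-word list is illustrative, not from the source):

import string


def file_to_words(filename):
    """Read a file and emit a (word, 1) pair for each non-stop word."""
    STOP_WORDS = set(['a', 'an', 'and', 'are', 'as', 'be', 'for',
                      'if', 'in', 'is', 'it', 'of', 'or', 'the', 'to'])
    TR = str.maketrans({p: ' ' for p in string.punctuation})
    output = []
    with open(filename, 'rt') as f:
        for line in f:
            if line.lstrip().startswith('..'):
                continue  # skip rST comment/directive lines
            for word in line.translate(TR).lower().split():
                if word.isalpha() and word not in STOP_WORDS:
                    output.append((word, 1))
    return output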
Example #3
    output format:
    [(filename, filesize)]
    """
    return [(filename, filesize)]


def count_file_size(item):
    """Convert the partitioned data for a file to a
    tuple containing the filename and its total size in bytes.
    """
    filename, sizes = item

    return (filename, sum(sizes))


if __name__ == '__main__':
    import glob
    import operator

    input_files = glob.glob('/tmp/*.rst')

    mapper = SimpleMapReduce(get_file_size, count_file_size)
    all_filename_size_info = mapper(input_files)

    sorted_info = sorted(all_filename_size_info, key=operator.itemgetter(1))

    sorted_info.reverse()

    top_10 = sorted_info[:10]

    for filename, filesize in top_10:
        print('%s:        %d' % (filename, filesize))
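Here too only the tail of the map function survives. A compatible sketch of get_file_size; os.path.getsize is one obvious way to obtain the size, though the original may have used os.stat:

import os


def get_file_size(filename):
    """Map function: emit a single (filename, size-in-bytes) pair.

    output format:
    [(filename, filesize)]
    """
    return [(filename, os.path.getsize(filename))]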
Example #4
if not os.path.exists(jour_dir):
    os.makedirs(jour_dir)

logging.basicConfig(level=logging_level,
                    filename=os.path.join(args.logdir, "db2bibtex.log"),
                    format="%(asctime)s %(name)-10s %(levelname)-8s %(message)s",
                    filemode="a")
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter("%(name)-10s %(levelname)-8s %(message)s")
console.setFormatter(formatter)
logging.getLogger("").addHandler(console)
logging.debug("logging configuration done")


if __name__ == '__main__':

    cnx = mysql.connector.connect(**acmdldb)
    cursor = cnx.cursor(buffered=True)
    cursor.execute("SELECT iid, article_id, title, year, issn, number, volume, "
                   "venue_name, page_from, page_to, editors, publisher_name, "
                   "publisher_address, doi_number, venue_type "
                   "FROM PAPER WHERE article_id<=999000")

    dbrecs = cursor.fetchall()
    cursor.close()
    cnx.close()

    mapper = SimpleMapReduce(dbrec_to_bibtex, count_type, num_workers=50)
    bibtex_types = mapper(dbrecs)

    for typename, count in bibtex_types:
        print('{}: {}'.format(typename, count))
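Neither dbrec_to_bibtex nor count_type appears in the snippet. The reduce side almost certainly mirrors count_words from Example #2; the map side below guesses at the tally portion only (given the jour_dir setup, the real function presumably also writes a BibTeX entry per record). The journal-to-article mapping is a hypothetical illustration, not taken from the source:

def dbrec_to_bibtex(rec):
    # venue_type is the last column of the SELECT above; emit one
    # (bibtex_type, 1) pair per record so the reducer can tally.
    venue_type = rec[-1]  # hypothetical mapping for illustration only
    bibtex_type = 'article' if venue_type == 'journal' else 'inproceedings'
    return [(bibtex_type, 1)]


def count_type(item):
    # Partitioned input: (bibtex_type, [1, 1, ...]) -> (bibtex_type, count)
    typename, occurrences = item
    return (typename, sum(occurrences))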