def process(infile):
    # Prepare and load the file
    print '\nStarted job', infile
    data_file = load(infile)
    manager = Manager()
    manager.file = data_file

    # Build a pool of num_cpus processes
    pool = Pool(processes=int(num_cpus), initializer=init,
                initargs=(data_file,))

    # Partition the job into appropriate slices, prepare argument tuples for Map
    offset = abs(len(data_file) / num_cpus)
    print 'partitioned job to', num_cpus, 'slices of', humansize(offset)
    partitioned_jobs = list(chunkjobs(infile, data_file, offset))
    data_file.close()

    # Generate count tuples for matched words from the dictionary
    print 'Mapping job', infile
    single_count_tuples = pool.map(Map, partitioned_jobs)

    # Organize the count tuples; lists of tuples keyed by token
    token_to_tuples = Partition(single_count_tuples)

    # Collapse the lists of tuples into total term frequencies
    print 'Reducing job', infile
    term_frequencies = pool.map(Reduce, token_to_tuples.items())

    # Sort the term frequencies in nonincreasing order
    term_frequencies.sort(tuple_sort)

    # Dump the dictionary from the count; we want this sorted
    dumpdictionary(term_frequencies)

    # Output the top term frequencies to the console
    print 'top %d tokens by frequency' % int(args.top)
    for (index, pair) in enumerate(term_frequencies[:int(args.top)]):
        print index + 1, ':', pair[0], ':', pair[1]

    # Write the full term-frequency listing to a text file
    output = open('term_frequencies_' + os.path.splitext(infile)[0] + '.txt', 'wt')
    for pair in term_frequencies:
        output.write(str(pair[0]) + ': ' + str(pair[1]) + '\n')
        for n in percentages(pair):
            output.write(' %d: %3.2f%% ' % (n[0], n[1]))
        output.write('\n')
    output.close()
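Both versions of process() lean on module-level helpers defined elsewhere in the script (load, Map, Partition, Reduce, tuple_sort, and the dump* routines). As a rough guide to what the partition/reduce/sort trio is expected to do, here is a minimal, hypothetical sketch; the bodies in the actual script may differ, and only the names and call signatures are taken from the listing above.

# Hypothetical sketches of the helpers the listing assumes; the real
# definitions live elsewhere in the script and may differ in detail.

def Partition(single_count_tuples):
    # Group (token, count) tuples from every Map slice by token:
    # {'the': [('the', 1), ('the', 1), ...], ...}
    token_to_tuples = {}
    for slice_tuples in single_count_tuples:
        for token, count in slice_tuples:
            token_to_tuples.setdefault(token, []).append((token, count))
    return token_to_tuples

def Reduce(item):
    # Collapse one token's list of tuples into a single (token, total) pair
    token, tuples = item
    return token, sum(count for _, count in tuples)

def tuple_sort(a, b):
    # cmp-style comparator: nonincreasing by count, ties broken by token
    return cmp(b[1], a[1]) or cmp(a[0], b[0])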
The revised version computes num_cpus from the loaded file, and instead of materializing the slices in the parent it hands each worker only its slice index and the file name:

def process(infile):
    # Prepare and load the file
    print '\nStarted job', infile
    data_file = load(infile)

    # Get either fslice or num_cpus
    global num_cpus
    num_cpus = getcpus(data_file)

    manager = Manager()
    manager.file = data_file

    # Partition the job into appropriate slices, prepare argument tuples for Map
    print "Parsing job to sentences"
    offset = len(data_file) / num_cpus
    print 'partitioned', humansize(len(data_file)), 'job to', num_cpus, \
        'slices of', humansize(offset)

    # Generate count tuples for matched words from the dictionary
    print 'Preparing job', infile

    # Build a pool of num_cpus processes
    pool = Pool(processes=int(num_cpus), initializer=init,
                initargs=(data_file,))

    # Map the job
    single_count_tuples = pool.map(Map, ((i, infile) for i in xrange(num_cpus)))
    data_file.close()

    # Organize the count tuples; lists of tuples keyed by token
    token_to_tuples = Partition(single_count_tuples)

    # Collapse the lists of tuples into total term frequencies
    print 'Reducing job', infile
    term_frequencies = pool.map(Reduce, token_to_tuples.items())

    # Sort the term frequencies in nonincreasing order
    term_frequencies.sort(tuple_sort)

    # Dump the dictionary from the count; we want this sorted
    dumpdictionary(term_frequencies)

    # Output the top term frequencies to the console
    print 'top %d tokens by frequency' % int(args.top)
    for (index, pair) in enumerate(term_frequencies[:int(args.top)]):
        print '%1d %2s %3d' % (index + 1, pair[0], pair[1])

    # Dump human-readable statistics
    dumpstats(term_frequencies)
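Since the second version passes only (slice_index, infile) to each Map call, the workers are expected to locate their own byte range from state seeded by the pool initializer. One plausible, purely illustrative shape for that worker side is sketched below; it assumes load() returns an mmap or string-like buffer, that init() keeps it in a module global, and that workers inherit num_cpus via fork. None of these helper bodies come from the original script.

# Hypothetical worker-side sketch for the second version; names other than
# Map and init are assumptions, not the script's actual implementation.
import re

def init(shared_file):
    # Pool initializer: remember the loaded buffer and the slice size
    global data_file, offset
    data_file = shared_file
    offset = len(data_file) / num_cpus

def Map(job):
    # job is (slice_index, infile); count tokens in this worker's byte range
    i, infile = job
    start = i * offset
    end = len(data_file) if i == num_cpus - 1 else (i + 1) * offset
    text = data_file[start:end]
    return [(token.lower(), 1) for token in re.findall(r'\w+', text)]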