Example #1
import os
from multiprocessing import Pool, Manager


def process(infile):
    # Prepare and load the input file.
    # (num_cpus and args are module-level globals set during argument parsing.)
    print '\nStarted job', infile
    data_file = load(infile)
    manager = Manager()
    manager.file = data_file

    # Build a pool of num_cpus worker processes; init() gives each worker
    # access to the open data file.
    pool = Pool(processes=int(num_cpus), initializer=init,
                initargs=(data_file, ))

    # Partition the job into num_cpus slices and prepare the argument
    # tuples for Map.
    offset = len(data_file) / num_cpus
    print 'partitioned job to', num_cpus, 'slices of', humansize(offset)
    partitioned_jobs = list(chunkjobs(infile, data_file, offset))
    data_file.close()

    # Generate count tuples for matched words from the dictionary.
    print 'Mapping job', infile
    single_count_tuples = pool.map(Map, partitioned_jobs)

    # Organize the count tuples: lists of tuples keyed by token.
    token_to_tuples = Partition(single_count_tuples)

    # Collapse the lists of tuples into total term frequencies.
    print 'Reducing job', infile
    term_frequencies = pool.map(Reduce, token_to_tuples.items())

    # Sort the term frequencies in nonincreasing order.
    term_frequencies.sort(tuple_sort)

    # Dump the dictionary built from the counts; we want this sorted.
    dumpdictionary(term_frequencies)

    # Output the top tokens to the console.
    print 'top %d tokens by frequency' % int(args.top)
    for index, pair in enumerate(term_frequencies[:int(args.top)]):
        print index + 1, ':', pair[0], ':', pair[1]

    # Write the full term-frequency table to disk.
    output = open('term_frequencies_' + os.path.splitext(infile)[0]
                  + '.txt', 'wt')
    for pair in term_frequencies:
        output.write(str(pair[0]) + ':  ' + str(pair[1]) + '\n')
        for n in percentages(pair):
            output.write(' %d: %3.2f%% ' % (n[0], n[1]))
        output.write('\n')
    output.close()
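The listing leans on several helpers that are defined elsewhere in the program (Partition, Reduce, tuple_sort), so their bodies are not shown here. The sketch below is one plausible shape for them, assuming Map returns a list of (token, 1) count tuples; it is illustrative only, not the program's real code.

from collections import defaultdict

def Partition(mapped_results):
    # Shuffle step: group the per-slice (token, count) tuples by token, e.g.
    # [[('a', 1), ('b', 1)], [('a', 1)]] -> {'a': [('a', 1), ('a', 1)], 'b': [('b', 1)]}
    token_to_tuples = defaultdict(list)
    for result in mapped_results:
        for token, count in result:
            token_to_tuples[token].append((token, count))
    return token_to_tuples

def Reduce(item):
    # Collapse one token's list of count tuples into a (token, total) pair.
    token, tuples = item
    return token, sum(count for _, count in tuples)

def tuple_sort(a, b):
    # cmp-style comparator for list.sort(): highest count first,
    # ties broken alphabetically by token.
    if a[1] != b[1]:
        return cmp(b[1], a[1])
    return cmp(a[0], b[0])

Because Reduce takes one (token, tuples) item at a time, it can be farmed out to the same Pool with pool.map, exactly as the listing does.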
def process(infile):
    # Prepare and load the input file.
    print '\nStarted job', infile
    data_file = load(infile)

    # Determine the slice count: either the user-supplied fslice value or
    # the number of CPUs.
    global num_cpus
    num_cpus = getcpus(data_file)
    manager = Manager()
    manager.file = data_file

    # Partition the job into num_cpus slices and report the slice size.
    print "Parsing job to sentences"
    offset = len(data_file) / num_cpus
    print 'partitioned', humansize(len(data_file)), 'job to', num_cpus, \
        'slices of', humansize(offset)

    # Build a pool of num_cpus worker processes; init() gives each worker
    # access to the open data file.
    print 'Preparing job', infile
    pool = Pool(processes=int(num_cpus), initializer=init,
                initargs=(data_file, ))

    # Map the job: each worker receives its slice index and the file name,
    # and returns count tuples for matched words from the dictionary.
    single_count_tuples = pool.map(Map, [(i, infile) for i in xrange(num_cpus)])
    data_file.close()

    # Organize the count tuples: lists of tuples keyed by token.
    token_to_tuples = Partition(single_count_tuples)

    # Collapse the lists of tuples into total term frequencies.
    print 'Reducing job', infile
    term_frequencies = pool.map(Reduce, token_to_tuples.items())

    # Sort the term frequencies in nonincreasing order.
    term_frequencies.sort(tuple_sort)

    # Dump the dictionary built from the counts; we want this sorted.
    dumpdictionary(term_frequencies)

    # Output the top term frequencies to the console.
    print 'top %d tokens by frequency' % int(args.top)
    for index, pair in enumerate(term_frequencies[:int(args.top)]):
        print '%1d %2s %3d' % (index + 1, pair[0], pair[1])

    # Dump human-readable statistics.
    dumpstats(term_frequencies)
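In this revision, Map no longer receives pre-sliced data; each worker gets only its slice index and the file name and reads its own portion of the input. The real init and Map are defined elsewhere, so the sketch below is only one plausible shape for a worker, under these assumptions: the input is plain text, tokens are whitespace-separated, and the workers are forked so that the module-level num_cpus set before the pool is created is inherited.

import os

def init(shared_file):
    # Pool initializer: runs once in every worker process and keeps a
    # reference to the shared data file (the listing passes it via initargs).
    global data_file
    data_file = shared_file

def Map(job):
    # One worker's share of the job: read this worker's slice of the input
    # file and emit a (token, 1) tuple for every whitespace-separated token.
    index, infile = job
    size = os.path.getsize(infile)
    offset = size / num_cpus
    with open(infile) as f:
        f.seek(index * offset)
        # The last slice also picks up the remainder left over by the division.
        length = offset if index < num_cpus - 1 else size - index * offset
        chunk = f.read(length)
    return [(token.lower(), 1) for token in chunk.split()]

A real Map would also need to avoid splitting tokens that straddle a slice boundary, which is presumably what the sentence-parsing step announced in the listing is for.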