def process_file(n, prefix): """Process a single file.""" filename = ngram_filename(n, prefix) output_path = os.path.join(args.output, filename) with open_file_to_process(output_path, "w") as o: if o == False: print_status("Skipped", filename) raise FileAlreadyProcessed() print_status("Processing", filename) if numeric_token(prefix): return input_path = os.path.join(args.input, filename) with open(input_path, "r") as i: for line in itertools.filterfalse(contains_digits, i): o.write(line)
def right_integrate_counts(path, n):
    """
    Given the path to a BinDB file of order n, generate an iterator over
    sorted ((n-1)gram, count) tuples created by integrating out the first
    token.
    """
    print_status("Dumping", path, "to memory")

    ngrams_number = os.path.getsize(path) // bindb.line_size(n)

    # Format specifier for the numpy matrix used for sorting the mgrams
    dtp = (
        # (n-1) * little-endian 4 byte integers with token indices
        [("w{}".format(i), "<i4") for i in range(n-1)] +
        # little-endian 8 byte integer with the count
        [("count", "<i8")]
    )

    # Progress bar code
    stages = tuple(range(1, 26))
    milestones = collections.deque(map(
        lambda s: (round(s / stages[-1] * ngrams_number), s),
        stages
    ))

    # Dump all right mgrams to a numpy array
    mgrams = numpy.zeros(ngrams_number, dtype=dtp)
    i = 0
    with open(path, "rb") as f:
        for l in bindb.iter_bindb_file(f, n):
            mgrams[i] = l.ngram[1:] + (l.count,)
            i += 1
            if i == milestones[0][0]:
                done = round(100 / milestones[-1][1] * milestones.popleft()[1])
                print_status("{done}%".format(**locals()))

    # Sort the numpy array
    print_status("Sorting right integrated {n}grams".format(**locals()))
    mgrams.sort(order=["w{}".format(i) for i in range(n-1)])
    print_status("Sorted right integrated {n}grams".format(**locals()))

    def numpy_row2bindb_line(numpy_row):
        """
        Convert a row of a numpy matrix with ngrams and counts to a BinDB
        line.
        """
        return bindb.BinDBLine(tuple(numpy_row)[:-1], numpy_row["count"])

    return integrate_counts(map(numpy_row2bindb_line, mgrams), bindb.BinDBLine)
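# A minimal sketch, assuming integrate_counts takes an iterator of BinDB
# lines sorted by ngram together with a line constructor (here
# bindb.BinDBLine) and merges runs of identical ngrams by summing their
# counts. The project's real implementation may differ; this only illustrates
# the contract relied on by right_integrate_counts above.
import itertools
import operator

def integrate_counts_sketch(lines, line_constructor):
    for ngram, group in itertools.groupby(lines,
                                          key=operator.attrgetter("ngram")):
        yield line_constructor(ngram, sum(l.count for l in group))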
def process_file(n): """Create a counts consistent BinDB table of order n.""" ngrams_filename = "{n}gram".format(**locals()) ngrams_input_path = os.path.join(args.input, ngrams_filename) ngrams_output_path = os.path.join(args.output, ngrams_filename) # The highest order table is consistent by definition if n == args.n_max: print_status("Copying", ngrams_input_path, "to", ngrams_output_path) shutil.copyfile(ngrams_input_path, ngrams_output_path) else: print_status("Creating counts-consistent {n}gram BinDB file".format( **locals())) # We need to use the already consistent table, hence reading ograms from # theoutput directory ograms_filename = "{}gram".format(n+1) ograms_path = os.path.join(args.output, ograms_filename) with open(ograms_path, "rb") as ograms_f, \ open(ngrams_input_path, "rb") as ngrams_input_f, \ open(ngrams_output_path, "wb") as ngrams_output_f: ograms = bindb.iter_bindb_file(ograms_f, n+1) # Make iterators over left and right integrated ograms left_integrated_ograms = integrate_counts( map(drop_last_token, ograms), bindb.BinDBLine ) right_integrated_ograms = right_integrate_counts(ograms_path, n+1) # Maximise counts of left and right integrated ograms integrated_ograms = maximise_counts( left_integrated_ograms, right_integrated_ograms, bindb.BinDBLine ) # Maximise counts of ngrams and integrated ograms ngrams = bindb.iter_bindb_file(ngrams_input_f, n) maximised_ngrams = maximise_counts(integrated_ograms, ngrams, bindb.BinDBLine) for l in maximised_ngrams: ngrams_output_f.write(bindb.pack_line(l, n)) print_status("Saved counts-consistent {n}gram BinDB file " "to".format(**locals()), ngrams_output_path)
from pysteg.googlebooks.ngrams_analysis import text2token_strings

# Define and parse the script arguments
parser = argparse.ArgumentParser(description=descr)
parser.add_argument("-i", "--index",
    help="represent tokens using their indices")
parser.add_argument("-n", "--normalise", action="store_true",
    help="normalise and explode tokens")
args = parser.parse_args()

# Load the index
if args.index:
    print_status("Started loading index from", args.index)
    with open(args.index, "r") as f:
        index = bindb.BinDBIndex(f)
    print_status("Finished loading index")

while True:
    try:
        text = input('--> ')
    except KeyboardInterrupt:
        print()
        break

    token_strings = text2token_strings(text)

    if args.normalise:
        token_strings = normalise_and_explode_tokens(token_strings)
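# For reference, the bindb.BinDBIndex loaded above is used elsewhere in this
# repository through two lookup methods: s2i (token string -> integer index)
# and s2p (token string -> partition name). The toy stand-in below illustrates
# only that interface; the tab-separated index file layout it assumes is
# hypothetical and may not match the real format.
class BinDBIndexSketch:
    def __init__(self, f):
        self._index = {}
        for line in f:
            token, ix, part = line.rstrip("\n").split("\t")
            self._index[token] = (int(ix), part)

    def s2i(self, token):
        # Raises KeyError for unknown tokens, as relied on by callers
        return self._index[token][0]

    def s2p(self, token):
        return self._index[token][1]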
def process_file(n, prefix): """ Process a single file. Since ngrams will change size and partition, they will be appended to existing files containing ngram counts from other prefix files. As a result, changes introduces by partial processing of a file cannot be rolled back easily -- there is no progress tracking, the whole script needs to be restarted from scratch if interrupted midway. """ filename = ngram_filename(n, prefix) path = os.path.join(args.input, filename) print_status("Processing", filename) # Dictionary of all possible output files out = dict() with open(path, "r") as i: for line in i: l_original = line.split("\t") # Normalise and explode original tokens l = tuple(normalise_and_explode_token(t) for t in l_original[:-1]) # Count the exploded size of each original token s = tuple(len(t) for t in l) # Discard ngrams with empty original edge tokens - a lower order # ngram already handles these counts if s[0] == 0 or s[-1] == 0: continue # There are at least two original tokens, so both edge tokens exist if n >= 2: # Count the total exploded size of middle original tokens, these # have to be included in the output middle_s = sum(s[1:-1]) # Count the maximum number of normalised tokens that can come # from the original edge tokens max_edge_s = args.n_max - middle_s # There are too many exploded middle tokens -- the normalised # ngram including at least one normalised token from each # original edge token would be beyond the order of the model if max_edge_s < 2: continue # Flatten the original middle tokens l_middle = tuple(itertools.chain.from_iterable(l[1:-1])) # Consider every combination of normalised edge tokens -- they # need to be adjacent to the middle tokens for ls in range(1,min(max_edge_s,s[0])+1): for rs in range(1,min(max_edge_s-ls,s[-1])+1): output_ngram(l[0][-ls:] + l_middle + l[-1][:rs], l_original[-1], out) # There is only one original token else: for start in range(s[0]): for stop in range(start+1, min(start+args.n_max,s[0])+1): output_ngram(l[0][start:stop], l_original[-1], out) close_output_files(out) print_status("Finished", filename)
def write_ngrams_table(n, prefixes):
    """Write the ngram counts table for a particular n."""

    def pref_path(pref):
        """Give the path to a prefix file."""
        return os.path.join(args.input, ngram_filename(n, pref))

    # Prepare a part2pref dictionary of prefixes corresponding to partitions
    part2pref = {part: set() for part in BS_PARTITION_NAMES}
    for pref in prefixes:
        # Determine which prefix files actually exist. This introduces a race
        # condition, however the assumption is that the database will not be
        # modified while this script is running.
        if os.path.exists(pref_path(pref)):
            if pref in BS_SPECIAL_PREFIXES:
                part2pref["_"].add(pref)
            else:
                part2pref[pref[0]].add(pref)

    # Format specifier for a line of the bindb file
    fmt = bindb.fmt(n)

    # Format specifier for the numpy matrix used for sorting the ngrams
    dtp = (
        # n * little-endian 4 byte integers with token indices
        [("w{}".format(i), "<i4") for i in range(n)] +
        # little-endian 8 byte integer with the ngram count
        [("f", "<i8")]
    )

    # Create the bindb file
    output_path = os.path.join(args.output, "{n}gram".format(**locals()))
    with open(output_path, "wb") as fo:
        # Go over the prefix files for each possible partition
        for part in BS_PARTITION_NAMES:
            # Sort the set of prefixes which will contribute to this
            # partition to take advantage of partial sorting (ngrams
            # belonging to the same prefix will still be adjacent in the
            # sorted partition)
            prefs = sorted(part2pref[part])

            # Calculate the maximum number of ngrams in the partition by
            # counting the total number of lines in each prefix file
            ngrams_maxn = sum(
                sum(1 for line in open(pref_path(pref), "r"))
                for pref in prefs
            )

            # Create a numpy array that can contain all potential ngrams
            ngrams = zeros(ngrams_maxn, dtype=dtp)

            # Read one by one the prefix files corresponding to the partition
            i = 0
            for pref in prefs:
                # Simultaneously read ngrams from the prefix file and write
                # those which don't match to the error file
                filename = ngram_filename(n, pref)
                input_path = os.path.join(args.input, filename)
                error_path = os.path.join(args.error, filename)
                with open(input_path, "r") as fi, open(error_path, "w") as fe:
                    for line in fi:
                        ngram = line[:-1].split("\t")
                        try:
                            # Translate all tokens to their indices
                            ixs = tuple(map(index.s2i, ngram[:-1]))
                            # Assert that the partition is correct
                            assert index.s2p(ngram[0]) == part
                            # Add the ngram
                            ngrams[i] = ixs + (int(ngram[-1]),)
                            i += 1
                        # If the partition doesn't match or the token cannot
                        # be found in the index
                        except (AssertionError, KeyError):
                            fe.write(line)
                print_status("Read and indexed ngrams from", input_path)
            ngrams_n = i

            # Sort the partition
            ngrams = ngrams[:ngrams_n]
            ngrams.sort(order=["w{}".format(i) for i in range(n)])
            print_status(ngrams_n, "ngrams sorted")

            # Write lines to the binary counts file
            out_count = 0
            current_ngram = tuple()
            current_f = 0
            for i in range(ngrams_n):
                ngram_i = tuple(ngrams[i])[:-1]

                # Compare this ngram to the currently deduplicated ngram
                if ngram_i == current_ngram:
                    current_f += ngrams[i]["f"]
                else:
                    if i != 0:
                        fo.write(struct.pack(fmt, *current_ngram + (current_f,)))
                        out_count += 1
                    current_ngram = ngram_i
                    current_f = ngrams[i]["f"]

                # Write a line in the last loop iteration
                if i == ngrams_n - 1:
                    fo.write(struct.pack(fmt, *current_ngram + (current_f,)))
                    out_count += 1

            print_status(out_count, "ngrams integrated and saved to",
                         output_path)
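# Illustration of the BinDB binary line layout implied by the numpy dtype
# above: n little-endian 4-byte token indices followed by one little-endian
# 8-byte count. The exact format string returned by bindb.fmt and the value
# of bindb.line_size are assumptions based on that dtype, not confirmed
# implementations.
import struct

def fmt_sketch(n):
    return "<" + "i" * n + "q"   # e.g. "<iiiq" for a 3gram line

def line_size_sketch(n):
    return 4 * n + 8             # bytes per packed line

packed = struct.pack(fmt_sketch(3), 17, 4242, 9000, 123456789)
assert len(packed) == line_size_sketch(3)
*ngram_ixs, count = struct.unpack(fmt_sketch(3), packed)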