def process_file(descr):
    """Process a single file."""
    while not allowed_to_dispatch():
        time.sleep(300)

    n, prefix = descr
    local_root = args.output
    remote_root = "http://storage.googleapis.com/books/ngrams/books/"
    filename = ngram_filename(n, prefix)
    local_path = os.path.join(local_root, filename)
    remote_path = urllib.parse.urljoin(remote_root, filename + ".gz")

    def print_status(message, filename):
        time = datetime.datetime.now()
        print("{time} {message} {filename}".format(**locals()))

    with open_file_to_process(local_path, "wb") as f:
        if f is False:
            print_status("Skipped", filename)
            raise FileAlreadyProcessed()
        print_status("Processing", filename)

        # Generate iterators over ngrams
        source_ngrams = iter_remote_gzip(remote_path)
        processed_ngrams = integrate_pure_ngram_counts(source_ngrams, n)

        # Save the integrated ngram counts to a file
        ngrams_iter2file(processed_ngrams, f)

        print_status("Finished", filename)
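# `open_file_to_process` and `FileAlreadyProcessed` are defined elsewhere in
# the project and are not shown in this excerpt. The sketch below is only an
# assumption inferred from the call sites above: the context manager yields
# False when the target file already exists (so the caller can skip it) and
# otherwise yields a freshly opened file object. The claiming logic here is
# illustrative, not the project's actual code.
import contextlib
import os


class FileAlreadyProcessed(Exception):
    """Raised by callers when the file has already been claimed."""


@contextlib.contextmanager
def open_file_to_process(path, mode):
    if os.path.exists(path):
        # The file exists, so another worker has already claimed it
        yield False
        return
    with open(path, mode) as f:
        yield f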
def process_file(n, prefix):
    """Process a single file."""
    filename = ngram_filename(n, prefix)
    output_path = os.path.join(args.output, filename)

    with open_file_to_process(output_path, "w") as o:
        if o is False:
            print_status("Skipped", filename)
            raise FileAlreadyProcessed()
        print_status("Processing", filename)

        # Ngrams with a numeric prefix are dropped entirely; the empty output
        # file still marks this prefix as processed
        if numeric_token(prefix):
            return

        input_path = os.path.join(args.input, filename)
        with open(input_path, "r") as i:
            for line in itertools.filterfalse(contains_digits, i):
                o.write(line)
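# `numeric_token` and `contains_digits` are small helpers defined elsewhere.
# Judging only by how they are used above, they might look roughly like the
# sketch below: a prefix made entirely of digits is skipped, and any line
# containing a digit anywhere is dropped by filterfalse. Treat these as
# assumptions rather than the project's exact definitions.
def numeric_token(token):
    """Return True if the token consists of digits only."""
    return token.isdigit()


def contains_digits(line):
    """Return True if any character of the line is a digit."""
    return any(c.isdigit() for c in line)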
def output_ngram(l, count, out):
    """
    Output a normalised ngram to an appropriate file.

    The input ngram includes empty tokens.
    """
    n = len(l)

    # See if an appropriate output file is already open
    prefix = normalised_token_prefix(l[0], n)
    if (n, prefix) not in out:
        # Close all files if too many are open. The elegant way would be to
        # maintain the files in the order of last access and close only the
        # one that was accessed the longest time ago, but this hack works for
        # now and efficiency is not key in this script.
        if len(out) > 1000:
            close_output_files(out)
        filename = ngram_filename(n, prefix)
        path = os.path.join(args.output, filename)
        out[(n, prefix)] = open(path, "a")

    # Write the ngram to the output file
    out[(n, prefix)].write("\t".join(l + (count,)))
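# `close_output_files` is referenced above but defined elsewhere. A plausible
# minimal implementation, assuming `out` maps (n, prefix) keys to open file
# objects, simply closes every file and empties the dictionary so that files
# are reopened (in append mode) on demand. This is a sketch, not the
# project's actual helper.
def close_output_files(out):
    """Close all output files and forget about them."""
    for f in out.values():
        f.close()
    out.clear()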
with open(args.ngrams, "r") as f:
    ngrams = json.load(f)

# Partitions are the 1gram prefixes ordered alphabetically
partitions = sorted(ngrams["1"])
partitions_set = frozenset(partitions)

# Dictionary holding cumulative frequency ranges in each partition
cumfreq_ranges = {}
for n in sorted(ngrams.keys()):
    # Calculate total frequencies in each partition
    cumfreqs = {}
    for prefix in ngrams[n]:
        partition = get_partition(prefix, partitions_set)
        path = os.path.join(args.input, ngram_filename(n, prefix))
        cumfreqs[partition] = (cumfreqs.get(partition, 0)
                               + calculate_cumfreq(path))
        print("Counted cumulative frequency for FILE {path}".format(
            **locals()))

    # Calculate cumulative frequency ranges in each partition
    cumfreq_ranges[n] = {}
    cumfreq = 0
    for partition in partitions:
        cumfreq_ranges[n][partition] = (cumfreq,
                                        cumfreq + cumfreqs[partition])
        cumfreq += cumfreqs[partition]

with open(args.output, "w") as f:
    json.dump(cumfreq_ranges, f)
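# `calculate_cumfreq` is not shown in this excerpt. Based on how it is used
# above, it is assumed to return the total frequency stored in a single
# prefix file, i.e. the sum of the count column (the last tab-separated field
# on each line). This is an illustrative sketch only.
def calculate_cumfreq(path):
    """Sum the counts in a tab-separated ngram count file."""
    total = 0
    with open(path, "r") as f:
        for line in f:
            total += int(line.rstrip("\n").split("\t")[-1])
    return total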
def upload_ngrams(n, prefixes, index_ranges, cumfreq_ranges):
    """Upload ngrams for a particular n to the PostgreSQL database."""

    def get_column_definitions(n):
        return ",\n".join(
            map(lambda x: "w{} INTEGER".format(x), range(1, n + 1)))

    def get_column_names(n):
        return ", ".join(map(lambda x: "w{}".format(x), range(1, n + 1)))

    # Generate table and column definitions
    table = get_table_name(args.dataset, "{n}grams".format(**locals()))
    context_table = get_table_name(
        args.dataset, "{n}grams__context".format(**locals()))
    column_definitions = get_column_definitions(n)
    columns = get_column_names(n)
    # Context tables are one word shorter. Define their columns here (rather
    # than inside the table-creation branch) so they are still available when
    # that step has already been completed in a previous run.
    context_column_definitions = get_column_definitions(n - 1)
    context_columns = get_column_names(n - 1)

    if not is_completed("{n}grams_create_parent_tables".format(**locals())):
        # Create parent ngrams table
        cur.execute("""
            DROP TABLE IF EXISTS {table} CASCADE;
            CREATE TABLE {table} (
                i SERIAL,
                {column_definitions},
                cf1 BIGINT,
                cf2 BIGINT
            );
            """.format(**locals()))
        print("Created TABLE {table}".format(**locals()))

        # Create parent context table
        if n > 1:
            cur.execute("""
                DROP TABLE IF EXISTS {context_table} CASCADE;
                CREATE TABLE {context_table} (
                    i SERIAL,
                    {context_column_definitions},
                    cf1 BIGINT,
                    cf2 BIGINT
                );
                """.format(**locals()))
        else:
            cur.execute("""
                DROP TABLE IF EXISTS {context_table};
                CREATE TABLE {context_table} (
                    i SERIAL PRIMARY KEY,
                    cf1 BIGINT,
                    cf2 BIGINT
                );
                """.format(**locals()))
        print("Created context TABLE {context_table}".format(**locals()))

        # Commit defining parent tables
        conn.commit()
        complete("{n}grams_create_parent_tables".format(**locals()))

    # Populate respective partition tables
    for partition in sorted(prefixes.keys()):
        if is_completed(
                "{n}grams_{partition}_analyse_partition".format(**locals())):
            continue

        # Define various properties of the partition table, such as its name
        # and the range of data it is supposed to contain
        partition_table = get_table_name(
            args.dataset, "{n}grams_{partition}".format(**locals()))
        index_range = index_ranges[partition]
        cumfreq_range = cumfreq_ranges[partition]
        # The context partition table is needed both when creating the tables
        # and when populating and indexing them, so name it up front
        if n > 1:
            context_partition_table = get_table_name(
                args.dataset,
                "{n}grams_{partition}__context".format(**locals()))

        if not is_completed(
                "{n}grams_{partition}_create_tables".format(**locals())):
            # Create the partition table
            cur.execute("""
                DROP TABLE IF EXISTS {partition_table};
                CREATE TABLE {partition_table} (
                    PRIMARY KEY (i),
                    CHECK (
                        w1 >= {index_range[0]} AND w1 <= {index_range[1]} AND
                        cf1 >= {cumfreq_range[0]} AND
                        cf1 <= {cumfreq_range[1]} AND
                        cf2 >= {cumfreq_range[0]} AND
                        cf2 <= {cumfreq_range[1]}
                    )
                ) INHERITS ({table});
                """.format(**locals()))
            print("Created partition TABLE {partition_table}".format(
                **locals()))

            # If n > 1, then data in the context table should be partitioned
            # too
            if n > 1:
                cur.execute("""
                    DROP TABLE IF EXISTS {context_partition_table};
                    CREATE TABLE {context_partition_table} (
                        PRIMARY KEY (i),
                        CHECK (
                            w1 >= {index_range[0]} AND
                            w1 <= {index_range[1]} AND
                            cf1 >= {cumfreq_range[0]} AND
                            cf1 <= {cumfreq_range[1]} AND
                            cf2 >= {cumfreq_range[0]} AND
                            cf2 <= {cumfreq_range[1]}
                        )
                    ) INHERITS ({context_table});
                    """.format(**locals()))
                print("Created context partition TABLE "
                      "{context_partition_table}".format(**locals()))

            # Commit creating ngrams and context partition tables
            conn.commit()
            complete("{n}grams_{partition}_create_tables".format(**locals()))

        for prefix in prefixes[partition]:
            if is_completed(
                    "{n}grams_{prefix}_analyse_prefix".format(**locals())):
                continue

            path = os.path.join(args.input, ngram_filename(n, prefix))
            raw_tmp_table = get_table_name(
                args.dataset,
                "tmp_raw__{n}grams_{prefix}".format(**locals()))
            cumfreq_tmp_table = get_table_name(
                args.dataset,
                "tmp_cumfreq__{n}grams_{prefix}".format(**locals()))

            # Copy ngrams starting with a particular prefix into a temporary
            # table and cumulate their frequencies
            cur.execute("""
                DROP TABLE IF EXISTS {raw_tmp_table};
                CREATE TABLE {raw_tmp_table} (
                    i SERIAL PRIMARY KEY,
                    {column_definitions},
                    f BIGINT
                );

                DROP TABLE IF EXISTS {cumfreq_tmp_table};
                CREATE TABLE {cumfreq_tmp_table} (
                    i SERIAL PRIMARY KEY,
                    {column_definitions},
                    cf1 BIGINT,
                    cf2 BIGINT
                );

                COPY {raw_tmp_table} ({columns}, f) FROM %s;

                INSERT INTO {cumfreq_tmp_table} ({columns}, cf1, cf2)
                SELECT
                    {columns},
                    sum(f) OVER (ORDER BY {columns} ASC) - f
                        + (SELECT coalesce(max(cf2), 0) FROM {table}) AS cf1,
                    sum(f) OVER (ORDER BY {columns} ASC)
                        + (SELECT coalesce(max(cf2), 0) FROM {table}) AS cf2
                FROM {raw_tmp_table};

                DROP TABLE {raw_tmp_table};
                """.format(**locals()), (path, ))
            print("Copied FILE {path} to TABLE {cumfreq_tmp_table}".format(
                **locals()))

            # Insert ngrams with this prefix into the partition table
            cur.execute("""
                INSERT INTO {partition_table} ({columns}, cf1, cf2)
                SELECT {columns}, cf1, cf2
                FROM {cumfreq_tmp_table}
                ORDER BY i ASC;
                """.format(**locals()))
            print("Copied TABLE {cumfreq_tmp_table} to TABLE "
                  "{partition_table}".format(**locals()))

            # Insert ngrams with this prefix into the context partition table
            if n > 1:
                cur.execute("""
                    INSERT INTO {context_partition_table}
                        ({context_columns}, cf1, cf2)
                    SELECT {context_columns},
                           min(cf1) AS cf1,
                           max(cf2) AS cf2
                    FROM {cumfreq_tmp_table}
                    GROUP BY {context_columns}
                    -- This is much faster than "ORDER BY min(i)"; worth
                    -- investigating why
                    ORDER BY {context_columns} ASC;
                    """.format(**locals()))
                print("Cumulated and copied TABLE {cumfreq_tmp_table} to "
                      "TABLE {context_partition_table}".format(**locals()))

            cur.execute("""
                DROP TABLE {cumfreq_tmp_table};
                """.format(**locals()))

            # Commit changes due to processing a single prefix file
            conn.commit()
            complete("{n}grams_{prefix}_analyse_prefix".format(**locals()))

        # Index the ngrams partition table. Making the index on columns
        # unique ensures that no leaves of the probability tree are
        # duplicated.
        cur.execute("""
            CREATE UNIQUE INDEX ON {partition_table}
                USING btree ({columns}) WITH (fillfactor = 100);
            CREATE UNIQUE INDEX ON {partition_table}
                USING btree (cf1, cf2) WITH (fillfactor = 100);
            """.format(**locals()))
        print("Created UNIQUE INDEXES on ({columns}) and (cf1, cf2) in TABLE "
              "{partition_table}".format(**locals()))

        # Index the ngrams context partition table. Since ngrams are added
        # from the prefix files sequentially, if two ngrams starting with the
        # same (w1, ..., w(n-1)) were wrongly put in different prefix files,
        # an error will occur here. Ngrams starting with the same (w1, ...,
        # w(n-2)) are not a problem, since we will always query for
        # P(w(n) | w1, ..., w(n-1)).
        if n > 1:
            cur.execute("""
                CREATE UNIQUE INDEX ON {context_partition_table}
                    USING btree ({context_columns}) WITH (fillfactor = 100);
                """.format(**locals()))
            print("Created UNIQUE INDEX on ({context_columns}) in TABLE "
                  "{context_partition_table}".format(**locals()))

        # Commit indexing ngrams and context tables after processing all
        # corresponding prefix files
        conn.commit()
        complete("{n}grams_{partition}_analyse_partition".format(**locals()))

    # Create context for 1grams
    if n == 1:
        cur.execute("""
            INSERT INTO {context_table} (cf1, cf2)
            SELECT min(cf1) AS cf1, max(cf2) AS cf2 FROM {table};
            """.format(**locals()))
        print("Cumulated and copied TABLE {table} to TABLE "
              "{context_table}".format(**locals()))

        # Commit creating context for 1grams
        conn.commit()

    complete("{n}grams_analyse".format(**locals()))
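# The `is_completed`/`complete` pair used throughout `upload_ngrams` is the
# project's progress-tracking mechanism and is defined elsewhere. One simple
# way to implement it, sketched here purely as an assumption, is to persist
# the set of completed step names in a text file so that an interrupted run
# can be resumed without redoing already-committed work. The file name is
# hypothetical.
PROGRESS_PATH = "progress.log"


def is_completed(step):
    """Return True if the named step was recorded as completed."""
    try:
        with open(PROGRESS_PATH, "r") as f:
            return step in {line.rstrip("\n") for line in f}
    except FileNotFoundError:
        return False


def complete(step):
    """Record the named step as completed."""
    with open(PROGRESS_PATH, "a") as f:
        f.write(step + "\n")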
def process_file(n, prefix):
    """
    Process a single file.

    Since ngrams will change size and partition, they will be appended to
    existing files containing ngram counts from other prefix files. As a
    result, changes introduced by partial processing of a file cannot be
    rolled back easily -- there is no progress tracking, so the whole script
    needs to be restarted from scratch if interrupted midway.
    """
    filename = ngram_filename(n, prefix)
    path = os.path.join(args.input, filename)
    print_status("Processing", filename)

    # Dictionary of all possible output files
    out = dict()

    with open(path, "r") as i:
        for line in i:
            l_original = line.split("\t")

            # Normalise and explode original tokens
            l = tuple(normalise_and_explode_token(t)
                      for t in l_original[:-1])

            # Count the exploded size of each original token
            s = tuple(len(t) for t in l)

            # Discard ngrams with empty original edge tokens - a lower order
            # ngram already handles these counts
            if s[0] == 0 or s[-1] == 0:
                continue

            # There are at least two original tokens, so both edge tokens
            # exist
            if n >= 2:
                # Count the total exploded size of middle original tokens,
                # these have to be included in the output
                middle_s = sum(s[1:-1])

                # Count the maximum number of normalised tokens that can
                # come from the original edge tokens
                max_edge_s = args.n_max - middle_s

                # There are too many exploded middle tokens -- the
                # normalised ngram including at least one normalised token
                # from each original edge token would be beyond the order of
                # the model
                if max_edge_s < 2:
                    continue

                # Flatten the original middle tokens
                l_middle = tuple(itertools.chain.from_iterable(l[1:-1]))

                # Consider every combination of normalised edge tokens --
                # they need to be adjacent to the middle tokens
                for ls in range(1, min(max_edge_s, s[0]) + 1):
                    for rs in range(1, min(max_edge_s - ls, s[-1]) + 1):
                        output_ngram(l[0][-ls:] + l_middle + l[-1][:rs],
                                     l_original[-1], out)

            # There is only one original token
            else:
                for start in range(s[0]):
                    for stop in range(start + 1,
                                      min(start + args.n_max, s[0]) + 1):
                        output_ngram(l[0][start:stop], l_original[-1], out)

    close_output_files(out)
    print_status("Finished", filename)
# partition "_" that words from BS_SPECIAL_PREFIXES belong to. part2pref = {p:(p,) for p in ngrams["1"] if p not in BS_SPECIAL_PREFIXES} part2pref["_"] = BS_SPECIAL_PREFIXES # Verify that the implicitly created partitions are correct assert(set(part2pref.keys()) == set(BS_PARTITION_NAMES)) # Go over all partitions and read words from the corresponding prefix files gen_index = count(1) with open(args.output, "w") as fo: for part in BS_PARTITION_NAMES: # Initialise all words if part == "_": words = {"_START_", "_END_"} else: words = set() # Read words from respective prefix files for pref in part2pref[part]: path = os.path.join(args.input, ngram_filename(1, pref)) if os.path.isfile(path): with open(path, "r") as fi: for line in fi: words.add(line.split("\t")[0]) print("Read words from {path}".format(**locals())) # Dump words to the index file for w, i in zip(sorted(words), gen_index): fo.write("{i}\t{w}\t{part}\n".format(**locals())) print("Dumped {part} partition".format(**locals()))
def write_ngrams_table(n, prefixes):
    """Write the ngram counts table for a particular n."""

    def pref_path(pref):
        """Give the path to a prefix file."""
        return os.path.join(args.input, ngram_filename(n, pref))

    # Prepare a part2pref dictionary of prefixes corresponding to partitions
    part2pref = {part: set() for part in BS_PARTITION_NAMES}
    for pref in prefixes:
        # Determine which prefix files actually exist. This introduces a race
        # condition, however the assumption is that the database will not be
        # modified while this script is running.
        if os.path.exists(pref_path(pref)):
            if pref in BS_SPECIAL_PREFIXES:
                part2pref["_"].add(pref)
            else:
                part2pref[pref[0]].add(pref)

    # Format specifier for a line of the bindb file
    fmt = bindb.fmt(n)

    # Format specifier for the numpy matrix used for sorting the ngrams
    dtp = (
        # n * little-endian 4 byte integers with token indices
        [("w{}".format(i), "<i4") for i in range(n)] +
        # little-endian 8 byte integer with ngram count
        [("f", "<i8")])

    # Create the bindb file
    output_path = os.path.join(args.output, "{n}gram".format(**locals()))
    with open(output_path, "wb") as fo:
        # Go over the prefix files for each possible partition
        for part in BS_PARTITION_NAMES:
            # Sort the set of prefixes which will contribute to this
            # partition to take advantage of partial sorting (ngrams
            # belonging to the same prefix will still be adjacent in the
            # sorted partition)
            prefs = sorted(part2pref[part])

            # Calculate the maximum number of ngrams in the partition by
            # counting the total number of lines in each prefix file
            ngrams_maxn = sum(
                sum(1 for line in open(pref_path(pref), "r"))
                for pref in prefs)

            # Create a numpy array that can contain all potential ngrams
            ngrams = zeros(ngrams_maxn, dtype=dtp)

            # Read one by one the prefix files corresponding to the partition
            i = 0
            for pref in prefs:
                # Simultaneously read ngrams from the prefix file and write
                # those which don't match to the error file
                filename = ngram_filename(n, pref)
                input_path = os.path.join(args.input, filename)
                error_path = os.path.join(args.error, filename)
                with open(input_path, "r") as fi, open(error_path, "w") as fe:
                    for line in fi:
                        ngram = line[:-1].split("\t")
                        try:
                            # Translate all tokens to their indices
                            ixs = tuple(map(index.s2i, ngram[:-1]))
                            # Assert that the partition is correct
                            assert index.s2p(ngram[0]) == part
                            # Add the ngram
                            ngrams[i] = ixs + (int(ngram[-1]),)
                            i += 1
                        # If the partition doesn't match or the token cannot
                        # be found in the index
                        except (AssertionError, KeyError):
                            fe.write(line)
                print_status("Read and indexed ngrams from", input_path)
            ngrams_n = i

            # Sort the partition
            ngrams = ngrams[:ngrams_n]
            ngrams.sort(order=["w{}".format(i) for i in range(n)])
            print_status(ngrams_n, "ngrams sorted")

            # Write lines to the binary counts file
            out_count = 0
            current_ngram = tuple()
            current_f = 0
            for i in range(ngrams_n):
                ngram_i = tuple(ngrams[i])[:-1]
                # Compare this ngram to the currently deduplicated ngram
                if ngram_i == current_ngram:
                    current_f += ngrams[i]["f"]
                else:
                    if i != 0:
                        fo.write(
                            struct.pack(fmt, *current_ngram + (current_f,)))
                        out_count += 1
                    current_ngram = ngram_i
                    current_f = ngrams[i]["f"]
                # Write a line in the last loop iteration
                if i == ngrams_n - 1:
                    fo.write(struct.pack(fmt, *current_ngram + (current_f,)))
                    out_count += 1
            print_status(out_count, "ngrams integrated and saved to",
                         output_path)
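# `bindb.fmt(n)` supplies the struct format string for one record of the
# binary counts file. It is not shown in this excerpt, but to be consistent
# with the numpy dtype used above (n little-endian 4-byte token indices
# followed by one little-endian 8-byte count) it would have to be equivalent
# to the sketch below. The module layout is assumed for illustration.
def fmt(n):
    """Struct format for n int32 token indices plus one int64 count."""
    return "<" + "i" * n + "q"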