Example #1
def process_file(n, prefix):
    """Process a single file."""

    filename = ngram_filename(n, prefix)
    output_path = os.path.join(args.output, filename)

    with open_file_to_process(output_path, "w") as o:
        if o is False:
            print_status("Skipped", filename)
            raise FileAlreadyProcessed()

        print_status("Processing", filename)

        if numeric_token(prefix):
            return

        input_path = os.path.join(args.input, filename)
        with open(input_path, "r") as i:
            for line in itertools.filterfalse(contains_digits, i):
                o.write(line)
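
The helpers used above are not shown in this excerpt. A minimal sketch of plausible implementations, assuming open_file_to_process yields False when the output file already exists and that contains_digits / numeric_token are simple digit checks:

import contextlib
import os
import re

_DIGIT_RE = re.compile(r"\d")

@contextlib.contextmanager
def open_file_to_process(path, mode):
    # Assumed semantics: yield False if the output already exists so the
    # caller can skip it, otherwise yield a freshly opened file object.
    if os.path.exists(path):
        yield False
    else:
        with open(path, mode) as f:
            yield f

def contains_digits(line):
    # True if the line contains at least one digit character.
    return _DIGIT_RE.search(line) is not None

def numeric_token(token):
    # Assumed semantics: the token (or prefix) consists of digits only.
    return token.isdigit()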
def right_integrate_counts(path, n):
    """
    Given the path to a BinDB file of order n, generate an iterator over
    sorted ((n-1)gram, count) tuples created by integrating out the first token.
    """

    print_status("Dumping", path, "to memory")

    ngrams_number = os.path.getsize(path) // bindb.line_size(n)

    # Format specifier for the numpy matrix used for sorting the mgrams
    dtp = (
        # (n-1) * little-endian 4 byte integers with token indices
        [("w{}".format(i),"<i4") for i in range(n-1)] +
        # little-endian 8 byte integer with the count
        [("count","<i8")]
    )

    # Progress bar code
    stages = tuple(range(1,26))
    milestones = collections.deque(map(
        lambda s: (round(s / stages[-1] * ngrams_number), s), stages
    ))

    # Dump all right mgrams to a numpy array
    mgrams = numpy.zeros(ngrams_number, dtype=dtp)
    i = 0

    with open(path, "rb") as f:
        for l in bindb.iter_bindb_file(f, n):
            mgrams[i] = l.ngram[1:] + (l.count,)
            i += 1

            if i == milestones[0][0]:
                done = round(100 / milestones[-1][1] * milestones.popleft()[1])
                print_status("{done}%".format(**locals()))

    # Sort the numpy array
    print_status("Sorting right integrated {n}grams".format(**locals()))
    mgrams.sort(order=["w{}".format(i) for i in range(n-1)])
    print_status("Sorted right integrated {n}grams".format(**locals()))

    def numpy_row2bindb_line(numpy_row):
        """
        Convert row of a numpy matrix with ngrams and counts to a BinDB line.
        """
        return bindb.BinDBLine(tuple(numpy_row)[:-1], numpy_row["count"])

    return integrate_counts(map(numpy_row2bindb_line, mgrams), bindb.BinDBLine)
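
integrate_counts itself is not shown in these excerpts. Judging from its call sites (an ngram-sorted stream of BinDB lines plus a line constructor), a hedged sketch of what it might do:

import itertools

def integrate_counts(lines, line_type):
    # Assumed semantics: the input lines are sorted by ngram; merge each run
    # of identical ngrams into one line whose count is the sum of the run.
    for ngram, group in itertools.groupby(lines, key=lambda l: l.ngram):
        yield line_type(ngram, sum(l.count for l in group))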
def process_file(n):
    """Create a counts consistent BinDB table of order n."""
    ngrams_filename = "{n}gram".format(**locals())
    ngrams_input_path = os.path.join(args.input, ngrams_filename)
    ngrams_output_path = os.path.join(args.output, ngrams_filename)

    # The highest order table is consistent by definition
    if n == args.n_max:
        print_status("Copying", ngrams_input_path, "to", ngrams_output_path)
        shutil.copyfile(ngrams_input_path, ngrams_output_path)
    else:
        print_status("Creating counts-consistent {n}gram BinDB file".format(
            **locals()))

        # We need to use the already consistent table, hence reading ograms from
        # the output directory
        ograms_filename = "{}gram".format(n+1)
        ograms_path = os.path.join(args.output, ograms_filename)

        with open(ograms_path, "rb") as ograms_f, \
             open(ngrams_input_path, "rb") as ngrams_input_f, \
             open(ngrams_output_path, "wb") as ngrams_output_f:

            ograms = bindb.iter_bindb_file(ograms_f, n+1)

            # Make iterators over left and right integrated ograms
            left_integrated_ograms = integrate_counts(
                map(drop_last_token, ograms), bindb.BinDBLine
            )
            right_integrated_ograms = right_integrate_counts(ograms_path, n+1)

            # Maximise counts of left and right integrated ograms
            integrated_ograms = maximise_counts(
                left_integrated_ograms, right_integrated_ograms, bindb.BinDBLine
            )

            # Maximise counts of ngrams and integrated ograms
            ngrams = bindb.iter_bindb_file(ngrams_input_f, n)
            maximised_ngrams = maximise_counts(integrated_ograms, ngrams,
                                               bindb.BinDBLine)

            for l in maximised_ngrams:
                ngrams_output_f.write(bindb.pack_line(l, n))

    print_status("Saved counts-consistent {n}gram BinDB file "
                 "to".format(**locals()), ngrams_output_path)
Example #4
from pysteg.googlebooks.ngrams_analysis import text2token_strings

# Define and parse the script arguments
parser = argparse.ArgumentParser(description=descr)
parser.add_argument("-i",
                    "--index",
                    help="represent tokens using their indices")
parser.add_argument("-n",
                    "--normalise",
                    action="store_true",
                    help="normalise and explode tokens")
args = parser.parse_args()

# Load the index
if args.index:
    print_status("Started loading index from", args.index)
    with open(args.index, "r") as f:
        index = bindb.BinDBIndex(f)
    print_status("Finished loading index")

while True:
    try:
        text = input('--> ')
    except KeyboardInterrupt:
        print()
        break

    token_strings = text2token_strings(text)

    if args.normalise:
        token_strings = normalise_and_explode_tokens(token_strings)
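
The excerpt ends before the loop produces any output. A hypothetical continuation of the loop body (not from the original source), assuming the goal is to print the token strings, or their integer indices via BinDBIndex.s2i when --index was supplied:

    # Hypothetical continuation of the loop body
    if args.index:
        print(" ".join(str(index.s2i(t)) for t in token_strings))
    else:
        print(" ".join(token_strings))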
Example #5
def process_file(n, prefix):
    """
    Process a single file. Since ngrams will change size and partition, they
    will be appended to existing files containing ngram counts from other prefix
    files. As a result, changes introduced by partial processing of a file
    cannot be rolled back easily -- there is no progress tracking; the whole
    script needs to be restarted from scratch if interrupted midway.
    """

    filename = ngram_filename(n, prefix)
    path = os.path.join(args.input, filename)

    print_status("Processing", filename)

    # Dictionary of all possible output files
    out = dict()

    with open(path, "r") as i:
        for line in i:
            l_original = line.split("\t")

            # Normalise and explode original tokens
            l = tuple(normalise_and_explode_token(t) for t in l_original[:-1])

            # Count the exploded size of each original token
            s = tuple(len(t) for t in l)

            # Discard ngrams with empty original edge tokens - a lower order
            # ngram already handles these counts
            if s[0] == 0 or s[-1] == 0:
                continue

            # There are at least two original tokens, so both edge tokens exist
            if n >= 2:
                # Count the total exploded size of the middle original tokens;
                # these have to be included in the output
                middle_s = sum(s[1:-1])

                # Count the maximum number of normalised tokens that can come
                # from the original edge tokens
                max_edge_s = args.n_max - middle_s

                # There are too many exploded middle tokens -- the normalised
                # ngram including at least one normalised token from each
                # original edge token would be beyond the order of the model
                if max_edge_s < 2:
                    continue

                # Flatten the original middle tokens
                l_middle = tuple(itertools.chain.from_iterable(l[1:-1]))

                # Consider every combination of normalised edge tokens -- they
                # need to be adjacent to the middle tokens
                for ls in range(1,min(max_edge_s,s[0])+1):
                    for rs in range(1,min(max_edge_s-ls,s[-1])+1):
                        output_ngram(l[0][-ls:] + l_middle + l[-1][:rs],
                                     l_original[-1], out)

            # There is only one original token
            else:
                for start in range(s[0]):
                    for stop in range(start+1, min(start+args.n_max,s[0])+1):
                        output_ngram(l[0][start:stop], l_original[-1], out)

    close_output_files(out)

    print_status("Finished", filename)
def write_ngrams_table(n, prefixes):
    """Writes ngrams counts table for a particular n."""
    def pref_path(pref):
        """Give path to a prefix file."""
        return os.path.join(args.input, ngram_filename(n, pref))

    # Prepare a part2pref dictionary of prefixes corresponding to partitions
    part2pref = {part: set() for part in BS_PARTITION_NAMES}
    for pref in prefixes:
        # Determine which prefix files actually exist. This introduces a race
        # condition; however, the assumption is that the database will not be
        # modified while this script is running.
        if os.path.exists(pref_path(pref)):
            if pref in BS_SPECIAL_PREFIXES:
                part2pref["_"].add(pref)
            else:
                part2pref[pref[0]].add(pref)

    # Format specifier for a line of the bindb file
    fmt = bindb.fmt(n)

    # Format specifier for the numpy matrix used for sorting the ngrams
    dtp = (
        # n * little-endian 4 byte integers with token indices
        [("w{}".format(i), "<i4") for i in range(n)] +
        # little-endian 8 byte integer with ngram count
        [("f", "<i8")])

    # Create the bindb file
    output_path = os.path.join(args.output, "{n}gram".format(**locals()))
    with open(output_path, "wb") as fo:
        # Go over the prefix files for each possible partition
        for part in BS_PARTITION_NAMES:
            # Sort the set of prefixes which will contribute to this partition
            # to take advantage of partial sorting (ngrams belonging to the same
            # prefix will still be adjacent in the sorted partition)
            prefs = sorted(part2pref[part])

            # Calculate the maximum number of ngrams in the partition by
            # counting the total number of lines in each prefix file
            ngrams_maxn = sum(
                sum(1 for line in open(pref_path(pref), "r"))
                for pref in prefs)

            # Create a numpy array that can contain all potential ngrams
            ngrams = zeros(ngrams_maxn, dtype=dtp)

            # Read the prefix files corresponding to the partition one by one
            i = 0
            for pref in prefs:
                # Simultaneously read ngrams from the prefix file and write
                # those which don't match to the error file
                filename = ngram_filename(n, pref)
                input_path = os.path.join(args.input, filename)
                error_path = os.path.join(args.error, filename)
                with open(input_path, "r") as fi, open(error_path, "w") as fe:
                    for line in fi:
                        ngram = line[:-1].split("\t")
                        try:
                            # Translate all tokens to their indices
                            ixs = tuple(map(index.s2i, ngram[:-1]))
                            # Assert that the partition is correct
                            assert (index.s2p(ngram[0]) == part)
                            # Add the ngram
                            ngrams[i] = ixs + (int(ngram[-1]), )
                            i += 1
                        # If the partition doesn't match or the token cannot be
                        # found in the index
                        except (AssertionError, KeyError):
                            fe.write(line)
                print_status("Read and indexed ngrams from", input_path)
            ngrams_n = i

            # Sort the partition
            ngrams = ngrams[:ngrams_n]
            ngrams.sort(order=["w{}".format(i) for i in range(n)])
            print_status(ngrams_n, "ngrams sorted")

            # Write lines to the binary counts file
            out_count = 0
            current_ngram = tuple()
            current_f = 0
            for i in range(ngrams_n):
                ngram_i = tuple(ngrams[i])[:-1]

                # Compare this ngram to the currently deduplicated ngram
                if ngram_i == current_ngram:
                    current_f += ngrams[i]["f"]
                else:
                    if i != 0:
                        fo.write(
                            struct.pack(fmt, *current_ngram + (current_f, )))
                        out_count += 1
                    current_ngram = ngram_i
                    current_f = ngrams[i]["f"]

                # Write a line in the last loop iteration
                if i == ngrams_n - 1:
                    fo.write(struct.pack(fmt, *current_ngram + (current_f, )))
                    out_count += 1

            print_status(out_count, "ngrams integrated and saved to",
                         output_path)
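
For reference, bindb.fmt(n) and bindb.line_size(n) presumably mirror the numpy dtype used above (n little-endian 4-byte token indices followed by one little-endian 8-byte count). A sketch of what they would return under that assumption:

import struct

def fmt(n):
    # "<" disables padding: n 4-byte signed ints, then one 8-byte signed int
    return "<" + "i" * n + "q"

def line_size(n):
    # Bytes per packed BinDB line of order n: 4 * n + 8
    return struct.calcsize(fmt(n))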