Ejemplo n.º 1
0
def search_tokens():
    """Interactively print training sequences that contain a queried token.

    The number passed to ``Wikipedia.training_sequences`` is read from
    ``sys.argv[2]``. Each query from ``interactive_sequence_generator`` is
    turned into a ``Token``: a query starting with a space has that space
    removed and the token's second argument set to ``True``, otherwise the
    query is used unchanged with ``False``.
    """
    n = int(sys.argv[2])
    tokenizer = Tokenizer()
    for query in interactive_sequence_generator():
        # presumably the flag marks a token preceded by a space -- confirm
        # against the Token class.
        leading_space = query.startswith(' ')
        token_text = query[1:] if leading_space else query
        query_token = Token(token_text, leading_space)
        for sequence in Wikipedia.training_sequences(n):
            # Print every sequence whose tokenization contains the token.
            if query_token in tokenizer.tokenize(sequence):
                print(sequence)
        predicted = reinsert_punctuation(segmented, sequence)
        #print(predicted)
        predicted = self.postprocessor.correct(predicted)
        return predicted


if __name__ == "__main__":
    # Two modes:
    #   * benchmark mode: <script> <benchmark_name> <subset_key>
    #     reads the corrupt benchmark sequences and records predictions
    #   * interactive mode (no arguments): sequences come from
    #     interactive_sequence_generator() and nothing is recorded
    if len(sys.argv) > 1:
        if len(sys.argv) < 3:
            # Benchmark mode reads both sys.argv[1] and sys.argv[2]; a single
            # argument used to crash with an IndexError on the subset lookup.
            sys.exit("usage: %s [benchmark_name subset]" % sys.argv[0])
        benchmark_name = sys.argv[1]
        subset = SUBSETS[sys.argv[2]]
        benchmark = Benchmark(benchmark_name, subset)
        sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
        writer = PredictionsFileWriter(benchmark.get_results_directory() +
                                       "wordsegment.txt")
    else:
        sequences = interactive_sequence_generator()
        writer = None

    segmenter = WordSegment()

    for sequence in sequences:
        start_time = timestamp()
        try:
            predicted = segmenter.correct(sequence)
        except RecursionError:
            # Best effort: if correction exceeds the recursion limit, fall
            # back to the unmodified input sequence.
            predicted = sequence
        runtime = time_diff(start_time)
        print(predicted)
        if writer is not None:
            writer.add(predicted, runtime)
Ejemplo n.º 3
0
    """Roachdale is a town in Franklin and Jackson townships, Putnam County, in the U.S. state of Indiana. The population was 926 at the 2010 census.""",
    """He works for Prof. Dr. Prename Lastname.""",
    """She was entitled Dr. Prename Lastname.""",
    """He is born in Washington D.C. in the U.S.A. and lived there.""",
    """I did three things, e.g. one thing.""",
    """I did more, e. g. another thing.""",
    """Read sentences, i.e. this sentence.""",
    """She met Mr. Lastname and Mrs. Lastname at their house.""",
    """The vote elected Mr. Lastname as president.""",
    """The vote elected Mrs. Lastname as president.""",
    """Prename Lastname (ca. 1950-2000) lived."""
]

if __name__ == "__main__":
    # Local import keeps this script block self-contained.
    from itertools import islice

    # Choose the input paragraphs from the command line flags:
    #   "i" -> interactive input
    #   "t" -> the first (up to) 1000 paragraphs from the wiki training file
    #   otherwise the built-in test_sequences
    if "i" in sys.argv:
        paragraphs = interactive_sequence_generator()
    elif "t" in sys.argv:
        wiki_paragraphs = read_sequences(paths.WIKI_TRAINING_PARAGRAPHS)
        # islice stops early when fewer than 1000 paragraphs are available;
        # the previous manual next() loop raised StopIteration in that case.
        paragraphs = list(islice(wiki_paragraphs, 1000))
    else:
        paragraphs = test_sequences

    # Choose the sentence splitter implementation; NLTK is the default.
    if "spacy" in sys.argv:
        splitter = SpacySentenceSplitter()
    elif "wiki" in sys.argv:
        print("loading wiki punkt tokenizer...")
        splitter = WikiPunktTokenizer()
    else:
        splitter = NLTKSentenceSplitter()
Ejemplo n.º 4
0
def query_unigrams():
    """Print the ``UnigramHolder.get`` result for each interactive query."""
    unigrams = UnigramHolder()
    for query in interactive_sequence_generator():
        looked_up = unigrams.get(query)
        print(looked_up)
import project
from src.postprocessing.bigram import BigramPostprocessor
from src.interactive.sequence_generator import interactive_sequence_generator

if __name__ == "__main__":
    # Build the postprocessor once and report the sizes of its n-gram tables.
    postprocessor = BigramPostprocessor()
    n_unigrams = len(postprocessor.unigrams)
    print("%i unigrams" % n_unigrams)
    n_bigrams = len(postprocessor.bigrams)
    print("%i bigrams" % n_bigrams)
    # Correct every sequence entered interactively and echo the result.
    for sequence in interactive_sequence_generator():
        print(postprocessor.correct(sequence))
Ejemplo n.º 6
0
import sys

from project import src
from src.helper.pickle import load_object
from src.settings import paths
from src.helper.data_structures import sort_dict_by_value
from src.interactive.sequence_generator import interactive_sequence_generator

if __name__ == "__main__":
    # Load the character frequency dictionary and derive each character's
    # rank, i.e. its position in the list returned by sort_dict_by_value.
    frequencies = load_object(paths.CHARACTER_FREQUENCY_DICT)
    sorted_frequencies = sort_dict_by_value(frequencies)
    ranks = {char: i for i, (char, _) in enumerate(sorted_frequencies)}

    if len(sys.argv) > 1:
        # Batch mode: print the first n entries of the sorted list.
        print_top_n = int(sys.argv[1])
        # NOTE(review): the rank table above unpacks each entry as
        # (char, frequency) while this loop unpacks it as (frequency, char).
        # One of the two is mislabeled; the printed column order is kept
        # unchanged until the tuple layout of sort_dict_by_value is confirmed.
        for i, (frequency,
                char) in enumerate(sorted_frequencies[:print_top_n]):
            print(i, char, frequency)
    else:
        # Interactive mode: look up rank and frequency for each input.
        for char in interactive_sequence_generator():
            if char not in frequencies or char not in ranks:
                # Inputs missing from the dictionary used to end the session
                # with a KeyError; report them and keep going instead.
                print("unknown character: %r" % char)
                continue
            freq = frequencies[char]
            rank = ranks[char]
            print("rank %i (frequency %i)" % (rank, freq))