Example #1
0
    def __init__(self, dictionary_path=None):
        """Initialize, optionally loading a unigram dictionary.

        If ``dictionary_path`` is given, a ``t4k.UnigramDictionary`` is
        loaded from it; otherwise ``self.dictionary`` is left as ``None``.
        """
        self.dictionary = None
        if dictionary_path is not None:
            self.dictionary = t4k.UnigramDictionary()
            self.dictionary.load(dictionary_path)
def generate_top_random_task(out_path, num_top, num_rand):
    """Write a shuffled CSV of candidate tokens for annotation.

    Selects the ``num_top`` most frequent not-yet-annotated tokens plus
    ``num_rand`` uniformly random not-yet-annotated tokens, labels each
    row with its source ('top', 'rand', or 'top:rand' when selected by
    both), shuffles the rows, and writes them to ``out_path`` with a
    ('token', 'source') header.
    """
    # First, figure out what words have already been annotated
    results = t4k.CrowdflowerResults(RESULTS_PATHS)
    already_annotated = {row['data']['token'] for row in results}

    # Now get a sorted dictionary for gigaword
    dictionary = t4k.UnigramDictionary()
    dictionary.load(DICTIONARY_PATH)

    # Get the top ``num_top`` words that haven't yet been annotated
    top_words = set()
    for token in dictionary.get_token_list():

        # Skip the UNK token.  BUG FIX: the original used ``is``, an
        # identity check that only worked by accident of string interning;
        # equality is the correct comparison.
        if token == 'UNK':
            continue

        # Add words that haven't been annotated before
        if token not in already_annotated:
            top_words.add(token)

        # Stop once we have enough words
        if len(top_words) >= num_top:
            break

    # Now, get ``num_rand`` uniformly randomly selected words that have not
    # been annotated.  Candidates include any non 'UNK' word that hasn't been
    # annotated before ([1:] skips 'UNK', presumably the list's first token
    # -- TODO confirm against t4k.UnigramDictionary ordering).
    candidates = set(list(dictionary.get_token_list())[1:]) - already_annotated
    rand_words = set(random.sample(candidates, num_rand))

    # Build all the rows first so that we can randomly shuffle them before
    # writing to file.
    rows = []
    for word in rand_words | top_words:
        if word in rand_words and word in top_words:
            rows.append((word, 'top:rand'))
        elif word in rand_words:
            rows.append((word, 'rand'))
        else:
            rows.append((word, 'top'))
    random.shuffle(rows)

    # Write out the headings, then write the rows.  ``with`` guarantees the
    # file is closed (the original leaked the handle).
    with open(out_path, 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(('token', 'source'))
        writer.writerows(rows)
Example #3
0
    def assert_feature_like_on_disc(self, feature_accumulator, path):
        """Assert that ``feature_accumulator`` matches the data stored at ``path``.

        Compares the accumulator's dictionary frequency list, and each of
        its feature dicts, against the serialized versions on disk.
        """
        # Test that the dictionary extracted is the same as the one on disk
        expected_dictionary = t4k.UnigramDictionary()
        expected_dictionary.load(os.path.join(path, 'dictionary'))
        self.assertEqual(
            dict(feature_accumulator.dictionary.get_frequency_list()),
            dict(expected_dictionary.get_frequency_list())
        )

        # Test that the features extracted are the same as the ones on disk.
        # ``with`` + ``json.load`` replaces the original ``open(...).read()``,
        # which leaked the file handle.
        for feature_type in ['dependency', 'baseline', 'hand_picked']:
            feature_path = os.path.join(path, feature_type + '.json')
            with open(feature_path) as feature_file:
                expected = json.load(feature_file)
            self.assertDictEqual(
                getattr(feature_accumulator, feature_type), expected
            )
def get_top_words():
    """
    Get the k most common words in gigaword for which all words were
    annotated, where k is as big as it can be.

    Walks the frequency-sorted token list and collects tokens until the
    first unannotated one is reached, skipping the 'UNK' placeholder.
    """
    all_annotated_words = get_all_annotated_words()
    dictionary = t4k.UnigramDictionary()
    dictionary.load(DICTIONARY_DIR)
    top_words = []
    # The original looped with ``enumerate`` but never used the index;
    # iterate the tokens directly.
    for token in dictionary.get_token_list():
        # Skip the out-of-vocabulary placeholder.
        if token == 'UNK':
            continue
        # Stop at the first word that was never annotated.
        if token not in all_annotated_words:
            break
        top_words.append(token)

    return top_words
Example #5
0
def generate_random_candidates(num_to_generate, out_path, exclude=None):
    """Sample distinct corpus tokens uniformly at random and write them out.

    Draws ``num_to_generate`` distinct non-'UNK' tokens from the corpus
    dictionary, skipping any token in ``exclude``, and writes them
    newline-separated to ``out_path``.

    ``exclude`` defaults to ``None`` (treated as the empty set); the
    original used a mutable default argument (``exclude=set()``), the
    classic shared-default pitfall.
    """
    if exclude is None:
        exclude = set()

    # Open the dictionary of words seen in the corpus
    dictionary_path = os.path.join(BEST_WORDNET_ONLY_FEATURES_PATH,
                                   'dictionary')
    dictionary = t4k.UnigramDictionary()
    dictionary.load(dictionary_path)

    # Rejection-sample until we have enough distinct, allowed tokens.
    samples = set()
    while len(samples) < num_to_generate:
        token = random.choice(dictionary.token_map.tokens)
        if token != 'UNK' and token not in exclude and token not in samples:
            samples.add(token)

    # ``with`` ensures the output file is closed (the original leaked it).
    with open(out_path, 'w') as out_f:
        out_f.write('\n'.join(samples))
Example #6
0
def get_dictionary(path):
    """Load and return a ``t4k.UnigramDictionary`` stored at ``path``."""
    loaded = t4k.UnigramDictionary()
    loaded.load(path)
    return loaded