Example 1
def eval_analogies(frame):
    filename = get_support_data_filename('google-analogies/questions-words.txt')
    quads = read_google_analogies(filename)
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    vecs = np.vstack([wrap.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        vector = analogy_func(frame, *prompt)
        similar = similar_to_vec(tframe, vector)
        result = None
        for match in similar.index:
            if match not in prompt:
                result = match
                break
        if result == answer:
            correct += 1
        else:
            if result not in seen_mistakes:
                print(
                    "%s : %s :: %s : [%s] (should be %s)"
                    % (quad[0], quad[1], quad[2], result, answer)
                    )
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
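The helper functions used above aren't shown in this snippet. For context, analogy_func is typically the vector-offset ("3CosAdd") combination and similar_to_vec a cosine ranking over the frame's rows; here is a minimal sketch of both, assuming the rows are L2-normalized term vectors (offset_analogy_vector and rank_by_similarity are illustrative names, not ConceptNet's actual implementations):

import numpy as np
import pandas as pd


def offset_analogy_vector(frame, a, b, c):
    # For a : b :: c : ?, probe with b - a + c, renormalized to unit length.
    vec = (frame.loc[b] - frame.loc[a] + frame.loc[c]).values
    norm = np.linalg.norm(vec)
    return vec / norm if norm > 0 else vec


def rank_by_similarity(frame, vec, limit=50):
    # With unit-length rows, cosine similarity reduces to a dot product.
    sims = frame.values.dot(vec)
    return pd.Series(sims, index=frame.index).sort_values(ascending=False)[:limit]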
Example 2
def read_mturk():
    lang1, lang2 = 'en', 'en'
    with open(get_support_data_filename('mturk/MTURK-771.csv')) as file:
        for line in file:
            term1, term2, sscore = line.split(',')
            gold_score = float(sscore)
            yield term1, term2, gold_score, lang1, lang2
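Readers like read_mturk yield (term1, term2, gold_score, ...) tuples. A sketch of how such a stream is typically consumed for a relatedness benchmark, comparing gold ratings with model scores by Spearman rank correlation (get_model_similarity is a hypothetical stand-in for the model being evaluated, not a ConceptNet function):

from scipy.stats import spearmanr


def eval_wordsim(reader, get_model_similarity):
    # Collect gold ratings and model scores for every word pair.
    gold, predicted = [], []
    for term1, term2, gold_score, *_rest in reader:
        gold.append(gold_score)
        predicted.append(get_model_similarity(term1, term2))
    # Rank correlation is the usual headline number for these datasets.
    correlation, _p_value = spearmanr(gold, predicted)
    return correlation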
Example 3
def evaluate(frame, subset='val'):
    """
    Evaluate a DataFrame containing term vectors on its ability to predict term
    relatedness, according to MEN-3000, RW, MTurk-771, and WordSim-353. Use a
    VectorSpaceWrapper to fill missing vocabulary from ConceptNet.

    Return a Series containing these labeled results.
    """
    # Make subset names consistent with other datasets
    if subset == 'dev':
        subset = 'val'
    elif subset == 'all':
        # for the final evaluation, use just the test data
        subset = 'test'
    filename = get_support_data_filename('story-cloze/cloze_test_spring2016_%s.tsv' % subset)
    vectors = VectorSpaceWrapper(frame=frame)
    total = 0
    correct = 0
    for sentences, answers in read_cloze(filename):
        text = ' '.join(sentences)
        right_answer, wrong_answer = answers
        probe_vec = vectors.text_to_vector('en', text)
        right_vec = vectors.text_to_vector('en', right_answer)
        wrong_vec = vectors.text_to_vector('en', wrong_answer)

        right_sim = cosine_similarity(probe_vec, right_vec)
        wrong_sim = cosine_similarity(probe_vec, wrong_vec)
        if right_sim > wrong_sim:
            correct += 1
        total += 1
        # print("%+4.2f %s / %s / %s" % (right_sim - wrong_sim, text, right_answer, wrong_answer))
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
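The confidence interval around the accuracy presumably comes from a binomial proportion estimate such as statsmodels' proportion_confint (the import isn't shown in the snippet). A standalone check of that call:

from statsmodels.stats.proportion import proportion_confint

# 80 correct answers out of 100 trials; 95% interval by default.
low, high = proportion_confint(80, 100)
print(round(low, 3), round(high, 3))  # approximately 0.722 and 0.878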
Example 5
def read_turk_answers_semeval2012(subset, subclass, test_questions):
    """
    A line represents one turker's answer to a given question. An answer has the
    following format:
    pair1, pair2, pair3, pair4, least_prototypical_pair, most_prototypical_pair, relation_name

    This function returns two dictionaries:
      * pairqnum2least - maps a (question number, pair) key to the number of
        times that pair was rated least prototypical for that question
      * pairqnum2most - maps a (question number, pair) key to the number of
        times that pair was rated most prototypical for that question
    """
    filename = 'semeval12-2/{}/Phase2Answers-{}.txt'.format(subset, subclass)
    with open(get_support_data_filename(filename)) as file:
        answers = []
        for i, line in enumerate(file):
            if i == 0:
                continue
            pairs = tuple(line.split('\t'))
            answers.append(pairs)

        pairqnum2least = defaultdict(int)
        pairqnum2most = defaultdict(int)

        for question, answers in groupby(answers, key=lambda x: x[:4]):
            question_num = test_questions.index(question)
            for answer in answers:
                pairqnum2least[(question_num, answer[4])] += 1
                pairqnum2most[(question_num, answer[5])] += 1
        return pairqnum2least, pairqnum2most
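The two tallies are typically folded into a single MaxDiff-style prototypicality score per pair, mirroring the gold rankings read by read_turk_ranks_semeval2012 further down: the number of "most" votes minus the number of "least" votes. A small sketch of that combination (maxdiff_scores is a hypothetical helper, not part of the module):

def maxdiff_scores(pairqnum2least, pairqnum2most):
    # Score each (question number, pair) key by "most" votes minus "least" votes.
    keys = set(pairqnum2least) | set(pairqnum2most)
    return {
        key: pairqnum2most.get(key, 0) - pairqnum2least.get(key, 0)
        for key in keys
    }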
Example 6
def evaluate(frame, subset='val'):
    """
    Evaluate a DataFrame containing term vectors on its ability to predict term
    relatedness, according to MEN-3000, RW, MTurk-771, and WordSim-353. Use a
    VectorSpaceWrapper to fill missing vocabulary from ConceptNet.

    Return a Series containing these labeled results.
    """
    # Make subset names consistent with other datasets
    if subset == 'dev':
        subset = 'val'
    elif subset == 'all':
        # for the final evaluation, use just the test data
        subset = 'test'
    filename = get_support_data_filename(
        'story-cloze/cloze_test_spring2016_%s.tsv' % subset)
    vectors = VectorSpaceWrapper(frame=frame)
    total = 0
    correct = 0
    for sentences, answers in read_cloze(filename):
        text = ' '.join(sentences)
        right_answer, wrong_answer = answers
        probe_vec = vectors.text_to_vector('en', text)
        right_vec = vectors.text_to_vector('en', right_answer)
        wrong_vec = vectors.text_to_vector('en', wrong_answer)

        right_sim = cosine_similarity(probe_vec, right_vec)
        wrong_sim = cosine_similarity(probe_vec, wrong_vec)
        if right_sim > wrong_sim:
            correct += 1
        total += 1
        # print("%+4.2f %s / %s / %s" % (right_sim - wrong_sim, text, right_answer, wrong_answer))
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high],
                     index=['acc', 'low', 'high'])
Example 7
def combine_assertions(input_filename, core_filename, output_filename):
    """
    Take in a tab-separated, sorted "CSV" file, indicated by
    `input_filename`, whose lines should be grouped together into assertions.
    Output a msgpack stream of assertions to the file indicated by
    `output_filename`.

    The input file should be made from multiple sources of assertions by
    concatenating and sorting them.

    The combined assertions will all have the dataset of the first edge that
    produces them, and the strongest license among the licenses being combined.

    This process requires its input to be a sorted CSV so that all edges for
    the same assertion will appear consecutively.
    """

    def group_func(line):
        "Group lines by their URI (their first column)."
        return line.split('\t', 1)[0]

    out = MsgpackStreamWriter(output_filename)
    out_bad = MsgpackStreamWriter(output_filename + '.reject')

    core_prefixes = set()
    for line in open(core_filename, encoding='utf-8'):
        core_prefixes.add(uri_prefix(line.strip(), 3))

    # Scan through the assertions twice to add derived words to the blocklist
    blocklist = Blocklist.load(get_support_data_filename(BLOCK_FILENAME))
    for _ in range(2):
        with open(input_filename, encoding='utf-8') as stream:
            for line in stream:
                tmp_assertion = _make_assertion([line.strip()])
                if tmp_assertion is None:
                    continue
                blocklist.propagate_blocks(tmp_assertion)

    with open(input_filename, encoding='utf-8') as stream:
        for key, line_group in itertools.groupby(stream, group_func):
            assertion = _make_assertion(line_group)
            destination = out
            if assertion is None:
                continue
            if assertion['weight'] <= 0:
                destination = out_bad
            if blocklist.is_blocked(assertion):
                destination = out_bad
            if assertion['rel'] == 'ExternalURL':
                # discard ExternalURL edges for things that aren't otherwise
                # in ConceptNet
                prefix = uri_prefix(assertion['start'], 3)
                if prefix not in core_prefixes:
                    destination = out_bad
            destination.write(assertion)

    out.close()
    out_bad.close()
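Because the input file is sorted by assertion URI, itertools.groupby sees every edge belonging to one assertion consecutively. A tiny illustration of that grouping step (the lines are made up for the example and only hint at the real column layout):

import itertools

lines = [
    '/a/[/r/IsA/,/c/en/cat/,/c/en/animal/]\tedge from source A',
    '/a/[/r/IsA/,/c/en/cat/,/c/en/animal/]\tedge from source B',
    '/a/[/r/IsA/,/c/en/dog/,/c/en/animal/]\tedge from source C',
]
for uri, group in itertools.groupby(lines, key=lambda line: line.split('\t', 1)[0]):
    print(uri, '->', len(list(group)), 'edges')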
Example 8
def read_mc():
    """
    Parses the Miller and Charles word similarity test collection.
    """
    filename = get_support_data_filename('mc/EN-MC-30.txt')
    with open(filename) as file:
        for line in file:
            parts = line.split()
            yield parts[0], parts[1], float(parts[2])
Example 9
def read_ws353_multilingual(language):
    if language == 'es':
        language = 'es.fixed'
    filename = 'wordsim-353/{}.tab'.format(language)
    with open(get_support_data_filename(filename)) as file:
        for line in file:
            term1, term2, sscore = line.split('\t')
            gold_score = float(sscore)
            yield term1, term2, gold_score
Example 10
def read_rg65():
    """
    Parses the Rubenstein and Goodenough word similarity test collection.
    """
    filename = get_support_data_filename('rg65/EN-RG-65.txt')
    with open(filename) as file:
        for line in file:
            parts = line.split()
            yield parts[0], parts[1], float(parts[2])
Example 11
def read_rw(subset='dev'):
    """
    Parses the rare word similarity test collection.
    """
    filename = get_support_data_filename('rw/rw-{}.csv'.format(subset))
    with open(filename) as file:
        for line in file:
            parts = line.split()
            yield parts[0], parts[1], float(parts[2])
Example 13
def read_rw():
    """
    Parses the rare word similarity test collection.
    """
    filename = get_support_data_filename('rw/rw.txt')
    with open(filename) as file:
        for line in file:
            parts = line.split()
            yield parts[0], parts[1], float(parts[2])
Example 17
def read_simlex():
    lang1, lang2 = 'en', 'en'
    with open(get_support_data_filename('simlex/SimLex-999.txt')) as file:
        for line in file:
            if line.startswith("word1"):
                continue
            term1, term2, _, sscore, _, _, _, ascore, _, _ = line.split('\t')
            gold_score = float(sscore)
            yield term1, term2, gold_score, lang1, lang2
Example 19
def read_semeval_monolingual(lang, subset='test'):
    """
    Parses Semeval2017-Task2 monolingual word similarity (subtask 1) test collection.
    """
    lang1, lang2 = lang, lang
    filename = get_support_data_filename('semeval17-2/{}.{}.txt'.format(lang, subset))
    with open(filename) as file:
        for line in file:
            parts = line.split('\t')
            yield parts[0], parts[1], float(parts[2]), lang1, lang2
Example 20
def read_pku500():
    lang1, lang2 = 'zh', 'zh'
    filename = 'pku-500/pku-500.csv'
    with open(get_support_data_filename(filename)) as file:
        for line in file:
            if line.startswith('#'):
                continue
            term1, term2, sscore = line.split('\t')
            gold_score = float(sscore)
            yield term1, term2, gold_score, lang1, lang2
Example 21
def read_rw():
	"""
	Parses the rare word similarity test collection.
	"""
	G = Graph()
	filename = get_support_data_filename('rw/rw.txt')
	with open(filename) as file:
		for line in file:
			parts = line.split()
			G.add_edge(parts[0], parts[1], weight=(float(parts[2])/10))
Example 22
def read_mc():
	"""
	Parses the Miller and Charles word similarity test collection.
	"""
	G=Graph()
	filename = get_support_data_filename('mc/EN-MC-30.txt')
	with open(filename) as file:
		for line in file:
			parts = line.split()
			G.add_edge(parts[0], parts[1], weight=float(parts[2]))
Example 23
def read_gurevych(setname):
    # The 'setname' here is a number indicating the number of word pairs
    # in the set.
    filename = 'gurevych/wortpaare{}.gold.pos.txt'.format(setname)
    with open(get_support_data_filename(filename)) as file:
        for line in file:
            if line.startswith('#'):
                continue
            term1, term2, sscore, _pos1, _pos2 = line.rstrip().split(':')
            gold_score = float(sscore)
            yield term1, term2, gold_score
Example 25
def read_rg65():
	"""
	Parses the Rubenstein and Goodenough word similarity test collection.
	"""
	G = Graph()
	filename = get_support_data_filename('rg65/EN-RG-65.txt')
	with open(filename) as file:
		for line in file:
			parts = line.split()
			G.add_edge(parts[0], parts[1], weight=float(parts[2]))
	return G
Example 26
def read_symrel():
    """
    Parses the semantic analogy relations from Mikolov et al.
    """
    filename = get_support_data_filename('rel/questions-words.txt')
    with open(filename) as file:
        for line in file:
            if line.startswith(': gram'):
                break
            if line.startswith(':'):
                continue
            yield line.split()
Example 27
def read_test_questions_semeval2012(subset, subclass):
    """
    Read test questions for a specific subclass. A test question has the following format:
    pair1,pair2,pair3,pair4
    """
    filename = 'semeval12-2/{}/Phase2Questions-{}.txt'.format(subset, subclass)
    with open(get_support_data_filename(filename)) as file:
        test_questions = []
        for line in file:
            pairs = tuple(line.strip().split(','))
            test_questions.append(pairs)
        return test_questions
Example 29
def _setup():
    """
    Read the dictionary file, creating a mapping from words to their
    phonetics.

    When multiple pronunciations are given, keep the last one.
    """
    with open(get_support_data_filename('cmudict.0.7a')) as rhymelist:
        for line in rhymelist:
            if line.startswith(';;;'): continue
            word, phon = line.strip().split('  ')
            phon = phon.split(' ')
            PHONETIC_DICT[word] = phon
Example 31
def read_jsim():
    """
    Read the Japanese rare-words dataset from Tokyo Metropolitan University.
    """
    lang1, lang2 = 'ja', 'ja'
    for pos in ('noun', 'verb', 'adj', 'adv'):
        filename = get_support_data_filename('jSIM/similarity_full/score_{}_new_full.csv'.format(pos))
        with open(filename, encoding='utf-8') as file:
            for line in file:
                if line.startswith('word1'):
                    continue
                parts = line.split(',')
                yield parts[0].strip(), parts[1].strip(), float(parts[2]), lang1, lang2
Example 32
def read_train_pairs_semeval2012(subset, subclass):
    """
    Read a set of three training pairs for a given subclass. These pairs are
    used as prototypical examples of a given relation to which test pairs are compared.
    """
    filename = 'semeval12-2/{}/Phase1Questions-{}.txt'.format(subset, subclass)
    with open(get_support_data_filename(filename)) as file:
        train_pairs = []
        for i, line in enumerate(file):
            if i in [4, 5, 6]:
                pair = line.strip().split(':')
                pair = tuple(pair)
                train_pairs.append(pair)
    return train_pairs
Example 33
def read_bats(category):
    """
    Read BATS dataset pairs for a specific category. Turn them into questions.

    For some questions, BATS contains multiple answers. For example, the answer to an
    analogy question Nicaragua:Spanish::Switzerland:? could be German, French, or Italian. These
    will all be supplied as a list if they are an answer (b2). However, if they are a part of a
    question (b1), only the first one will be used.
    """
    filename = 'bats/{}.txt'.format(category)
    pairs = []
    with open(get_support_data_filename(filename)) as file:
        for line in file:
            if '\t' in line:
                left, right = line.lower().split('\t')
            else:
                left, right = line.lower().split()
            right = right.strip()
            if '/' in right:
                right = [i.strip() for i in right.split('/')]
            else:
                right = [i.strip() for i in right.split(',')]
            pairs.append([left, right])

    quads = []
    for i in range(len(pairs)):
        first_pair = pairs[i]
        # select only one term for b1, even if more may be available
        first_pair[1] = first_pair[1][0]
        second_pairs = [pair for j, pair in enumerate(pairs) if j != i]
        for second_pair in second_pairs:
            quad = []

            # the first three elements of a quad are the two terms in first_pair and the first
            # term of the second_pair
            quad.extend([
                standardized_uri('en', term)
                for term in first_pair + second_pair[:1]
            ])

            # if the second element of the second pair (b2) is a list, it means there are multiple
            # correct answers for b2. We want to keep all of them.
            if isinstance(second_pair[1], list):
                quad.append(
                    [standardized_uri('en', term) for term in second_pair[1]])
            else:
                quad.append(standardized_uri('en', second_pair[1]))
            quads.append(quad)
    return quads
Example 34
def read_men3000():
    """
    Parses the MEN test collection. MEN is a collection of 3000 English word
    pairs, each with a relatedness rating between 0 and 50. The relatedness of
    a pair of words was determined by the number of times the pair was selected
    as more related compared to another randomly chosen pair.
    """
    filename = get_support_data_filename('mensim/MEN_dataset_lemma_form.dev')
    with open(filename) as file:
        for line in file:
            parts = line.rstrip().split()
            term1 = parts[0].split('-')[0]  # remove part of speech
            term2 = parts[1].split('-')[0]  # as above
            gold_score = float(parts[2])
            yield term1, term2, gold_score
Example 35
def read_ws353():
    """
    Parses the word-similarity 353 test collection (ws353). ws353 is a
    collection of 353 English word pairs, each with a relatedness rating from
    0 (totally unrelated) to 10 (very related or identical). The relatedness
    of a pair of words was determined by averaging the scores of either 13
    or 16 native English speakers.
    """
    with open(get_support_data_filename('wordsim-353/combined.csv')) as file:
        for line in file:
            if line.startswith('Word 1'): # Skip the header
                continue
            term1, term2, sscore = line.split(',')
            gold_score = float(sscore)
            yield term1, term2, gold_score
Example 36
def read_men3000():
    """
    Parses the MEN test collection. MEN is a collection of 3000 English word
    pairs, each with a relatedness rating between 0 and 50. The relatedness of
    a pair of words was determined by the number of times the pair was selected
    as more related compared to another randomly chosen pair.
    """
    filename = get_support_data_filename('mensim/MEN_dataset_lemma_form.dev')
    with open(filename) as file:
        for line in file:
            parts = line.rstrip().split()
            term1 = parts[0].split('-')[0] # remove part of speech
            term2 = parts[1].split('-')[0] # as above
            gold_score = float(parts[2])
            yield term1, term2, gold_score
Example 37
def read_ws353():
    """
    Parses the word-similarity 353 test collection (ws353). ws353 is a
    collection of 353 English word pairs, each with a relatedness rating from
    0 (totally unrelated) to 10 (very related or identical). The relatedness
    of a pair of words was determined by averaging the scores of either 13
    or 16 native English speakers.
    """
    with open(get_support_data_filename('wordsim-353/combined.csv')) as file:
        for line in file:
            if line.startswith('Word 1'):  # Skip the header
                continue
            term1, term2, sscore = line.split(',')
            gold_score = float(sscore)
            yield term1, term2, gold_score
Example 38
def read_jsim():
    """
    Read the updated Japanese rare-words dataset from Karpinska et al.
    (http://www.aclweb.org/anthology/W18-2905)
    """
    lang1, lang2 = 'ja', 'ja'
    for pos in ('noun', 'verb', 'adj', 'adv'):
        filename = get_support_data_filename(
            'jSIM/similarity_full/score_{}_new_full.csv'.format(pos))
        with open(filename, encoding='utf-8') as file:
            for line in file:
                if line.startswith('word1'):
                    continue
                parts = line.split(',')
                yield parts[0].strip(), parts[1].strip(), float(
                    parts[2]), lang1, lang2
Example 39
def eval_google_analogies(vectors, subset='semantic', vocab_size=200000, verbose=False):
    """
    Evaluate the Google Research analogies, released by Mikolov et al. along
    with word2vec.

    These analogies come in two flavors: semantic and syntactic. Numberbatch
    is intended to be a semantic space, so we focus on semantic analogies.

    The syntactic analogies are about whether you can inflect or conjugate a
    particular word. The semantic analogies are about whether you can sort
    words by their gender, and about geographic trivia.

    I (Rob) think this data set is not very representative, but evaluating
    against it is all the rage.
    """
    filename = get_support_data_filename('google-analogies/{}-words.txt'.format(subset))
    quads = read_google_analogies(filename)
    return eval_open_vocab_analogies(vectors, quads, vocab_size, verbose)
Example 40
def read_turk_ranks_semeval2012(subset, subclass):
    """
    Read gold rankings of prototypicality, as computed from turkers' answers to
    MaxDiff questions.

    A score is defined as the difference between the number of times the turkers judged
    a pair the most prototypical and the number of times they judged it as the least
    prototypical.
    """
    filename = 'semeval12-2/{}/GoldRatings-{}.txt'.format(subset, subclass)
    with open(get_support_data_filename(filename)) as file:
        gold_ranks = []
        for line in file:
            if line.startswith('#'):
                continue
            gold_score, pair = line.split()
            gold_score = float(gold_score)
            gold_ranks.append((pair, gold_score))
        return sorted(gold_ranks)
Example 41
'French'
>>> CODE_TO_NAME['en']['fra']
'French'
>>> NAME_TO_CODE['en']['French']
'fr'
>>> NAME_TO_CODE['en']['Mandarin']
'cmn'
>>> NAME_TO_CODE['de']['Dutch']
'ndl'
"""

from conceptnet5.util import get_support_data_filename
import codecs
import re

ISO_DATA_FILENAME = get_support_data_filename('iso639-enfrde.txt')

CODE_TO_NAME = {'en': {}, 'de': {}, 'fr': {}}
NAME_TO_CODE = {'en': {}, 'de': {}, 'fr': {}}

# The SUPPORTED_LANGUAGE_CODES are the ones that should appear in the
# browsable Web interface.
#
# This might be too many.
SUPPORTED_LANGUAGE_CODES = [
    'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az',
    'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca', 'ce',
    'ch', 'co', 'cr', 'crh', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'dv', 'dz',
    'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj', 'fo',
    'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi', 'ho',
    'hr', 'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik', 'io',
Example 42
# coding: utf-8
from __future__ import unicode_literals
from conceptnet5.util import get_support_data_filename
from conceptnet5.builders.index_assertions import index_assertions
from nose.tools import eq_
from conceptnet5.api import app, configure_api
import os
import json


TESTDATA_DIR = get_support_data_filename("testdata")
ASSERTIONS_DIR = os.path.join(TESTDATA_DIR, 'input/assertions')
DB_PATH = os.path.join(TESTDATA_DIR, 'output/assertions.db')
ASSOC_DIR = os.path.join(TESTDATA_DIR, 'input/assoc_space')
SPANISH_EXAMPLE = '/a/[/r/RelatedTo/,/c/es/verbigracia/n/,/c/en/example/]'
CLIENT = None


def setup():
    global CLIENT
    index_assertions(ASSERTIONS_DIR, DB_PATH, input_shards=1, output_shards=1)
    configure_api(DB_PATH, ASSERTIONS_DIR, ASSOC_DIR, nshards=1)
    CLIENT = app.test_client()


def teardown():
    os.unlink(DB_PATH + '.0')


def uris(response):
    assertions = response['edges']
Example 43
def setUp():
    global context
    context_filename = get_support_data_filename('ld/context.ld.json')
    context = json.load(open(context_filename))
Example 44
import json

from pyld import jsonld

from conceptnet5.api import lookup_grouped_by_feature, lookup_paginated
from conceptnet5.util import get_support_data_filename
from conceptnet5.tests.conftest import run_build

CONTEXT = json.load(open(get_support_data_filename('ld/context.ld.json')))


def flat_map(response):
    """
    Transform a response using JSON-LD's "flatten" operation, and return a
    dictionary mapping resources (as fully-qualified URLs) to their values
    (also containing fully-qualified URLs).
    """
    # The URL in '@context' may not be available yet, because we probably
    # haven't deployed. So replace the response's "@context" with the
    # contents of that file.
    response['@context'] = CONTEXT['@context']

    # jsonld.flatten gives us a list of objects, which all have @id values
    # (unless they're awkward "blank nodes", like definitions of features).
    # The @id values are unique after flattening, so we can make a dictionary
    # keyed by them.
    result = {}
    flat_objects = jsonld.flatten(response)
    for obj in flat_objects:
        if '@id' in obj:
            result[obj['@id']] = obj
Example 45
def read_mturk():
    with open(get_support_data_filename('mturk/MTURK-771.csv')) as file:
        for line in file:
            term1, term2, sscore = line.split(',')
            gold_score = float(sscore)
            yield term1, term2, gold_score
Example 46
def get_blacklist():
    filename = get_support_data_filename('blacklist.txt')
    return set(open(filename).readlines())
Example 47
'French'
>>> CODE_TO_NAME['en']['fra']
'French'
>>> NAME_TO_CODE['en']['French']
'fr'
>>> NAME_TO_CODE['en']['Mandarin']
'cmn'
>>> NAME_TO_CODE['de']['Dutch']
'ndl'
"""

from conceptnet5.util import get_support_data_filename
import codecs
import re

ISO_DATA_FILENAME = get_support_data_filename('iso639-enfrde.txt')

CODE_TO_NAME = {'en': {}, 'de': {}, 'fr': {}}
NAME_TO_CODE = {'en': {}, 'de': {}, 'fr': {}}

# The SUPPORTED_LANGUAGE_CODES are the ones that should appear in the
# browsable Web interface.
#
# This might be too many.
SUPPORTED_LANGUAGE_CODES = [
    'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay',
    'az', 'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca',
    'ce', 'ch', 'co', 'cr', 'crh', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'dv',
    'dz', 'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj',
    'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi',
    'ho', 'hr', 'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik',
Example 48
# coding: utf-8
from __future__ import unicode_literals
from conceptnet5.wiktparse.rules import EnWiktionarySemantics
from conceptnet5.util import get_support_data_filename
from nose.tools import eq_
import os

TESTDATA_DIR = get_support_data_filename("testdata")


def data_path(filename):
    return os.path.join(TESTDATA_DIR, filename)


ENTRY = {
    'site':
    'en.wiktionary.org',
    'sections': [{
        'sections': [],
        'text': '*[[odečítat]]',
        'heading': 'Alternative forms'
    }, {
        'sections': [{
            'sections': [],
            'text': '{{cs-conj-at|odčít}}',
            'heading': 'Conjugation'
        }, {
            'sections': [],
            'text': '* [[sčítat]]',
            'heading': 'Antonyms'
        }, {
Example 49
from __future__ import unicode_literals
import codecs
import json
from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter
from conceptnet5.nodes import standardized_concept_uri
from conceptnet5.edges import make_edge
from conceptnet5.util import get_support_data_filename
from conceptnet5.uri import Licenses

FRAME_DATA = json.load(
    codecs.open(get_support_data_filename('zh_frames.json'), encoding='utf-8'))


def handle_raw_assertion(line):
    parts = line.split(', ')
    user, frame_id, concept1, concept2 = parts
    fdata = FRAME_DATA[frame_id]
    ftext = fdata['text']
    rel = fdata['relation']

    surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace(
        '{2}', '[[' + concept2 + ']]')
    # We mark surface texts with * if {2} comes before {1}.
    if ftext.find('{2}') < ftext.find('{1}'):
        surfaceText = '*' + surfaceText

    start = standardized_concept_uri('zh_TW', concept1)
    end = standardized_concept_uri('zh_TW', concept2)
    source = {
        'contributor': '/s/contributor/petgame/' + user,
        'activity': '/s/activity/ptt/petgame'
Example 50
from __future__ import unicode_literals
import codecs
import json
from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter
from conceptnet5.nodes import standardized_concept_uri
from conceptnet5.edges import make_edge
from conceptnet5.util import get_support_data_filename
from conceptnet5.uri import Licenses


FRAME_DATA = json.load(
    codecs.open(get_support_data_filename('zh_frames.json'), encoding='utf-8')
)


def handle_raw_assertion(line):
    parts = line.split(', ')
    user, frame_id, concept1, concept2 = parts
    fdata = FRAME_DATA[frame_id]
    ftext = fdata['text']
    rel = fdata['relation']

    surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace('{2}', '[[' + concept2 + ']]')
    # We mark surface texts with * if {2} comes before {1}.
    if ftext.find('{2}') < ftext.find('{1}'):
        surfaceText = '*' + surfaceText

    start = standardized_concept_uri('zh_TW', concept1)
    end = standardized_concept_uri('zh_TW', concept2)
    source = {
        'contributor': '/s/contributor/petgame/' + user,