Example #1
def concept_assoc(terms):
    """Turn a comma-separated string of English terms into (URI, weight) pairs."""
    terms = terms.split(',')
    result = []
    for term in terms:
        uri = concept_uri('en', term)
        result.append((uri, 1.0))
    return result
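Both this helper and the concepts helper in Example #2 assume a concept_uri builder is already in scope, and both preserve whitespace around the commas, so callers should pass 'cat,dog' rather than 'cat, dog'. A minimal sketch of how they can be exercised, using a hypothetical stand-in for concept_uri modeled on the '/c/<lang>/<text>' URIs in the doctests further down:

# Hypothetical stand-in for concept_uri; the real builder lives elsewhere
# (the '/c/<lang>/<text>' shape is taken from the doctests below).
def concept_uri(lang, *pieces):
    return '/' + '/'.join(['c', lang] + list(pieces))

print(concept_assoc('cat,dog'))
# [('/c/en/cat', 1.0), ('/c/en/dog', 1.0)]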
Example #2
def concepts(terms):
    """Turn a comma-separated string of English terms into a list of concept URIs."""
    terms = terms.split(',')
    result = []
    for term in terms:
        uri = concept_uri('en', term)
        result.append(uri)
    return result
Example #3
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    stemming the text if necessary, normalizing it, and joining it into a
    concept URI.

    Items in 'more' will not be stemmed, but will go through the other
    normalization steps.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    """
    # Note: the English check runs before lowercasing and alias resolution,
    # so a caller passing 'EN' (or an alias that resolves to 'en') bypasses
    # english_filter. Example #4 below fixes the ordering.
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    norm_text = standardize_text(text, token_filter)
    more_text = [standardize_text(item, token_filter) for item in more
                 if item is not None]
    return concept_uri(lang, norm_text, *more_text)
Example #4
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    stemming the text if necessary, normalizing it, and joining it into a
    concept URI.

    Items in 'more' will not be stemmed, but will go through the other
    normalization steps.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    """
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None
    norm_text = standardize_text(text, token_filter)
    more_text = [standardize_text(item, token_filter) for item in more
                 if item is not None]
    return concept_uri(lang, norm_text, *more_text)
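The only difference between Examples #3 and #4 is whether the language code is lowercased before or after the token filter is chosen. A small self-contained sketch, with stand-in definitions for english_filter and LCODE_ALIASES (assumptions, not the real conceptnet5 objects), shows why the order matters:

LCODE_ALIASES = {}                        # stand-in: empty alias table
def english_filter(tokens):               # stand-in stopword filter
    return [t for t in tokens if t not in ('a', 'an', 'the')]

def filter_then_lower(lang):              # Example #3's order
    token_filter = english_filter if lang == 'en' else None
    return lang.lower(), token_filter

def lower_then_filter(lang):              # Example #4's order
    lang = lang.lower()
    token_filter = english_filter if lang == 'en' else None
    return lang, token_filter

print(filter_then_lower('EN')[1])         # None -- the stopword filter is lost
print(lower_then_filter('EN')[1])         # <function english_filter ...>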
Example #5
def concepts(terms):
    terms = terms.split(",")
    result = []
    for term in terms:
        uri = concept_uri("en", term)
        result.append(uri)
    return result
Example #6
def make_assoc(terms):
    terms = terms.split(",")
    result = []
    for term in terms:
        uri = concept_uri("en", term)
        result.append((uri, 1.0))
    return result
Example #7
def make_term():
    """
    Make and return a string representing a (synthetic) term in a randomly-
    chosen language.
    """
    global _term_count
    language = random_gen.choice(LANGUAGES, p=LANGUAGE_PROBA)
    term_text = 'term{}'.format(_term_count)
    term = concept_uri(language, term_text)
    _term_count += 1
    return term
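make_term depends on module-level state the snippet doesn't show. One setup under which it runs, reading random_gen as a NumPy generator (the choice(..., p=...) call matches numpy.random.Generator.choice); the language pool, probabilities, and concept_uri stand-in are all assumptions:

import numpy as np

LANGUAGES = ['en', 'de', 'fr']            # assumed language pool
LANGUAGE_PROBA = [0.5, 0.3, 0.2]          # assumed weights; must sum to 1
random_gen = np.random.default_rng(42)    # seeded for reproducibility
_term_count = 0

def concept_uri(lang, text):              # stand-in, not the real builder
    return '/c/{}/{}'.format(lang, text)

With this in place, successive make_term() calls yield URIs like '/c/en/term0' and '/c/fr/term1', with languages drawn according to LANGUAGE_PROBA.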
Example #8
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    removing English stopwords, normalizing the text in a way appropriate
    to that language (using the text normalization from wordfreq), and joining
    its tokens with underscores in a concept URI.

    This text normalization can smooth over some writing differences: for
    example, it removes vowel points from Arabic words, and it transliterates
    Serbian written in the Cyrillic alphabet to the Latin alphabet so that it
    can match other words written in Latin letters.

    'more' contains information to distinguish word senses, such as a part
    of speech or a WordNet domain. The items in 'more' get lowercased and
    joined with underscores, but skip many of the other steps -- for example,
    they won't have stopwords removed.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    >>> standardized_concept_uri('sh', 'симетрија')
    '/c/sh/simetrija'
    """
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None

    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    if token_filter is not None:
        tokens = token_filter(tokens)
    norm_text = '_'.join(tokens)
    more_text = []
    for item in more:
        if item is not None:
            tokens = simple_tokenize(item.replace('_', ' '))
            if token_filter is not None:
                tokens = token_filter(tokens)
            more_text.append('_'.join(tokens))

    return concept_uri(lang, norm_text, *more_text)
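The helpers this function imports can be approximated with self-contained stand-ins. The definitions below are assumptions (crude simplifications, not the real wordfreq or conceptnet5 implementations), but they are enough to make the English doctests above pass; the Serbian transliteration case requires the real per-language preprocess_text:

import re

LCODE_ALIASES = {}                        # assumed empty alias table

def preprocess_text(text, lang):          # real version normalizes per language
    return text.lower()

def simple_tokenize(text):                # crude word tokenizer
    return re.findall(r'\w+', text.lower())

def english_filter(tokens):               # tiny stand-in stopword filter
    filtered = [t for t in tokens if t not in ('a', 'an', 'the')]
    return filtered or tokens             # never drop every token

def concept_uri(lang, *pieces):
    return '/' + '/'.join(['c', lang] + list(pieces))

Under these stand-ins, standardized_concept_uri('en', 'this is a test', 'n', 'example phrase') returns '/c/en/this_is_test/n/example_phrase', matching the doctest.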
Example #9
def normalized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    stemming the text if necessary, normalizing it, and joining it into a
    concept URI.

    Items in 'more' will not be stemmed, but will go through the other
    normalization steps.

    >>> normalized_concept_uri('en', 'this is a test')
    '/c/en/this_be_test'
    >>> normalized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_be_test/n/example_phrase'
    """
    norm_text = normalized_concept_name(lang, text)
    # Note: unlike the other variants, None items in 'more' are not skipped,
    # so passing None here would raise inside normalize_text.
    more_text = [normalize_text(item) for item in more]
    return concept_uri(lang, norm_text, *more_text)
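Unlike the standardized_concept_uri variants above, this older function lemmatizes: per its doctests, 'is' becomes 'be'. Hedged stand-ins under which those doctests pass (the lemma table and helper bodies are assumptions):

LEMMAS = {'is': 'be', 'are': 'be'}        # tiny assumed lemma table

def normalized_concept_name(lang, text):  # stand-in: lemmatize, drop stopwords
    tokens = [LEMMAS.get(t, t) for t in text.lower().split()]
    return '_'.join(t for t in tokens if t not in ('a', 'an', 'the'))

def normalize_text(text):                 # stand-in normalizer for 'more' items
    return '_'.join(text.lower().split())

def concept_uri(lang, *pieces):
    return '/' + '/'.join(['c', lang] + list(pieces))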
Example #10
def get_concept_uri(text, *more):
    norm_text = concept_name(text)
    more_text = [english_standardize_text(item)
                 for item in more if item is not None]
    return concept_uri('en', norm_text, *more_text)