Code example #1
File: opencyc.py  Project: CoSeCant-csc/conceptnet5
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for subj, pred, obj, _graph in parse_nquads(
            open(input_file, encoding='utf-8')):
        if pred['url'] == RDF_LABEL:
            labels[subj['url']] = obj['text']
            unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    for subj, pred, obj, _graph in parse_nquads(
            open(input_file, encoding='utf-8')):
        rel_name = resource_name(pred['url'])
        web_subj = subj.get('url')
        web_obj = obj.get('url')
        if rel_name == 'subClassOf' and web_obj is not None and web_subj in labels and web_obj in labels:
            subj_label = labels[web_subj]
            obj_label = labels[web_obj]
            if '_' in subj_label or '_' in obj_label:
                continue
            if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                continue
            subj_words = set(simple_tokenize(subj_label))
            obj_words = set(simple_tokenize(obj_label))
            if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                continue
            if len(subj_words) > 4 or len(obj_words) > 4:
                continue

            subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
            obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
            out.write(
                opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label,
                             obj_label))
            if (subj_uri, web_subj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_subj))
                seen_external_urls.add((subj_uri, web_subj))
            if (obj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(obj_uri, web_obj))
                seen_external_urls.add((obj_uri, web_obj))
        elif rel_name == 'sameAs' and web_subj in labels and web_obj.startswith(
                'http://umbel.org/'):
            subj_label = labels[web_subj]
            subj_uri = standardized_concept_uri('en', subj_label)
            if (subj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_obj))
                seen_external_urls.add((subj_uri, web_obj))

    out.close()
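
A hypothetical invocation of the reader above, assuming the conceptnet5 package and the helpers it imports (parse_nquads, MsgpackStreamWriter, and so on) are available; the file names are placeholders.

input_path = 'opencyc-latest.nq'        # placeholder path to an OpenCyc .nq dump
output_path = 'opencyc-edges.msgpack'   # placeholder output stream of ConceptNet edges
run_opencyc(input_path, output_path)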
Code example #2
def cyc_to_conceptnet_uri(labels, unlabels, uri):
    """
    Convert a Cyc URI to a ConceptNet URI, with the following rules:

    - Use the RDF label as the text. (Alternate labels appear to provide
      synonyms, but these are generally automatically generated and aren't
      particularly accurate.)
    - The part of speech is always 'n'. Cyc describes its concepts in a
      noun-like way. At worst, they're gerunds -- instead of "to eat", Cyc
      would define an event of "Eating".
    - If two different Cyc URIs have the same text, we will attempt to
      disambiguate them using the last component of the Cyc URI.
    - Remove the camel-casing from the Cyc URI component. If the phrase we
      get is the same as the natural-language label, disregard it as an
      uninformative disambiguation. Otherwise, that is the disambiguation text.

    A possible objection: Our disambiguation doesn't distinguish Cyc URIs that
    differ in capitalization, or differ by using underscores instead of
    camel-case. However, I've noticed that such URIs are usually
    *unintentional* duplicates that are okay to merge. If they were really
    unrelated concepts that needed to be distinguished, someone would have
    given them different names.

    Even so, we end up with some unnecessary word senses, such as different
    senses for "mens clothing", "men's clothing", and "men s clothing".
    """
    label = filter_stopwords(labels[uri])
    if len(unlabels[label]) >= 2:
        disambig = filter_stopwords(un_camel_case(resource_name(uri)))
        if simple_tokenize(disambig) != simple_tokenize(label):
            return standardized_concept_uri('en', label, 'n', 'opencyc', disambig)
    return standardized_concept_uri('en', label, 'n')
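
A sketch of how the disambiguation rule above kicks in, using made-up Cyc-style URIs and labels; it assumes cyc_to_conceptnet_uri and the helpers it calls (filter_stopwords, un_camel_case, resource_name, standardized_concept_uri) are importable from the conceptnet5 OpenCyc reader.

from collections import defaultdict

# Two hypothetical URIs that share the preferred label 'dog'
labels = {
    'http://example.org/cyc/Dog': 'dog',
    'http://example.org/cyc/DogTheAnimal': 'dog',
}
unlabels = defaultdict(set)
for uri, label in labels.items():
    unlabels[label].add(uri)

# The first URI's last component un-camel-cases back to 'dog', which matches
# the label and is discarded as uninformative; the second yields extra text
# ('dog the animal', minus stopwords) that becomes the disambiguation.
print(cyc_to_conceptnet_uri(labels, unlabels, 'http://example.org/cyc/Dog'))
print(cyc_to_conceptnet_uri(labels, unlabels, 'http://example.org/cyc/DogTheAnimal'))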
Code example #3
File: word_counts.py  Project: xeroskiller/wordfreq
def read_freqs(filename, cutoff=0, lang=None):
    """
    Read words and their frequencies from a CSV file.

    Only words with a frequency greater than or equal to `cutoff` are returned.

    If `cutoff` is greater than 0, the csv file must be sorted by frequency
    in descending order.

    If lang is given, read_freqs will apply language specific preprocessing
    operations.
    """
    raw_counts = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            if val < cutoff:
                break
            tokens = tokenize(
                key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                raw_counts[fix_text(token)] += val
                total += val

    for word in raw_counts:
        raw_counts[word] /= total

    return raw_counts
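
A minimal usage sketch for read_freqs above, assuming it is defined in the current module along with its imports (csv, fix_text, tokenize, simple_tokenize). The CSV contents are made up: one word,value pair per line, sorted by value in descending order.

import csv
import tempfile

rows = [('the', 0.049), ('of', 0.027), ('cat', 0.0004)]
with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False,
                                 encoding='utf-8', newline='') as f:
    csv.writer(f).writerows(rows)
    path = f.name

# With cutoff=0.001, 'cat' falls below the threshold and reading stops there;
# the surviving values are normalized so the returned frequencies sum to 1.
freqs = read_freqs(path, cutoff=0.001, lang='en')
print(freqs)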
Code example #4
def read_values(filename, cutoff=0, max_words=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.

    Only words with a value greater than or equal to `cutoff` are returned.
    In addition, only up to `max_words` words are read.

    If `cutoff` is greater than 0 or `max_words` is smaller than the list,
    the csv file must be sorted by value in descending order, so that the
    most frequent words are kept.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_words:
                break
            tokens = tokenize(
                key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
Code example #5
File: word_counts.py  Project: KadriUmay/wordfreq
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.

    Only words with a value greater than or equal to `cutoff` are returned.

    If `cutoff` is greater than 0, the csv file must be sorted by value
    in descending order.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_size:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
Code example #6
def preprocess_and_tokenize_text(lang, text):
    """
    Get a string made from the tokens in the text, joined by
    underscores.
    >>> preprocess_and_tokenize_text('en', ' cat')
    'cat'
    >>> preprocess_and_tokenize_text('en', 'Italian supercat')
    'italian_supercat'
    >>> preprocess_and_tokenize_text('en', 'a big dog')
    'a_big_dog'
    >>> preprocess_and_tokenize_text('en', 'Test?!')
    'test'
    >>> preprocess_and_tokenize_text('en', 'TEST.')
    'test'
    >>> preprocess_and_tokenize_text('en', 'test/test')
    'test_test'
    >>> preprocess_and_tokenize_text('de', '   u\N{COMBINING DIAERESIS}ber\\n')
    'über'
    >>> preprocess_and_tokenize_text('en', 'embedded' + chr(9) + 'tab')
    'embedded_tab'
    >>> preprocess_and_tokenize_text('en', '_')
    ''
    >>> preprocess_and_tokenize_text('en', ',')
    ''
    """
    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    return '_'.join(tokens)
Code example #7
def filter_stopwords(text):
    words = [
        word for word in simple_tokenize(text) if word not in MORE_STOPWORDS
    ]
    text2 = ' '.join(words)
    if not text2:
        text2 = text
    return text2
Code example #8
File: query.py  Project: zababurinsv/conceptnet5
def text_to_vector(self, language, text):
    """
    Used in Story Cloze Test to create a vector for text.
    """
    tokens = wordfreq.simple_tokenize(text)
    weighted_terms = [(uri_prefix(standardized_uri(language, token)), 1.)
                      for token in tokens]
    return self.get_vector(weighted_terms, oov_vector=False)
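
A reduced sketch of the term-weighting step in text_to_vector: only the wordfreq.simple_tokenize call is real, and the ConceptNet URI helpers are replaced with a placeholder so the shape of weighted_terms is visible on its own.

import wordfreq

def placeholder_uri(language, token):
    # stand-in for uri_prefix(standardized_uri(language, token))
    return '/c/{}/{}'.format(language, token)

text = 'The restaurant was out of pizza'
weighted_terms = [(placeholder_uri('en', token), 1.)
                  for token in wordfreq.simple_tokenize(text)]
print(weighted_terms)   # every token gets a uniform weight of 1.0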
Code example #9
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    removing English stopwords, normalizing the text in a way appropriate
    to that language (using the text normalization from wordfreq), and joining
    its tokens with underscores in a concept URI.

    This text normalization can smooth over some writing differences: for
    example, it removes vowel points from Arabic words, and it transliterates
    Serbian written in the Cyrillic alphabet to the Latin alphabet so that it
    can match other words written in Latin letters.

    'more' contains information to distinguish word senses, such as a part
    of speech or a WordNet domain. The items in 'more' get lowercased and
    joined with underscores, but skip many of the other steps -- for example,
    they won't have stopwords removed.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    >>> standardized_concept_uri('sh', 'симетрија')
    '/c/sh/simetrija'
    """
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None

    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    if token_filter is not None:
        tokens = token_filter(tokens)
    norm_text = '_'.join(tokens)
    more_text = []
    for item in more:
        if item is not None:
            tokens = simple_tokenize(item.replace('_', ' '))
            if token_filter is not None:
                tokens = token_filter(tokens)
            more_text.append('_'.join(tokens))

    return concept_uri(lang, norm_text, *more_text)
Code example #10
File: nodes.py  Project: commonsense/conceptnet5
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    removing English stopwords, normalizing the text in a way appropriate
    to that language (using the text normalization from wordfreq), and joining
    its tokens with underscores in a concept URI.

    This text normalization can smooth over some writing differences: for
    example, it removes vowel points from Arabic words, and it transliterates
    Serbian written in the Cyrillic alphabet to the Latin alphabet so that it
    can match other words written in Latin letters.

    'more' contains information to distinguish word senses, such as a part
    of speech or a WordNet domain. The items in 'more' get lowercased and
    joined with underscores, but skip many of the other steps -- for example,
    they won't have stopwords removed.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    >>> standardized_concept_uri('sh', 'симетрија')
    '/c/sh/simetrija'
    """
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None

    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    if token_filter is not None:
        tokens = token_filter(tokens)
    norm_text = '_'.join(tokens)
    more_text = []
    for item in more:
        if item is not None:
            tokens = simple_tokenize(item.replace('_', ' '))
            if token_filter is not None:
                tokens = token_filter(tokens)
            more_text.append('_'.join(tokens))

    return concept_uri(lang, norm_text, *more_text)
Code example #11
def standardize_username(username):
    """
    Convert usernames into a canonical form that can be used in URIs.

    If the username is an e-mail address, just keep the part before the @ sign.
    """
    name = username.strip('@').split('@')[0]
    name = '_'.join(simple_tokenize(name.replace('_', ' ')))
    return name
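
A short usage sketch for standardize_username above (assuming it and simple_tokenize are importable); the usernames are made up.

print(standardize_username('dev_user@example.com'))  # expected: 'dev_user'
print(standardize_username('@SomeContributor'))      # expected: 'somecontributor'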
Code example #12
def standardize_username(username):
    """
    Convert usernames into a canonical form that can be used in URIs.

    If the username is an e-mail address, just keep the part before the @ sign.
    """
    name = username.strip('@').split('@')[0]
    name = '_'.join(simple_tokenize(name.replace('_', ' ')))
    return name
Code example #13
File: test_japanese.py  Project: yaskapp/wordfreq
def test_simple_tokenize():
    # When Japanese is run through simple_tokenize -- either because it's
    # tagged with the wrong language, or because we want to pass through
    # Japanese text without getting MeCab involved -- it will be split at
    # boundaries between Japanese and non-Japanese scripts, but all Japanese
    # scripts will be stuck together. Here the switch between hiragana
    # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
    # between katakana and romaji is.
    #
    # We used to try to infer word boundaries between hiragana and katakana,
    # but this leads to edge cases that are unsolvable without a dictionary.
    ja_text = 'ひらがなカタカナromaji'
    assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']

    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
    # but sticks together in simple_tokenize
    assert simple_tokenize('おはようございます') == ['おはようございます']

    # Names that use the weird possessive marker ヶ, which is technically a
    # katakana even though it's being used like a kanji, stay together as one
    # token
    assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]

    # The word in ConceptNet that made me notice that simple_tokenize used
    # to have a problem with the character 々
    assert simple_tokenize("晴々しい") == ["晴々しい"]

    # Explicit word separators are still token boundaries, such as the dot
    # between "toner" and "cartridge" in "toner cartridge"
    assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]

    # This word has multiple weird characters that aren't quite kanji in it,
    # and is in the dictionary
    assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
Code example #14
def test_simple_tokenize():
    # When Japanese is run through simple_tokenize -- either because it's
    # tagged with the wrong language, or because we want to pass through
    # Japanese text without getting MeCab involved -- it will be split at
    # boundaries between Japanese and non-Japanese scripts, but all Japanese
    # scripts will be stuck together. Here the switch between hiragana
    # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
    # between katakana and romaji is.
    #
    # We used to try to infer word boundaries between hiragana and katakana,
    # but this leads to edge cases that are unsolvable without a dictionary.
    ja_text = 'ひらがなカタカナromaji'
    assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']

    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
    # but sticks together in simple_tokenize
    assert simple_tokenize('おはようございます') == ['おはようございます']

    # Names that use the weird possessive marker ヶ, which is technically a
    # katakana even though it's being used like a kanji, stay together as one
    # token
    assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]

    # The word in ConceptNet that made me notice that simple_tokenize used
    # to have a problem with the character 々
    assert simple_tokenize("晴々しい") == ["晴々しい"]

    # Explicit word separators are still token boundaries, such as the dot
    # between "toner" and "cartridge" in "toner cartridge"
    assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]

    # This word has multiple weird characters that aren't quite kanji in it,
    # and is in the dictionary
    assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
Code example #15
File: nodes.py  Project: terU3760/conceptnet5
def standardize_text(text, token_filter=None):
    """
    Get a string made from the tokens in the text, joined by
    underscores. The tokens may have a language-specific `token_filter`
    applied to them. See `standardize_as_list()`.

    >>> standardize_text(' cat')
    'cat'

    >>> standardize_text('a big dog', token_filter=english_filter)
    'big_dog'

    >>> standardize_text('Italian supercat')
    'italian_supercat'

    >>> standardize_text('a big dog')
    'a_big_dog'

    >>> standardize_text('a big dog', token_filter=english_filter)
    'big_dog'

    >>> standardize_text('to go', token_filter=english_filter)
    'go'

    >>> standardize_text('Test?!')
    'test'

    >>> standardize_text('TEST.')
    'test'

    >>> standardize_text('test/test')
    'test_test'

    >>> standardize_text('   u\N{COMBINING DIAERESIS}ber\\n')
    'über'

    >>> standardize_text('embedded' + chr(9) + 'tab')
    'embedded_tab'

    >>> standardize_text('_')
    ''

    >>> standardize_text(',')
    ''
    """
    tokens = simple_tokenize(text.replace('_', ' '))
    if token_filter is not None:
        tokens = token_filter(tokens)
    return '_'.join(tokens)
Code example #16
File: word_counts.py  Project: dragon788/wordfreq
def count_tokens(filename):
    """
    Count tokens that appear in a file, running each line through our
    simple tokenizer.

    URLs will be skipped, and Unicode errors will become separate tokens
    containing '�'.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8', errors='replace') as infile:
        for line in infile:
            line = URL_RE.sub('', line.strip())
            for token in simple_tokenize(line):
                counts[token] += 1

    return counts
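
A usage sketch for count_tokens above, assuming it is importable along with URL_RE and simple_tokenize; the file name and contents are placeholders.

with open('sample.txt', 'w', encoding='utf-8') as f:
    f.write('Hello world\n')
    f.write('hello again, world! http://example.com\n')

# The URL is stripped before tokenizing, so the expected counts are
# roughly {'hello': 2, 'world': 2, 'again': 1}.
print(count_tokens('sample.txt'))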
Code example #17
File: word_counts.py  Project: xeroskiller/wordfreq
def count_tokens(filename):
    """
    Count tokens that appear in a file, running each line through our
    simple tokenizer.

    URLs will be skipped, and Unicode errors will become separate tokens
    containing '�'.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8', errors='replace') as infile:
        for line in infile:
            line = URL_RE.sub('', line.strip())
            for token in simple_tokenize(line):
                counts[token] += 1

    return counts
Code example #18
File: tokenize.py  Project: pombredanne/glove-1
def main(in_file, out_file, err):
    print("TOKENIZING", file=err)

    processed = 0
    last_print = 0
    step = 100000

    for line in in_file:
        if processed - last_print > step:
            last_print += step
            print("\033[1K\rProcessed %s tokens"%step, file=err, end='')

        tokens = simple_tokenize(line)
        print(" ".join(tokens), file=out_file)
        processed += len(tokens)

    print("\033[1K\rProcessed %s tokens"%processed, file=err)
    print(file=err)
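
A hypothetical driver for main() above, assuming it is defined in this module: raw text comes in on stdin, space-joined tokens go to stdout, and the progress display goes to stderr.

import sys

if __name__ == '__main__':
    main(sys.stdin, sys.stdout, sys.stderr)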
Code example #19
File: tokenize.py  Project: LuminosoInsight/glove
def main(in_file, out_file, err):
    print("TOKENIZING", file=err)

    processed = 0
    last_print = 0
    step = 100000

    for line in in_file:
        if processed - last_print > step:
            last_print += step
            print("\033[1K\rProcessed %s tokens" % step, file=err, end='')

        tokens = simple_tokenize(line)
        print(" ".join(tokens), file=out_file)
        processed += len(tokens)

    print("\033[1K\rProcessed %s tokens" % processed, file=err)
    print(file=err)
Code example #20
def valid_concept_name(text):
    """
    Returns whether this text can be reasonably represented in a concept
    URI. This helps to protect against making useless concepts out of
    empty strings or punctuation.
    >>> valid_concept_name('word')
    True
    >>> valid_concept_name('the')
    True
    >>> valid_concept_name(',,')
    False
    >>> valid_concept_name(',')
    False
    >>> valid_concept_name('/')
    False
    >>> valid_concept_name(' ')
    False
    """
    tokens = simple_tokenize(text.replace('_', ' '))
    return len(tokens) > 0
Code example #21
File: nodes.py  Project: commonsense/conceptnet5
def valid_concept_name(text):
    """
    Returns whether this text can be reasonably represented in a concept
    URI. This helps to protect against making useless concepts out of
    empty strings or punctuation.

    >>> valid_concept_name('word')
    True
    >>> valid_concept_name('the')
    True
    >>> valid_concept_name(',,')
    False
    >>> valid_concept_name(',')
    False
    >>> valid_concept_name('/')
    False
    >>> valid_concept_name(' ')
    False
    """
    tokens = simple_tokenize(text.replace('_', ' '))
    return len(tokens) > 0
Code example #22
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    creator_source = {}
    creator_node = join_uri(
        '/s/contributor/omcs', standardize_username(parts_dict["creator"])
    )
    creator_source['contributor'] = creator_node

    activity = parts_dict["activity"]
    activity = '_'.join(simple_tokenize(activity.replace('_', ' ')))
    activity_node = join_uri('/s/activity/omcs', activity)
    creator_source['activity'] = activity_node

    if preposition_fix:
        creator_source['process'] = '/s/process/preposition_fix'
    creator_source['weight'] = 1.
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            'contributor': join_uri(
                '/s/contributor/omcs', standardize_username(username)
            ),
            'activity': '/s/activity/omcs/vote',
            'weight': float(vote_int),
        }
        sources.append(vote_source)
    return sources
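
A sketch of the input build_sources expects, with made-up usernames and votes; it assumes the function and the helpers it calls (join_uri, standardize_username, simple_tokenize) are importable.

parts_dict = {
    'creator': 'some_user@example.com',
    'activity': 'example activity',
    'votes': [('some_user@example.com', 1), ('another_user', -1)],
}

# The first source credits the creator and the activity with weight 1; the
# creator's own vote is skipped, so the only extra source is another_user's
# vote with weight -1.
sources = build_sources(parts_dict)
print(sources)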
Code example #23
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    creator_source = {}
    creator_node = join_uri('/s/contributor/omcs',
                            standardize_username(parts_dict["creator"]))
    creator_source['contributor'] = creator_node

    activity = parts_dict["activity"]
    activity = '_'.join(simple_tokenize(activity.replace('_', ' ')))
    activity_node = join_uri('/s/activity/omcs', activity)
    creator_source['activity'] = activity_node

    if preposition_fix:
        creator_source['process'] = '/s/process/preposition_fix'
    creator_source['weight'] = 1.
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            'contributor': join_uri(
                '/s/contributor/omcs', standardize_username(username)
            ),
            'activity': '/s/activity/omcs/vote',
            'weight': float(vote_int),
        }
        sources.append(vote_source)
    return sources
Code example #24
File: nodes.py  Project: commonsense/conceptnet5
def preprocess_and_tokenize_text(lang, text):
    """
    Get a string made from the tokens in the text, joined by
    underscores.

    >>> preprocess_and_tokenize_text('en', ' cat')
    'cat'

    >>> preprocess_and_tokenize_text('en', 'Italian supercat')
    'italian_supercat'

    >>> preprocess_and_tokenize_text('en', 'a big dog')
    'a_big_dog'

    >>> preprocess_and_tokenize_text('en', 'Test?!')
    'test'

    >>> preprocess_and_tokenize_text('en', 'TEST.')
    'test'

    >>> preprocess_and_tokenize_text('en', 'test/test')
    'test_test'

    >>> preprocess_and_tokenize_text('de', '   u\N{COMBINING DIAERESIS}ber\\n')
    'über'

    >>> preprocess_and_tokenize_text('en', 'embedded' + chr(9) + 'tab')
    'embedded_tab'

    >>> preprocess_and_tokenize_text('en', '_')
    ''

    >>> preprocess_and_tokenize_text('en', ',')
    ''
    """
    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    return '_'.join(tokens)
Code example #25
# Assumes `api` is an already-authenticated python-twitter `twitter.Api`
# instance created with tweet_mode="extended", so that tweet.full_text exists.
from wordfreq import simple_tokenize
from nltk.corpus import stopwords   # requires the NLTK 'stopwords' corpus
import matplotlib.pyplot as plt

new_tweets = api.GetUserTimeline(screen_name="realDonaldTrump", count=200)
all_tweets = []
all_tweets.extend(new_tweets)
while len(new_tweets) != 0:
    oldest_id = all_tweets[-1].id - 1
    new_tweets = api.GetUserTimeline(screen_name="realDonaldTrump",
                                     count=200, max_id=oldest_id)
    all_tweets.extend(new_tweets)
    print("{} tweets retrieved so far...".format(len(all_tweets)))
trump_tweets = [tweet.full_text for tweet in all_tweets]

# Now, let's take a look at Trump's 10 most recent tweets just for kicks
print(trump_tweets[:10])

# Let's take a look at a few of Trump's most commonly-used words
tokenized_tweets = [simple_tokenize(tweet) for tweet in trump_tweets]
counts = {}
for tokenized in tokenized_tweets:
    for word in tokenized:
        # Skip English stopwords and Twitter-specific noise tokens
        if word not in stopwords.words("english"):
            if word != "https" and word != "t.co" and word != "rt":
                if word not in counts:
                    counts[word] = 1
                else:
                    counts[word] += 1
sorted_counts = sorted(counts, key=counts.get, reverse=True)
top_20_keys = sorted_counts[:20]
top_20_values = [counts[key] for key in top_20_keys]
plt.style.use("ggplot")
plt.bar(top_20_keys, top_20_values, color="blue")
plt.ylabel("Frequencies")
Code example #26
def filter_stopwords(text):
    words = [word for word in simple_tokenize(text) if word not in MORE_STOPWORDS]
    text2 = ' '.join(words)
    if not text2:
        text2 = text
    return text2
Code example #27
File: opencyc.py  Project: commonsense/conceptnet5
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        if pred['url'] == RDF_LABEL:
            labels[subj['url']] = obj['text']
            unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        rel_name = resource_name(pred['url'])
        web_subj = subj.get('url')
        web_obj = obj.get('url')
        if (
            rel_name == 'subClassOf'
            and web_obj is not None
            and web_subj in labels
            and web_obj in labels
        ):
            subj_label = labels[web_subj]
            obj_label = labels[web_obj]
            if '_' in subj_label or '_' in obj_label:
                continue
            if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                continue
            subj_words = set(simple_tokenize(subj_label))
            obj_words = set(simple_tokenize(obj_label))
            if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                continue
            if len(subj_words) > 4 or len(obj_words) > 4:
                continue

            subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
            obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
            out.write(opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label, obj_label))
            if (subj_uri, web_subj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_subj))
                seen_external_urls.add((subj_uri, web_subj))
            if (obj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(obj_uri, web_obj))
                seen_external_urls.add((obj_uri, web_obj))
        elif (
            rel_name == 'sameAs'
            and web_subj in labels
            and web_obj.startswith('http://umbel.org/')
        ):
            subj_label = labels[web_subj]
            subj_uri = standardized_concept_uri('en', subj_label)
            if (subj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_obj))
                seen_external_urls.add((subj_uri, web_obj))

    out.close()