Code example #1
File: test_packer.py  Project: kaharjan/spaCy
# Imports assumed by the snippet (spaCy 1.x-era API).
import os

from spacy import util
from spacy.en import English


def vocab():
    # Locate the installed English model data, preferring SPACY_DATA if set.
    path = os.environ.get('SPACY_DATA')
    if path is None:
        path = util.match_best_version('en', None, util.get_data_path())
    else:
        path = util.match_best_version('en', None, path)

    vocab = English.Defaults('en', path).Vocab()
    lex = vocab['dog']
    assert vocab[vocab.strings['dog']].orth_ == 'dog'
    # Touch a few more words so their lexemes are added to the vocab.
    lex = vocab['the']
    lex = vocab['quick']
    lex = vocab['jumped']
    return vocab
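
The helper above returns a Vocab preloaded with a few lexemes. A minimal usage sketch, using only the attributes that appear in the excerpt; the round-trip check through `strings` and `orth` is an illustrative assumption, not part of the original test:

v = vocab()
for word in ('dog', 'the', 'quick', 'jumped'):
    lex = v[word]                        # lexeme lookup by surface string
    assert lex.orth_ == word             # string form round-trips
    assert v.strings[word] == lex.orth   # string-store ID matches the lexeme ID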
Code example #2
# coding: utf-8
# Python 2 module: note the cPickle import and the ur'' string literals below.

import cPickle as pickle
import re
import string

from spacy.en import English
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

# Stop list: spaCy's defaults, extended with scikit-learn's English stop words
# and a few contraction fragments.
STOPLIST = English.Defaults().stop_words
STOPLIST |= ENGLISH_STOP_WORDS
STOPLIST |= set(["n't", "'s", "'m", "ca"])

# Punctuation characters plus a few multi-character symbols.
SYMBOLS = set(" ".join(string.punctuation).split(" ")) |\
    set(["-----", "---", "...", "“", "”", "'ve"])

# Load the English pipeline without the parser or the rule matcher.
nlp = English(parser=False, matcher=False)


def preprocess(doc):
    # Lowercase, then replace URLs, hashtags, numbers and @-mentions
    # with placeholder tokens.
    doc = doc.lower().strip()
    doc = re.sub(ur'https?:\/\/\S+\b|www\.(\w+\.)+\S*', '<URL>', doc)
    doc = re.sub(ur'#\S+', '<HASHTAG>', doc)
    doc = re.sub(ur'[-+]?[.\d]*[\d]+[:,.\d]*', '<NUMBER>', doc)
    doc = re.sub(ur'@\w+', '<USER>', doc)
    # Normalise whitespace and split '/' into its own token.
    doc = doc.replace(u'\n', ' ')
    doc = doc.replace(u'\r', ' ')
    doc = doc.replace(u'/', ' / ')
    doc = re.sub(ur'\s{2,}', ' ', doc)

    defined_tags = set([u'USER', u'URL', u'HASHTAG', u'NUMBER'])