def vocab():
    """Build the English vocabulary and look up a handful of words in it.

    The data directory comes from the ``SPACY_DATA`` environment variable
    when it is set, otherwise from ``util.get_data_path()``. In both cases
    it is resolved through ``util.match_best_version('en', None, ...)``.

    Returns:
        The object produced by ``English.Defaults('en', path).Vocab()``,
        after 'dog', 'the', 'quick' and 'jumped' have been looked up in it.

    Raises:
        AssertionError: if looking 'dog' up through ``strings`` does not
            give back an entry whose ``orth_`` is 'dog'.
    """
    data_dir = os.environ.get('SPACY_DATA')
    if data_dir is None:
        data_dir = util.get_data_path()
    model_path = util.match_best_version('en', None, data_dir)

    en_vocab = English.Defaults('en', model_path).Vocab()

    # Only the lookups themselves matter; the returned entries are discarded.
    _ = en_vocab['dog']
    assert en_vocab[en_vocab.strings['dog']].orth_ == 'dog'
    for word in ('the', 'quick', 'jumped'):
        _ = en_vocab[word]
    return en_vocab
# coding: utf-8 import cPickle as pickle import re import string from spacy.en import English from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS STOPLIST = English.Defaults().stop_words STOPLIST |= ENGLISH_STOP_WORDS STOPLIST |= set(["n't", "'s", "'m", "ca"]) SYMBOLS = set(" ".join(string.punctuation).split(" ")) |\ set(["-----", "---", "...", "“", "”", "'ve"]) nlp = English(parser=False, matcher=False) def preprocess(doc): doc = doc.lower().strip() doc = re.sub(ur'https?:\/\/\S+\b|www\.(\w+\.)+\S*', '<URL>', doc) doc = re.sub(ur'#\S+', '<HASHTAG>', doc) doc = re.sub(ur'[-+]?[.\d]*[\d]+[:,.\d]*', '<NUMBER>', doc) doc = re.sub(ur'@\w+', '<USER>', doc) doc = doc.replace(u'\n', ' ') doc = doc.replace(u'\r', ' ') doc = doc.replace(u'/', ' / ') doc = re.sub(ur'\s{2,}', ' ', doc) defined_tags = set([u'USER', u'URL', u'HASHTAG', u'NUMBER'])
# NOTE(review): as it appears here, everything on the line above -- the
# encoding declaration, the imports, the module constants and the start of
# `preprocess` -- sits on one physical line that begins with `#`, so Python
# reads all of it as a comment. The statement line breaks need to be restored
# before any of it can run. These notes are placed below that line so the
# encoding declaration stays where it is.
#
# What the statements on that line say, in order:
#   * STOPLIST starts as `English.Defaults().stop_words` and is extended in
#     place (`|=`) with scikit-learn's ENGLISH_STOP_WORDS and then with the
#     fragments "n't", "'s", "'m" and "ca".
#   * SYMBOLS is the set of the individual `string.punctuation` characters
#     plus "-----", "---", "...", the two curly double quotes and "'ve".
#   * nlp is an `English` object constructed with parser=False and
#     matcher=False.
#   * preprocess(doc) lower-cases and strips `doc`, then substitutes, in this
#     order: http(s)/www URLs -> '<URL>', '#' followed by non-whitespace ->
#     '<HASHTAG>', numeric runs -> '<NUMBER>', '@' followed by word
#     characters -> '<USER>'. After that it replaces '\n' and '\r' with a
#     space, pads every '/' with spaces and collapses runs of two or more
#     whitespace characters into a single space.
#   * `defined_tags` is the set of the four placeholder names (USER, URL,
#     HASHTAG, NUMBER). The text is cut off right after it, so how it is used
#     and what `preprocess` returns are not visible here.
#
# NOTE(review): `cPickle` and the `ur'...'` literal prefix exist only in
# Python 2; this code will not parse under Python 3 as written.