import gensim
from collections import Counter, defaultdict

# tokenize, utils, mapper, lister, and the *_PATH constants are project-level
# helpers assumed to be defined elsewhere in the repo.


def frequency_vectorize_gensim(corpus):
    # Term-frequency (bag-of-words) vectors built from a gensim Dictionary.
    corpus = [tokenize(doc) for doc in corpus]
    id2word = gensim.corpora.Dictionary(corpus)
    vectors = [id2word.doc2bow(doc) for doc in corpus]
    return vectors

def one_hot_vectorize_gensim(corpus):
    # Same as above, but clamp every count to 1 (binary bag-of-words).
    corpus = [tokenize(doc) for doc in corpus]
    id2word = gensim.corpora.Dictionary(corpus)
    vectors = [
        [(token_id, 1) for token_id, _count in id2word.doc2bow(doc)]
        for doc in corpus
    ]
    return vectors

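# A minimal usage sketch for the two gensim vectorizers above, assuming
# tokenize is a plain whitespace tokenizer (the project's helper may differ;
# exact token ids depend on gensim's internal assignment order):
#
#   docs = ["the cat sat", "the cat sat on the mat"]
#   frequency_vectorize_gensim(docs)
#   # -> one list of (token_id, count) pairs per document; the second
#   #    document's entry for "the" carries a count of 2
#   one_hot_vectorize_gensim(docs)
#   # -> the same pairs with every count clamped to 1
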
def post_number(sentence):
    # Expand an abbreviation only when the next token is a number
    # (e.g. a unit or address abbreviation directly preceding digits).
    words = utils.tokenize(sentence)
    abb_map = mapper.load(POSTNUM_ABB_PATH, first_caps=True)
    for i in range(1, len(words)):
        word = utils.strip(words[i - 1])
        if word in abb_map and utils.is_number(utils.strip(words[i])):
            words[i - 1] = utils.replace(words[i - 1], str(abb_map[word]))
    return " ".join(words)

def basic(sentence):
    # Replace every token that appears in the general abbreviation map.
    words = utils.tokenize(sentence)
    abb_map = mapper.load(ABB_PATH)
    for i in range(len(words)):
        word = utils.strip(words[i])
        if word in abb_map:
            words[i] = utils.replace(words[i], str(abb_map[word]))
    return " ".join(words)

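# Illustrative only: if ABB_PATH contained a hypothetical entry mapping
# "dr" -> "doctor" (not taken from the repo), basic("dr Smith") would
# return "doctor Smith".
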
def full_name(sentence):
    # Abbreviate a known first name to its initial when it directly
    # precedes a known surname.
    words = utils.tokenize(sentence)
    first_list = lister.load(FIRSTNAME_PATH)
    sur_list = lister.load(SURNAME_PATH)
    for i in range(1, len(words)):
        first = utils.strip(words[i - 1])
        last = utils.strip(words[i])
        if last in sur_list and first in first_list:
            words[i - 1] = utils.replace(words[i - 1], first[:1] + ".")
    return " ".join(words)

def build_model(self, corpus):
    """
    Build a bigram model.

    :param corpus: space-separated string of all sentences
    """
    words = utils.tokenize(corpus)
    self.word_model = Counter(words)  # Count(word)
    # Chunk size 2 gives bigrams; any arbitrary n-gram size would work.
    bigrams = list(utils.get_chunks(words, 2))
    self.bigram_model = defaultdict(Counter)  # Count(word2 | word1)
    for tup in bigrams:
        if len(tup) == 2:  # guard against a short trailing chunk
            self.bigram_model[tup[0]][tup[1]] += 1
    self.save_model()

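# With the counts in place, a maximum-likelihood bigram probability can be
# read off directly. A minimal sketch (score is a hypothetical method, not
# part of the original class):
#
#   def score(self, w1, w2):
#       """MLE estimate of P(w2 | w1) = Count(w1, w2) / Count(w1)."""
#       if self.word_model[w1] == 0:
#           return 0.0
#       return self.bigram_model[w1][w2] / self.word_model[w1]
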
def execute(sentence):
    # Convert a run of spelled-out number words into a single numeral.
    words = utils.tokenize(sentence)
    parsed = []
    last_ok = -1
    num_map = mapper.load(NUM_PATH)
    for i in range(len(words)):
        word = words[i]
        changed = utils.strip(word)
        if changed in num_map and last_ok < i:
            # Start of a spelled-out number: accumulate it greedily.
            number = 0  # completed thousand/million groups
            buffer = 0  # the group currently being built
            last_ok = i
            for j in range(i, len(words)):
                actual = utils.strip(words[j])
                if actual not in num_map:
                    break
                if num_map[actual] == '1000000':
                    if buffer == 0:
                        break
                    number += 1000000 * buffer
                    buffer = 0
                elif num_map[actual] == '1000':
                    if buffer == 0:
                        break
                    number += 1000 * buffer
                    buffer = 0
                elif num_map[actual] == '100':
                    buffer = buffer * 100
                else:
                    buffer += int(num_map[actual])
                last_ok = j
                # Stop if stripping changed the token (it carried punctuation).
                if actual != words[j]:
                    break
            number += buffer
            parsed.append(utils.replace(words[last_ok], str(number)))
        else:
            if last_ok < i:
                parsed.append(utils.replace(word, changed))
    return " ".join(parsed)

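# Worked trace of the accumulation above, assuming num_map sends five
# consecutive tokens to '2', '100', '35', '1000', '6' (hypothetical entries):
#   '2'    -> buffer = 2
#   '100'  -> buffer = 2 * 100 = 200
#   '35'   -> buffer = 200 + 35 = 235
#   '1000' -> number += 235 * 1000 = 235000, buffer = 0
#   '6'    -> buffer = 6
# After the loop, number += buffer gives 235006, which replaces the phrase.
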
def tf_idf_vectorize_gensim(corpus):
    corpus = [tokenize(doc) for doc in corpus]
    lexicon = gensim.corpora.Dictionary(corpus)
    tfidf = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)
    vectors = [tfidf[lexicon.doc2bow(doc)] for doc in corpus]
    return vectors

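# Output sketch for the TF-IDF vectorizer: each vector is a list of
# (token_id, weight) pairs. With normalize=True the weights of each document
# are L2-normalized, and gensim's default idf (log base 2 of D / df) zeroes
# out tokens that occur in every document, so those pairs are dropped.
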
def frequency_vectorize_nltk(doc):
    # Token-frequency feature dict, in the form NLTK's classifiers expect.
    features = defaultdict(int)
    for token in tokenize(doc):
        features[token] += 1
    return features

def one_hot_vectorize_nltk(doc):
    vectors = {token: True for token in tokenize(doc)}
    return vectors

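# Usage sketch for the two NLTK-style vectorizers, again assuming a plain
# whitespace tokenize:
#
#   frequency_vectorize_nltk("the cat sat on the mat")
#   # -> {'the': 2, 'cat': 1, 'sat': 1, 'on': 1, 'mat': 1} (a defaultdict)
#   one_hot_vectorize_nltk("the cat sat")
#   # -> {'the': True, 'cat': True, 'sat': True}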