def POS_Ngram(N, example_set, i):
    """Build a normalized distribution of POS-tag N-grams for the i-th
    sentence of every paragraph in example_set.

    Args:
        N: size of the n-gram window over POS tags.
        example_set: iterable of paragraph objects exposing `first`
            (first-sentence text), `order_sentence()` and
            `ordered_sentences` — assumed interface, confirm with caller.
        i: sentence selector; 0 picks `para.first`, otherwise
            `para.ordered_sentences[i-1]`.

    Returns:
        dict mapping POS-tag tuples to their relative frequency
        (count / total n-grams). Empty dict if nothing was tagged.
    """
    N_grams = {}
    count = 0
    for para in example_set:
        if i == 0:
            # first sentence is stored separately on the paragraph
            tokens = word_tokenize(para.first)
        else:
            # ensure the ordered sentence list is populated before indexing
            para.order_sentence()
            tokens = word_tokenize(para.ordered_sentences[i - 1])
        tagset = None
        # NOTE(review): `tagger` is resolved from enclosing/global scope —
        # confirm it is defined at module level.
        tokens = _pos_tag(tokens, tagset, tagger)
        tags = [x[1] for x in tokens]  # keep the POS tags only
        for tag_set in ngrams(tags, N):
            count += 1
            N_grams[tag_set] = N_grams.get(tag_set, 0) + 1
    # Normalize counts to relative frequencies.  float() is required: under
    # Python 2 (this file uses print statements) `num/count` is integer
    # division and would floor every ratio to 0.  The guard also avoids
    # ZeroDivisionError when no n-grams were produced.
    if count:
        for key in N_grams:
            N_grams[key] = N_grams[key] / float(count)
    return N_grams
def main(): input_fname = 'small' if len(sys.argv) > 1: input_fname = sys.argv[1] tknzr = TweetTokenizer() tagger = PerceptronTagger() fout = ( 'embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname)) fname, delimiter, ndim = ( 'embeddings/smiley_tweets_embedding_{}'.format(input_fname), ' ', 52) word2vec = load_glove_vec(fname, {}, delimiter, ndim) tagdict = tagger.tagdict tagidx = {} nRows = len(word2vec) nCols = len(tagdict) print nRows, ':', nCols counter = 0 for tag in tagdict.keys(): tagidx[tag] = counter counter += 1 exp_wemb = {} for word in word2vec.keys(): exp_wemb[word] = np.zeros(nCols) print tagidx train = "semeval/task-B-train-plus-dev.tsv.gz" test = "semeval/task-B-test2014-twitter.tsv.gz" dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz" test15 = "semeval/task-B-test2015-twitter.tsv.gz" smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname) it = 0 files = [train, test, dev, test15, smiley_pos] for filen in files: for tweet in gzip.open(filen, 'rb'): tweet = tknzr.tokenize(tweet.decode('utf-8')) tags = _pos_tag(tweet, None, tagger) for (word, tag) in tags: if word in exp_wemb.keys() and tag in tagidx.keys(): idx = tagidx[tag] exp_wemb[word][idx] = 1 if (it % 10) == 0: print 'Progress:', it it += 1 f = open(fout, 'wb') for word in exp_wemb: f.write(word) tags = exp_wemb[word] for i in np.nditer(tags): f.write(' {}'.format(i)) fname.write("\n")
def main(): input_fname = 'small' if len(sys.argv) > 1: input_fname = sys.argv[1] tknzr = TweetTokenizer() tagger = PerceptronTagger() fout = ('embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname)) fname,delimiter,ndim = ('embeddings/smiley_tweets_embedding_{}'.format(input_fname),' ',52) word2vec = load_glove_vec(fname,{},delimiter,ndim) tagdict = tagger.tagdict tagidx = {} nRows = len(word2vec) nCols = len(tagdict) print nRows,':',nCols counter = 0 for tag in tagdict.keys(): tagidx[tag] = counter counter += 1 exp_wemb = {} for word in word2vec.keys(): exp_wemb[word] = np.zeros(nCols) print tagidx train = "semeval/task-B-train-plus-dev.tsv.gz" test = "semeval/task-B-test2014-twitter.tsv.gz" dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz" test15 = "semeval/task-B-test2015-twitter.tsv.gz" smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname) it = 0 files = [train,test,dev,test15,smiley_pos] for filen in files: for tweet in gzip.open(filen,'rb'): tweet = tknzr.tokenize(tweet.decode('utf-8')) tags = _pos_tag(tweet, None, tagger) for (word,tag) in tags: if word in exp_wemb.keys() and tag in tagidx.keys(): idx = tagidx[tag] exp_wemb[word][idx] = 1 if (it%10) == 0: print 'Progress:',it it += 1 f = open(fout,'wb') for word in exp_wemb: f.write(word) tags = exp_wemb[word] for i in np.nditer(tags): f.write(' {}'.format(i)) fname.write("\n")
def tokenize(raw_content, part_tag='NN'):
    """Tokenize text, drop English stopwords, and optionally keep only
    tokens whose POS tag equals `part_tag`.

    Args:
        raw_content: raw text string.
        part_tag: Penn Treebank tag to keep (default 'NN'), or None to
            skip POS filtering entirely.

    Returns:
        list of lowercase, de-accented tokens.
    """
    # `lowercase=True` is sufficient: the original also passed gensim's
    # redundant aliases `to_lower=True, lower=True`, which all set the
    # same flag.
    tokens = list(gensim.utils.tokenize(
        raw_content, lowercase=True, deacc=True, errors='strict'))
    # A set gives O(1) membership per token instead of scanning the
    # ~180-entry stopword list; tokens are already lowercase, so no
    # per-token .lower() call is needed.
    standard_stopwords = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in standard_stopwords]
    if part_tag is not None:
        tokens = [ww for ww, p in _pos_tag(tokens, None, _perceptronTagger)
                  if p == part_tag]
    return tokens
def arrayTagger(s):
    """Return the POS tag of every token in `s`, in order.

    `s` is passed straight to `tag._pos_tag` with the module-level
    `tagger`; only the tag half of each (word, tag) pair is kept.
    """
    tagged_pairs = tag._pos_tag(s, None, tagger)
    return [pair[1] for pair in tagged_pairs]
def pos_tag_tweet(tweet):
    """POS tag tweets split already into sentences.

    `tweet` is an iterable of sentences; each sentence is tagged with the
    module-level `_TAGGER` and the results are returned as a tuple, one
    entry per sentence.
    """
    tagged_sentences = []
    for sentence in tweet:
        tagged_sentences.append(_pos_tag(sentence, None, _TAGGER))
    return tuple(tagged_sentences)
def __call__(self, tokens, tagset='universal'):
    """Tag `tokens` with the wrapped tagger, using its configured language.

    Delegates directly to `_pos_tag`; `tagset` defaults to the universal
    tagset.
    """
    tagged = _pos_tag(tokens, tagset, self._tagger, self._tagger_lang)
    return tagged
def add_token(self, paragraph):
    """Append the POS tags of each scrambled sentence in `paragraph` to the
    corresponding entry of `self.tokens` (pairing is positional via zip).
    """
    for sent, tok in zip(paragraph.scrambled_sentences, self.tokens):
        tagged = _pos_tag(word_tokenize(sent), None, self.tagger)
        tok.extend([pair[1] for pair in tagged])