Example #1
from nltk import word_tokenize
from nltk.tag import _pos_tag
from nltk.tag.perceptron import PerceptronTagger
from nltk.util import ngrams

tagger = PerceptronTagger()  # module-level tagger assumed by this snippet

def POS_Ngram(N, example_set, i):
    """Return normalized POS N-gram frequencies over the i-th sentence of each paragraph."""
    N_grams = dict()
    count = 0
    for para in example_set:
        if i == 0:  # the first sentence is stored separately
            tokens = word_tokenize(para.first)
        else:  # i-th sentence in the corrected order
            para.order_sentence()
            tokens = word_tokenize(para.ordered_sentences[i - 1])
        tagset = None
        tokens = _pos_tag(tokens, tagset, tagger)

        tags = [x[1] for x in tokens]  # keep the POS tags only

        n_tags = list(ngrams(tags, N))

        for tag_set in n_tags:
            count += 1
            if tag_set in N_grams:
                N_grams[tag_set] += 1
            else:
                N_grams[tag_set] = 1  # first occurrence of this tag set
    # Normalize N-gram counts by the total number of N-grams for this set of sentences
    for ngram, num in N_grams.items():
        N_grams[ngram] = num / count
    return N_grams
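
For orientation, here is a minimal self-contained sketch of the same idea (normalized POS n-gram frequencies) using the public nltk.pos_tag API instead of the private _pos_tag helper; the sample sentences are invented for illustration:

from collections import Counter

from nltk import pos_tag, word_tokenize
from nltk.util import ngrams

def pos_ngram_freqs(sentences, n=2):
    # Count POS n-grams over all sentences, then normalize by the total count.
    # Requires NLTK's tokenizer and tagger models to be downloaded.
    counts = Counter()
    for sent in sentences:
        tags = [t for _, t in pos_tag(word_tokenize(sent))]
        counts.update(ngrams(tags, n))
    total = sum(counts.values())
    return {gram: c / total for gram, c in counts.items()}

print(pos_ngram_freqs(["The cat sat on the mat.", "Dogs bark loudly."]))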
Example #2

import gzip
import sys

import numpy as np
from nltk.tag import _pos_tag
from nltk.tag.perceptron import PerceptronTagger
from nltk.tokenize import TweetTokenizer

# load_glove_vec is a project-local helper that reads word vectors from disk.

def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()

    fout = 'embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname)
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_{}'.format(input_fname), ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    tagdict = tagger.tagdict
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)

    print(nRows, ':', nCols)

    # Assign each POS tag a fixed column index.
    for counter, tag in enumerate(tagdict.keys()):
        tagidx[tag] = counter

    # One binary indicator vector per known word: which tags it occurs with.
    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)

    print(tagidx)

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    it = 0
    files = [train, test, dev, test15, smiley_pos]
    for filen in files:
        for tweet in gzip.open(filen, 'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word, tag) in tags:
                if word in exp_wemb and tag in tagidx:
                    exp_wemb[word][tagidx[tag]] = 1
            if (it % 10) == 0:
                print('Progress:', it)
            it += 1

    # Write each word followed by its tag-indicator vector, one word per line.
    with open(fout, 'w') as f:
        for word in exp_wemb:
            f.write(word)
            for v in np.nditer(exp_wemb[word]):
                f.write(' {}'.format(v))
            f.write("\n")
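
In effect, main() marks, for each word, every POS tag the word was observed with across the corpora. A toy sketch of that core update, with invented words and tags:

import numpy as np

tagidx = {'NN': 0, 'VB': 1, 'JJ': 2}       # tag -> column index
exp_wemb = {'run': np.zeros(len(tagidx))}  # one indicator row per word

# Observed (word, tag) pairs, as produced by the tagger over the corpora.
for word, tag in [('run', 'NN'), ('run', 'VB')]:
    if word in exp_wemb and tag in tagidx:
        exp_wemb[word][tagidx[tag]] = 1

print(exp_wemb['run'])  # [1. 1. 0.] -> 'run' was seen as noun and verb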
Example #4
import gensim
from nltk.corpus import stopwords
from nltk.tag import _pos_tag
from nltk.tag.perceptron import PerceptronTagger

_perceptronTagger = PerceptronTagger()  # module-level tagger assumed by this snippet

def tokenize(raw_content, part_tag='NN'):
    # Lowercase, strip accents, and tokenize the raw text.
    tokens = list(gensim.utils.tokenize(raw_content, lowercase=True, deacc=True))

    # Drop English stopwords.
    standard_stopwords = stopwords.words('english')
    tokens = [word for word in tokens if word not in standard_stopwords]

    # Optionally keep only tokens carrying the requested POS tag (nouns by default).
    if part_tag is not None:
        tokens = [
            ww for ww, p in _pos_tag(tokens, None, _perceptronTagger)
            if p == part_tag
        ]
    return tokens
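
A hypothetical call (the exact output depends on the installed tagger and stopword models):

text = "Compression algorithms trade speed for accuracy."
print(tokenize(text))        # nouns only, since part_tag defaults to 'NN'
print(tokenize(text, None))  # skip the POS filter entirely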
Example #5
def arrayTagger(s):
    # 'tag' is the nltk.tag module; 'tagger' is assumed to be a module-level
    # PerceptronTagger instance. Returns the POS tag of each token in s.
    taggs = [pos for word, pos in tag._pos_tag(s, None, tagger)]
    return taggs
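
Assuming tagger = PerceptronTagger() at module level, a call might look like:

print(arrayTagger(["I", "love", "Python"]))
# e.g. ['PRP', 'VBP', 'NNP'], depending on the tagger model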
Example #6
def pos_tag_tweet(tweet):
    """POS tag a tweet that has already been split into sentences (token lists)."""
    # _TAGGER is assumed to be a module-level PerceptronTagger instance.
    return tuple(_pos_tag(sentence, None, _TAGGER) for sentence in tweet)
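
A hypothetical call with a pre-tokenized two-sentence tweet:

tweet = [["Loving", "this", "weather"], ["so", "good"]]
print(pos_tag_tweet(tweet))  # one list of (word, tag) pairs per sentence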
Example #7

def __call__(self, tokens, tagset='universal'):
    # Method excerpt: self._tagger and self._tagger_lang are set on the host class;
    # this variant of _pos_tag also takes a language argument.
    return _pos_tag(tokens, tagset, self._tagger, self._tagger_lang)
Example #8
def add_token(self, paragraph):
    # Method excerpt: self.tokens holds one token list per sentence and
    # self.tagger is a PerceptronTagger instance.
    sentences = paragraph.scrambled_sentences
    for sentence, token in zip(sentences, self.tokens):
        pos_tokens = _pos_tag(word_tokenize(sentence), None, self.tagger)
        pos_tokens = [x[1] for x in pos_tokens]  # keep the POS tags only
        token.extend(pos_tokens)