import nltk
from collections import Counter
from nltk.tag import map_tag
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing


def get_sentiment_count_data(train, test):
	sent_count_train = []
	sent_count_test = []
	v = DictVectorizer(sparse=False)
	for id in test:
		dist = nltk.FreqDist(products[id]['all_pos'].split())
		new_dist = Counter()
		for tag, count in dist.items():
			new_dist[map_tag('en-ptb', 'universal', tag)] += count
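		# Formality F-score (after Heylighen & Dewaele): half of the formal POS
		# counts (nouns, adjectives, adpositions, determiners) minus the deictic
		# counts (interjections, verbs, adverbs, pronouns), plus 100. 'UH' is read
		# from the raw PTB distribution because the universal tagset has no
		# interjection tag.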
		Fscore = 0.5 * ((new_dist['NOUN']+new_dist['ADJ']+new_dist['ADP']+new_dist['DET']) - (dist['UH']+new_dist['VERB']+new_dist['ADV']+new_dist['PRON']) + 100)
		neg_count = 0
		pos_count = 0
		suma = 0
		emotion_words = 0
		for review in products[id]['reviews']:        
			for feature,adjective,score in review['opinions']:
				if score is not None:
					if score < 0:
						neg_count += 1
					else:
						pos_count += 1
					suma += score
					emotion_words += 1
		nwords = len(products[id]['all_text'].split())
		eRatio = emotion_words*1.0/nwords
		posToAllRatio = pos_count*1.0/(pos_count+neg_count)
		emotionFeatures = {'Fscore':Fscore,'eStrength':suma*1.0/emotion_words,'eRatio':eRatio,'posToAllRatio':posToAllRatio}
		sent_count_test.append(emotionFeatures)
	for id in train:
		dist = nltk.FreqDist(products[id]['all_pos'].split())
		new_dist = Counter()
		for tag, count in dist.items():
			new_dist[map_tag('en-ptb', 'universal', tag)] += count
		Fscore = 0.5 * ((new_dist['NOUN']+new_dist['ADJ']+new_dist['ADP']+new_dist['DET']) - (dist['UH']+new_dist['VERB']+new_dist['ADV']+new_dist['PRON']) + 100)
		neg_count = 0
		pos_count = 0
		suma = 0
		emotion_words = 0
		for review in products[id]['reviews']:
			for feature,adjective,score in review['opinions']:
				if score is not None:
					if score < 0:
						neg_count += 1
					else:
						pos_count += 1
					suma += score
					emotion_words += 1
		nwords = len(products[id]['all_text'].split())
		eRatio = emotion_words*1.0/nwords
		posToAllRatio = pos_count*1.0/(pos_count+neg_count)
		emotionFeatures = {'Fscore':Fscore,'eStrength':suma*1.0/emotion_words,'eRatio':eRatio,'posToAllRatio':posToAllRatio}
		sent_count_train.append(emotionFeatures)

	X_sent_train = v.fit_transform(sent_count_train)
	X_sent_test = v.transform(sent_count_test)
	scaler = preprocessing.StandardScaler().fit(X_sent_train)
	X_train = scaler.transform(X_sent_train)
	X_test = scaler.transform(X_sent_test)

	return sent_count_train, sent_count_test, X_train, X_test
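A minimal usage sketch (the id lists, labels and classifier below are illustrative assumptions, not taken from the project above): the scaled matrices returned by get_sentiment_count_data can be fed directly to a scikit-learn estimator.

from sklearn.svm import SVC

# train_ids / test_ids index the module-level `products` dict and
# y_train holds matching training labels -- all assumed here.
_, _, X_train, X_test = get_sentiment_count_data(train_ids, test_ids)
clf = SVC(kernel='linear').fit(X_train, y_train)
predicted = clf.predict(X_test)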
Example no. 2
 def _get_iob_words(self, grid, tagset=None):
     pos_tags = self._get_column(grid, self._colmap['pos'])
     if tagset and tagset != self._tagset:
         pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
     return list(
         zip(self._get_column(grid, self._colmap['words']), pos_tags,
             self._get_column(grid, self._colmap['chunk'])))
Example no. 3
    def _get_chunked_words(self, grid, chunk_types, tagset=None):
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap['words'])
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        chunk_tags = self._get_column(grid, self._colmap['chunk'])

        stack = [Tree(self._root_label, [])]

        for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
            if chunk_tag == 'O':
                state, chunk_type = 'O', ''
            else:
                (state, chunk_type) = chunk_tag.split('-')
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = 'O'
            # Treat a mismatching I like a B.
            if state == 'I' and chunk_type != stack[-1].label():
                state = 'B'
            # For B or I: close any open chunks
            if state in 'BO' and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == 'B':
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]
Example no. 4
def get_phrase_type(phrase):
    tagged_phrase = nltk.pos_tag(phrase)
    tagged_phrase = [(word, map_tag('en-ptb', 'universal', tag))
                     for word, tag in tagged_phrase]
    result = chunk_parser.parse(tagged_phrase)
    phrase_type = str(result[0])[1:3]
    return phrase_type
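chunk_parser here is defined elsewhere in that project; a plausible stand-in (purely illustrative, not the original) is an nltk.RegexpParser grammar over the universal tags, so that result[0] is an NP/VP subtree whose two-letter label the slice [1:3] extracts.

import nltk

# Illustrative grammar only; the original project's chunk_parser may differ.
chunk_parser = nltk.RegexpParser(r"""
    NP: {<DET>?<ADJ>*<NOUN>+}
    VP: {<VERB>+}
""")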
Example no. 5
 def _get_iob_words(self, grid, tagset=None):
     pos_tags = self._get_column(grid, self._colmap["pos"])
     if tagset and tagset != self._tagset:
         pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
     return list(
         zip(self._get_column(grid, self._colmap["words"]), pos_tags, self._get_column(grid, self._colmap["chunk"]))
     )
Example no. 6
    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        words = self._get_column(grid, self._colmap['words'])
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap['tree'])

        treestr = ''
        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
            if word == '(': word = '-LRB-'
            if word == ')': word = '-RRB-'
            if pos_tag == '(': pos_tag = '-LRB-'
            if pos_tag == ')': pos_tag = '-RRB-'
            (left, right) = parse_tag.split('*')
            right = right.count(')') * ')'  # only keep ')'.
            treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
        try:
            tree = self._tree_class.parse(treestr)
        except (ValueError, IndexError):
            tree = self._tree_class.parse('(%s %s)' %
                                          (self._root_label, treestr))

        if not pos_in_tree:
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (isinstance(child, Tree) and len(child) == 1
                            and isinstance(child[0], compat.string_types)):
                        subtree[i] = (child[0], child.label())

        return tree
Example no. 7
 def tagged_paras(self, fileids=None, tagset=None):
     """
     :return: the given file(s) as a list of
         paragraphs, each encoded as a list of sentences, which are
         in turn encoded as lists of ``(word,tag)`` tuples.
     :rtype: list(list(list(tuple(str,str))))
     """
     if tagset and tagset != self._tagset:
         tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
     else:
         tag_mapping_function = None
     return concat([
         TaggedCorpusView(
             fileid,
             enc,
             True,
             True,
             True,
             self._sep,
             self._word_tokenizer,
             self._sent_tokenizer,
             self._para_block_reader,
             tag_mapping_function,
         ) for (fileid, enc) in self.abspaths(fileids, True)
     ])
Example no. 8
def getPhraseTranslation1():
    phrase = request.form['phrase']
    language = request.form['language']

    translatedPhrase = get_translation_free(phrase, language)

    res_english = pos_tagger_english.tag(word_tokenize(phrase))
    simplified_pos_tags_english = [(word, map_tag('en-ptb', 'universal', tag))
                                   for word, tag in res_english]
    simplified_pos_tags_translated = []
    if language == "fr":
        res_french = pos_tagger_french.tag(word_tokenize(translatedPhrase))
        print(res_french, res_english)
        simplified_pos_tags_translated = map_french_tag_to_universal(
            res_french)

    elif language == "es":
        res_spanish = pos_tagger_spanish.tag(word_tokenize(translatedPhrase))
        simplified_pos_tags_translated = map_spanish_tag_to_universal(
            res_spanish)

    taggedPhrase = [
        '_'.join(str(i) for i in tup) for tup in simplified_pos_tags_english
    ]
    taggedTranslatedPhrase = [
        '_'.join(str(i) for i in tup) for tup in simplified_pos_tags_translated
    ]
    taggedPhrase.append("NEWLINE")
    taggedPhrase = taggedPhrase + taggedTranslatedPhrase
    data = {"taggedText": taggedPhrase}
    print(data)
    return jsonify(data)
Example no. 9
 def tagged_words(self, fileids=None, tagset=None):
     """
     :return: the given file(s) as a list of tagged
         words and punctuation symbols, encoded as tuples
         ``(word,tag)``.
     :rtype: list(tuple(str,str))
     """
     if tagset and tagset != self._tagset:
         tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
     else:
         tag_mapping_function = None
     return concat(
         [
             TaggedCorpusView(
                 fileid,
                 enc,
                 True,
                 False,
                 False,
                 self._sep,
                 self._word_tokenizer,
                 self._sent_tokenizer,
                 self._para_block_reader,
                 tag_mapping_function,
             )
             for (fileid, enc) in self.abspaths(fileids, True)
         ]
     )
Example no. 10
 def tagged_paras(self, fileids=None, tagset=None):
     """
     :return: the given file(s) as a list of
         paragraphs, each encoded as a list of sentences, which are
         in turn encoded as lists of ``(word,tag)`` tuples.
     :rtype: list(list(list(tuple(str,str))))
     """
     if tagset and tagset != self._tagset:
         tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
     else:
         tag_mapping_function = None
     return concat(
         [
             TaggedCorpusView(
                 fileid,
                 enc,
                 True,
                 True,
                 True,
                 self._sep,
                 self._word_tokenizer,
                 self._sent_tokenizer,
                 self._para_block_reader,
                 tag_mapping_function,
             )
             for (fileid, enc) in self.abspaths(fileids, True)
         ]
     )
Example no. 11
 def _tag(self, sent, tagset=None):
     tagged_sent = [(w, t) for (t, w) in TAGWORD.findall(sent)]
     if tagset and tagset != self._tagset:
         tagged_sent = [
             (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_sent
         ]
     return tagged_sent
Example no. 12
 def _elt_to_tagged_words(self, elt, handler, tagset=None):
     tagged_post = [(self._simplify_username(t.attrib['word']),
                     t.attrib['pos']) for t in elt.findall('t')]
     if tagset and tagset != self._tagset:
         tagged_post = [(w, map_tag(self._tagset, tagset, t))
                        for (w, t) in tagged_post]
     return tagged_post
Example no. 13
 def _tag(self, t, tagset=None):
     tagged_sent = [(w, p)
                    for (p, w) in TAGWORD.findall(self._normalize(t))]
     if tagset and tagset != self._tagset:
         tagged_sent = [(w, map_tag(self._tagset, tagset, p))
                        for (w, p) in tagged_sent]
     return tagged_sent
Example no. 14
    def tag(self, tokens):
        tagged = self.model.tag(tokens)

        if not self.tagmap:
            return tagged

        return [(word, map_tag(self.tagmap, "universal", tag)) for word, tag in tagged]
Example no. 15
def process_review(review):
    tokens = nltk.word_tokenize(review)
    posTagged = pos_tag(tokens)
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in posTagged]
    simplifiedTags = [i for i in simplifiedTags if i[1] != 'ADP' and i[1] != 'DET']

    return simplifiedTags
Example no. 16
def make_data():
    data_conll = list()
    tags = list()
    f = codecs.open('../datasets/data1/testdata_conll.txt', 'r', 'utf-8')
    for s in f:
        s = s.rstrip()
        if s == '':
            data_conll.append(tags)
            tags = list()
        else:
            tags.append(s)
    f.close()

    data = list()
    for sent in data_conll:
        newsent = list()
        for tok in sent:
            toklist = tok.split('\t')
            pos = map_tag('en-ptb', 'universal', toklist[5])
            if pos != '.':
                word = toklist[3]
                attr = get_attribute(tok)
                newsent.append([word, pos, attr])
        data.append(newsent)
    return data
Example no. 17
    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        words = self._get_column(grid, self._colmap['words'])
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap['tree'])

        treestr = ''
        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
            if word == '(': word = '-LRB-'
            if word == ')': word = '-RRB-'
            if pos_tag == '(': pos_tag = '-LRB-'
            if pos_tag == ')': pos_tag = '-RRB-'
            (left, right) = parse_tag.split('*')
            right = right.count(')')*')' # only keep ')'.
            treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
        try:
            tree = self._tree_class.fromstring(treestr)
        except (ValueError, IndexError):
            tree = self._tree_class.fromstring('(%s %s)' %
                                          (self._root_label, treestr))

        if not pos_in_tree:
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (isinstance(child, Tree) and len(child)==1 and
                        isinstance(child[0], string_types)):
                        subtree[i] = (child[0], child.label())

        return tree
Example no. 18
 def _tag(self, t, tagset=None):
     tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
     if tagset and tagset != self._tagset:
         tagged_sent = [
             (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
         ]
     return tagged_sent
Example no. 19
    def compute_pos_tag(tokens):

        pos_tagged = nltk.pos_tag(tokens)
        simplified_tags = [map_tag('en-ptb', 'universal', tag) for word, tag in pos_tagged]
        lookup = {
            'VERB': 0,
            'NOUN': 1,
            'PRON': 2,
            'ADJ': 3,
            'ADV': 4,
            'ADP': 5,
            'CONJ': 6,
            'DET': 7,
            'NUM': 8,
            'PRT': 9,
            'X': 10
        }

        vector_output = []
        for word in simplified_tags:
            word_v = numpy.zeros(11)
            if word in lookup:
                word_v[lookup[word]] = 1

            vector_output.append(word_v.tolist())
        return vector_output
Example no. 20
 def _tag(self, sent, tagset=None):
     tagged_sent = [(w, t) for (t, w) in TAGWORD.findall(sent)]
     if tagset and tagset != self._tagset:
         tagged_sent = [
             (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_sent
         ]
     return tagged_sent
Example no. 21
    def madlib(self, text):
        """Take a sentence and madlibify it, returning the result text."""
        token_text = nltk.tokenize.word_tokenize(text)
        tagged_text = pos_tag(token_text)
        simplified = [(word, map_tag("en-ptb", "universal", tag))
                      for word, tag in tagged_text]
        print(simplified)

        new_text = [simplified[0][0], simplified[1][0]]
        for i in range(2, len(simplified)):
            word, pos = simplified[i]
            if (pos in self.allowed_parts and word not in self.skiplist
                    and random.random() <= self.madlib_prob):
                word = self.replace_word(word, pos, simplified[i - 2:i])

            new_text.append(word)

        line = ""
        for word in new_text:
            if is_punctuation(word):
                line += word
            else:
                line += " "
                line += clean_word(word)
        return line.strip()
Example no. 22
def get_features(text):
    words = []
    # Same steps to start as before
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = words + nltk.word_tokenize(sentence)

    # part of speech tag each of the words
    pos = pos_tag(words)
    # It's helpful to simplify the tags NLTK returns by default.
    pos = [map_tag('en-ptb', 'universal', tag) for word, tag in pos]
    # Then, convert the words to lowercase like before
    words = [i.lower() for i in words]
    # Grab the trigrams
    trigrams = nltk.trigrams(words)
    # We need to concatenate the trigrams into a single string to process
    trigrams = ["%s/%s/%s" % (i[0], i[1], i[2]) for i in trigrams]

    bigrams = nltk.bigrams(words)
    bigrams = ["%s/%s" % (i[0], i[1]) for i in bigrams]

    # Get our final dict rolling
    features = words + trigrams + bigrams + pos
    # get our feature dict rolling
    features = dict([(i, True) for i in features])
    return features
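A sketch of how such a boolean feature dict is typically consumed; the labelled texts below are made up for illustration.

import nltk

train_set = [(get_features("I loved this film"), "pos"),
             (get_features("I hated this film"), "neg")]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(get_features("a film I loved")))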
Example no. 23
    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap["tree"])

        treestr = ""
        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
            if word == "(":
                word = "-LRB-"
            if word == ")":
                word = "-RRB-"
            if pos_tag == "(":
                pos_tag = "-LRB-"
            if pos_tag == ")":
                pos_tag = "-RRB-"
            (left, right) = parse_tag.split("*")
            right = right.count(")") * ")"  # only keep ')'.
            treestr += "%s (%s %s) %s" % (left, pos_tag, word, right)
        try:
            tree = self._tree_class.fromstring(treestr)
        except (ValueError, IndexError):
            tree = self._tree_class.fromstring("(%s %s)" %
                                               (self._root_label, treestr))

        if not pos_in_tree:
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (isinstance(child, Tree) and len(child) == 1
                            and isinstance(child[0], str)):
                        subtree[i] = (child[0], child.label())

        return tree
Example no. 24
def getPosTag(word):

    token = word_tokenize(word)
    tagged = nltk.pos_tag(token)
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                      for word, tag in tagged]
    return list(sum(simplifiedTags, ()))
Example no. 25
 def tagged_words(self, fileids=None, tagset=None):
     """
     :return: the given file(s) as a list of tagged
         words and punctuation symbols, encoded as tuples
         ``(word,tag)``.
     :rtype: list(tuple(str,str))
     """
     if tagset and tagset != self._tagset:
         tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
     else:
         tag_mapping_function = None
     return concat([
         TaggedCorpusView(
             fileid,
             enc,
             True,
             False,
             False,
             self._sep,
             self._word_tokenizer,
             self._sent_tokenizer,
             self._para_block_reader,
             tag_mapping_function,
         ) for (fileid, enc) in self.abspaths(fileids, True)
     ])
Example no. 26
    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap["tree"])

        treestr = ""
        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
            if word == "(":
                word = "-LRB-"
            if word == ")":
                word = "-RRB-"
            if pos_tag == "(":
                pos_tag = "-LRB-"
            if pos_tag == ")":
                pos_tag = "-RRB-"
            (left, right) = parse_tag.split("*")
            right = right.count(")") * ")"  # only keep ')'.
            treestr += "%s (%s %s) %s" % (left, pos_tag, word, right)
        try:
            tree = self._tree_class.parse(treestr)
        except (ValueError, IndexError):
            tree = self._tree_class.parse("(%s %s)" % (self._root_label, treestr))

        if not pos_in_tree:
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if isinstance(child, Tree) and len(child) == 1 and isinstance(child[0], compat.string_types):
                        subtree[i] = (child[0], child.label())

        return tree
Example no. 27
    def _get_chunked_words(self, grid, chunk_types, tagset=None):
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap['words'])
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        chunk_tags = self._get_column(grid, self._colmap['chunk'])

        stack = [Tree(self._root_label, [])]

        for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
            if chunk_tag == 'O':
                state, chunk_type = 'O', ''
            else:
                (state, chunk_type) = chunk_tag.split('-')
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = 'O'
            # Treat a mismatching I like a B.
            if state == 'I' and chunk_type != stack[-1].label():
                state = 'B'
            # For B or I: close any open chunks
            if state in 'BO' and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == 'B':
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]
Example no. 28
def count_ADJ(text):
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag) for (word, tag) in tag_word)
    adj = tag_fd.get('ADJ')
    if adj is None:
        adj = 0
    return adj/len(word_list)
Example no. 29
def count_CONJ(text):
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag) for (word, tag) in tag_word)
    conj = tag_fd.get('CONJ')
    if conj is None:
        conj = 0
    return conj/len(word_list)
Example no. 30
def count_X(text):
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag) for (word, tag) in tag_word)
    x = tag_fd.get('X')
    if x is None:
        x = 0
    return x/len(word_list)
Example no. 31
def tag_input(sentence):
    mystr = sentence
    tok = re.sub(r"[^\w]", " ", mystr).split()
    nltk.download('punkt')
    tagged_input = nltk.pos_tag(tok)
    simplified_tags = [(word, map_tag('en-ptb', 'universal', tag))
                       for word, tag in tagged_input]
    return simplified_tags
Example no. 32
def count_PRO(text):
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag) for (word, tag) in tag_word)
    pro = tag_fd.get('PRON')
    if pro is None:
        pro = 0
    return pro/len(word_list)
Example no. 33
 def _tag(self, t, tagset=None):
     tagged_sent = [(int(o), w, p) for (o,p,w) in SORTTAGWRD.findall(self._normalize(t, ordered = True))]
     tagged_sent.sort()
     if tagset and tagset != self._tagset:
         tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (o,w,p) in tagged_sent]
     else:
         tagged_sent = [(w,p) for (o,w,p) in tagged_sent]
     return tagged_sent
Example no. 34
 def _tag(self, t, tagset=None):
     tagged_sent = [(int(o), w, p) for (o,p,w) in SORTTAGWRD.findall(self._normalize(t, ordered = True))]
     tagged_sent.sort()
     if tagset and tagset != self._tagset:
         tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (o,w,p) in tagged_sent]
     else:
         tagged_sent = [(w,p) for (o,w,p) in tagged_sent]
     return tagged_sent
Example no. 35
    def tag(self, tokens):
        tagged = self.model.tag(tokens)

        if not self.tagmap:
            return tagged

        return [(word, map_tag(self.tagmap, 'universal', tag))
                for word, tag in tagged]
Example no. 36
def get_last_word_types(text):
    text = nltk.word_tokenize(text)
    posTagged = pos_tag(text)
    lastword_tag = map_tag("en-ptb", "universal", posTagged[-1][1])

    # known types
    # ['NOUN','VERB','CONJ','PRON','ADP', 'PRT', 'DET']
    return lastword_tag
Example no. 37
def count_DET(text):
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag) for (word, tag) in tag_word)
    det = tag_fd.get('DET')
    if det is None:
        det = 0
    return det/len(word_list)
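Examples no. 28-30, 32 and 37 repeat the same tokenize/tag/normalize pattern; a single parameterized helper would cover all of them (a sketch, not code from any of the projects above):

import nltk
from nltk.tag import map_tag

def count_tag_ratio(text, universal_tag):
    """Fraction of tokens in text whose universal POS tag is universal_tag."""
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag) for (word, tag) in tag_word)
    return tag_fd.get(universal_tag, 0) / len(word_list)

# count_ADJ(text) == count_tag_ratio(text, 'ADJ'), count_DET(text) == count_tag_ratio(text, 'DET'), etc.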
Example no. 38
def get_list_list_pos_from_list_list_tokens(list_list_tokens):
	list_list_pos = []
	for list_tokens in list_list_tokens:
		list_tokens_decoded = [ x.decode('utf8') for x in list_tokens] #pos tagger needs decoded tokens
		list_token_pos_tuple = pos_tag(list_tokens_decoded)
		list_universal_pos_tag = [ map_tag('en-ptb', 'universal', tag).encode('utf8') for word, tag in list_token_pos_tuple]
		list_list_pos.append(list_universal_pos_tag)
	return list_list_pos
Example no. 39
 def tagged_sents(self, fileids=None, tagset=None):
     if tagset and tagset != self._tagset:
         tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
     else:
         tag_mapping_function = None
     return concat([IndianCorpusView(fileid, enc,
                                     True, True, tag_mapping_function)
                    for (fileid, enc) in self.abspaths(fileids, True)])
Example no. 40
 def tagged_sents(self, fileids=None, tagset=None):
     if tagset and tagset != self._tagset:
         tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
     else:
         tag_mapping_function = None
     return concat([IndianCorpusView(fileid, enc,
                                     True, True, tag_mapping_function)
                    for (fileid, enc) in self.abspaths(fileids, True)])
Example no. 41
def predict(input_ques=None):
    global m, TEXT, BATCHSIZE, k, db_cursor, conn
    if input_ques is None:
        r_json = request.json
        input_ques = r_json['data']
    pp_input_ques = preprocess(input_ques)
    x, x_len = TEXT.process([pp_input_ques])
    x = x.cuda()
    top_word_score, top_word_idx = map(lambda x: x.detach(),
                                       m.encoder(x, True))
    sphinx_query = set()
    posTagged = pos_tag(nltk.word_tokenize(input_ques))
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                      for word, tag in posTagged]
    for word, tag in simplifiedTags:
        if (tag == 'VERB' or tag
                == 'NOUN') and word.lower() not in stopwords.words('english'):
            sphinx_query.add(word.lower())
    for i, e in enumerate(top_word_idx.cpu().numpy()[0]):
        if top_word_score[0][i] < 1 / len(pp_input_ques) or e >= len(
                pp_input_ques):
            continue
        sphinx_query.add(pp_input_ques[e].lower())
    sphinx_query = ' '.join(sphinx_query)
    print(sphinx_query)
    sphinx_cands, sphinx_titles, sphinx_questions = sphinx_match(sphinx_query)
    t = datetime.datetime.now()
    tag_cands = get_tags(input_ques)
    t2 = datetime.datetime.now()
    print("tag time:", t2 - t)
    print("tags", tag_cands)
    tag_cands, tag_titles, tag_questions = qid_query(tag_cands, k)
    t3 = datetime.datetime.now()
    print("qid time:", t3 - t2)

    titles = sphinx_titles + tag_titles
    questions = sphinx_questions + tag_questions
    cands = sphinx_cands + tag_cands

    y1, _ = TEXT.process([preprocess(t) for t in titles])
    y2, _ = TEXT.process([preprocess(c) for c in questions])
    y1, y2 = y1.cuda(), y2.cuda()
    d = torchIter(BATCHSIZE, x, y1, y2)
    agm = 0
    overall_score = None
    for i, (bx, by1, by2) in enumerate(d):
        #score_title = m(bx, by1).detach()
        score_content = m(bx, by2).detach()
        score = score_content
        overall_score = score if overall_score is None else torch.cat(
            [overall_score, score], 0)

    overall_score = torch.nn.functional.softmax(overall_score, 1)
    top_scores, top_idxs = map(lambda x: x.cpu().numpy(),
                               overall_score[:, 1].topk(1, 0))
    best_p = questions[top_idxs[0]]
    best_a = cands[top_idxs[0]]
    return top_scores[0]
Example no. 42
def Get_POS_TAG(s1):
    W1 = []
    # print "POS Tagging of : ",s1
    wrds = Preprocess_Sentence(s1)
    # print "After Preprocessing : ",wrds
    Tagged_words1 = pos_tag(wrds)
    # print "Tagged Words are :",Tagged_words1
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in Tagged_words1]
    return simplifiedTags
Example no. 43
def find_keywords(review):
	text = word_tokenize(review.decode("utf8"))
	tagged_review = pos_tag(text)
	simplified_tagged_review = [(word,map_tag('en-ptb','universal',tag)) for word, tag in tagged_review]
	keywords = []
	for word,tag in simplified_tagged_review:
		if isKeywordTag(tag):
			keywords += [word]
	return keywords
Example no. 44
def get_sentiment_score(string, sentiment_dict):
    sentiment = 0
    tokens = tokenize(string)
    parts_of_speech = nltk.pos_tag(tokens)
    simplified_tags = [(word, map_tag('en-ptb', 'universal', tag))
                       for word, tag in parts_of_speech]
    for token in simplified_tags:
        sentiment += lookup_sentiment_score(token[0], token[1], sentiment_dict)
    return sentiment
Example no. 45
 def _get_iob_words(self,
                    grid,
                    tagset=None,
                    column=ConllCorpusReader.CHUNK):
     pos_tags = self._get_column(grid, self._colmap['pos'])
     if tagset and tagset != self._tagset:
         pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
     return list(
         zip(self._get_column(grid, self._colmap['words']), pos_tags,
             self._get_column(grid, self._colmap[column])))
Example no. 46
def process_tag(phrase, target):
    text = nltk.word_tokenize(phrase)
    posTagged = pos_tag(text)
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                      for word, tag in posTagged]
    res = ''
    for (word, tag) in simplifiedTags:
        if tag in target:
            res += word + ' '
    return res.strip()
Example no. 47
def get_tag_sequence(sent):
    """

    :param sent: A sentence to tag
    :return: Tags in order.
    """
    mytok = nltk.word_tokenize(sent)
    tags = nltk.pos_tag(mytok)
    tags = [map_tag('en-ptb', 'universal', tag) for word, tag in tags]
    return tags
Example no. 48
def get_tag_sequence(sent):
    """

    :param sent: A sentence to tag
    :return: Tags in order.
    """
    mytok = nltk.word_tokenize(sent)
    tags = nltk.pos_tag(mytok)
    tags = [map_tag('en-ptb', 'universal', tag) for word, tag in tags]
    return tags
Example no. 49
 def _elt_to_tagged_words(self, elt, handler, tagset=None):
     tagged_post = [
         (self._simplify_username(t.attrib['word']), t.attrib['pos'])
         for t in elt.findall('t')
     ]
     if tagset and tagset != self._tagset:
         tagged_post = [
             (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post
         ]
     return tagged_post
Example no. 50
def pos_tag(sent):
    final = []
    for i in range(len(sent)):
        split = nltk.word_tokenize(sent[i])
        pos = nltk.pos_tag(split)
        sim_pos = [(word, map_tag('en-ptb', 'universal', tag))
                   for word, tag in pos]  #uses simpler universal tags
        pos_arr = [i[1] for i in sim_pos]
        final.append(pos_arr)
    return final
Example no. 51
def add_paragraph_no_punctuation(paragraph, lexicon):
    """
    Will take a in a paragraph, and place each sentence into the lexicon.
    Removes punctuation from the string. See add_sentence.
    NOTE: Contractions are an issue with this.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized = tokenizer.tokenize(paragraph)
    tags = nltk.pos_tag(tokenized)
    tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in tags]
    place_tagged_in_lexicon(tags, lexicon)
Example no. 52
 def _parse_utterance(self, utterance, include_tag, tagset=None):
     m = self._UTTERANCE_RE.match(utterance)
     if m is None:
         raise ValueError('Bad utterance %r' % utterance)
     speaker, id, text = m.groups()
     words = [str2tuple(s, self._SEP) for s in text.split()]
     if not include_tag:
         words = [w for (w, t) in words]
     elif tagset and tagset != self._tagset:
         words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
     return SwitchboardTurn(words, speaker, id)
Example no. 53
def add_paragraph_no_punctuation(paragraph, lexicon):
    """
    Will take a in a paragraph, and place each sentence into the lexicon.
    Removes punctuation from the string. See add_sentence.
    NOTE: Contractions are an issue with this.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized= tokenizer.tokenize(paragraph)
    tags = nltk.pos_tag(tokenized)
    tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in tags]
    place_tagged_in_lexicon(tags, lexicon)
Example no. 54
def get_pos_bigrams():
    train_data_file_line_label_tuple_list = get_training_data()
    train_data_all_file_lines_universal_tag_bigram_counter = Counter()
    train_data_file_line_universal_tag_bigram_counter_list = []
    for train_data_file_line_label_tuple in train_data_file_line_label_tuple_list:
        train_data_file_line_tokens = word_tokenize(
            train_data_file_line_label_tuple[0].decode('utf8').lower())
        train_data_file_line_token_tag_tuple_list = pos_tag(
            train_data_file_line_tokens)
        #print train_data_file_line_label_tuple, "\n", [ (word, map_tag('en-ptb', 'universal', tag).encode('utf8')) for word, tag in train_data_file_line_token_tag_tuple_list]
        train_data_file_line_universal_tag_list = [
            map_tag('en-ptb', 'universal', tag).encode('utf8')
            for word, tag in train_data_file_line_token_tag_tuple_list
        ]
        train_data_file_line_universal_tag_bigram_list = list(
            ngrams(train_data_file_line_universal_tag_list, 2))
        #print train_data_file_line_label_tuple, "\n", train_data_file_line_universal_tag_bigram_list
        train_data_file_line_universal_tag_bigram_counter = Counter(
            train_data_file_line_universal_tag_bigram_list)
        train_data_file_line_universal_tag_bigram_counter_list.append(
            train_data_file_line_universal_tag_bigram_counter)
        #print train_data_file_line_universal_tag_bigram_counter.most_common()
        train_data_all_file_lines_universal_tag_bigram_counter = train_data_all_file_lines_universal_tag_bigram_counter + train_data_file_line_universal_tag_bigram_counter

    #print len(train_data_all_file_lines_universal_tag_bigram_counter)

    all_pos_bigrams_list = sorted([
        x[0] for x in
        train_data_all_file_lines_universal_tag_bigram_counter.most_common()
    ])

    train_data_pos_bigram_vector_list = list()

    for train_data_file_line_universal_tag_bigram_counter in train_data_file_line_universal_tag_bigram_counter_list:
        train_data_pos_bigram_vector = list()
        for pos_bigram in all_pos_bigrams_list:
            if train_data_file_line_universal_tag_bigram_counter[pos_bigram]:
                train_data_pos_bigram_vector.append(
                    train_data_file_line_universal_tag_bigram_counter[
                        pos_bigram])
            else:
                train_data_pos_bigram_vector.append(0)
        train_data_pos_bigram_vector_list.append(train_data_pos_bigram_vector)

    outfile1 = "train_data_pos_bigram_vector_10000_most_common"
    fo1 = open(outfile1, 'w')

    for train_data_pos_bigram_vector in train_data_pos_bigram_vector_list:
        for index, train_data_pos_bigram_vector_element in enumerate(
                train_data_pos_bigram_vector):
            fo1.write(str(train_data_pos_bigram_vector_element))
            if (index != len(train_data_pos_bigram_vector) - 1):
                fo1.write(" ")
        fo1.write("\n")
Example no. 55
def save_brown_tagged(window_pos, leftORright):
    # leftORright will tell whether to look on left or right of the word to generate
    # POS tag.
    train_text = open('../data/brownuntagged.txt').readlines()
    tagged_corpra_list = []
    corpra_dict = {}
    word_list = []
    for count, line in enumerate(train_text):
        text = nltk.word_tokenize(line)
        posTagged = pos_tag(text)
        simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in posTagged]
        tagged_corpra_list += simplifiedTags
        word_list += text
        print(count)
    if leftORright == 'left' :
        for count, (word, tag) in enumerate(tagged_corpra_list[window_pos:]):
            if word not in corpra_dict:
                corpra_dict[word] = {}
            joined_tag = ''
            for i in range(window_pos):
                joined_tag += tagged_corpra_list[count+i][1]
            if joined_tag not in corpra_dict[word]:
                corpra_dict[word][joined_tag] = 0
            corpra_dict[word][joined_tag] += 1

    if leftORright == 'right':
         for count, (word, tag) in enumerate(tagged_corpra_list[:-window_pos]):
            if word not in corpra_dict:
                corpra_dict[word] = {}
            joined_tag = ''
            for i in range(window_pos):
                joined_tag += tagged_corpra_list[count+i+1][1]
            if joined_tag not in corpra_dict[word]:
                corpra_dict[word][joined_tag] = 0
            corpra_dict[word][joined_tag] += 1

    if leftORright == 'both':
         for count, (word, tag) in enumerate(tagged_corpra_list[window_pos:-window_pos]):
            if word not in corpra_dict:
                corpra_dict[word] = {}
            joined_tag = ''
            for i in range(window_pos):
                joined_tag += tagged_corpra_list[count+i][1]
            temp_count = count+window_pos+1
            for i in range(window_pos):
                joined_tag += tagged_corpra_list[temp_count+i][1]
            if joined_tag not in corpra_dict[word]:
                corpra_dict[word][joined_tag] = 0
            corpra_dict[word][joined_tag] += 1

    pickle_file = open('../data/brown_tagged_win' + str(window_pos) + '_' + leftORright + '.save', 'wb')
    pickle.dump(corpra_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    pickle_file.close()
Example no. 56
def get_word_count_dict(sentence):
    wcount_dict = Counter()

    for (word,tag) in \
        [(w, map_tag('en-ptb','universal',t))
         for (w,t) in nltk.pos_tag(nltk.word_tokenize(preprocess(sentence)))]:
        if tag in wntags.keys():
            wcount_dict.update(wn.synsets(word,wntags[tag]))
            wcount_dict.update([word])
    tot_len = sum(wcount_dict.values())
    if tot_len == 0:
        print(sentence)
    return(wcount_dict, tot_len)
Example no. 57
def add_sentence(sent, lexicon):
    """
    Takes in a sentence and adds it to the lexicon.
    Example:
        Input:
            "These are a bunch of words to be processed"
        Tagged:
            {'VERB': ['are', 'be', 'processed'], 'ADP': ['of'], 'NOUN': ['bunch', 'words'], 'DET': ['these', 'a'], 'PRT': ['to']}
    :param sent: A sentence to be processed, tagged, and placed in the lexicon
    """
    mytok = nltk.word_tokenize(sent)
    tags = nltk.pos_tag(mytok)
    tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in tags]
    place_tagged_in_lexicon(tags, lexicon)
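place_tagged_in_lexicon is not shown in these snippets; judging from the docstring's example output, a minimal hypothetical version would simply group words by their universal tag:

def place_tagged_in_lexicon(tags, lexicon):
    # Hypothetical implementation: lexicon maps universal POS tag -> list of words.
    for word, tag in tags:
        lexicon.setdefault(tag, []).append(word.lower())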
Example no. 58
def noun_verb(one_day_news_list):
    dict_sentence_noun_verb = dict()
    string = titles_string(titles_no_stopwords(dataprep(one_day_news_list)))
    for i in range(len(dataprep(one_day_news_list))):
        list_sentence_noun_verb = []
        text = nltk.word_tokenize(string[i])
        posTagged = pos_tag(text)
        simplifiedTags = [(word, map_tag("en-brown", "universal", tag)) for word, tag in posTagged]
        for (w, t) in simplifiedTags:
            if t.startswith("N"):
                list_sentence_noun_verb.append(w)
            elif t.startswith("V"):
                list_sentence_noun_verb.append(w)
        dict_sentence_noun_verb[i] = list_sentence_noun_verb
    return dict_sentence_noun_verb
Example no. 59
def printNounCounts(inputfileName,outputfileName):
	file_content = open(inputfileName,"r")
	tokens=[]
	for line in file_content.readlines():
		tokens += nltk.word_tokenize(line.lower())
	tokens = list(filter(lambda a: a != '@' and a != ')', tokens))
	posTagged=nltk.pos_tag(tokens)
	NounTags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in posTagged if tag=="NN" or tag =="NNP" or tag=="NNS"]
	nounCountList=[]
	outputFile = open(outputfileName,"w")
	for noun in NounTags:
		nounCountList += [(tokens.count(noun[0]),noun[0])]
	nounCountList  = sorted(list(set(nounCountList)))[::-1]
	for nounindex in range(len(nounCountList)):
		outputFile.write(nounCountList[nounindex][1] + " " + str(nounCountList[nounindex][0]) + "\n")