def get_sentiment_count_data(train, test):
    def emotion_features(product_id):
        # Map the product's PTB tag counts onto the universal tagset.
        dist = nltk.FreqDist(products[product_id]['all_pos'].split())
        new_dist = Counter()
        for tag, count in dist.items():
            new_dist[map_tag('en-ptb', 'universal', tag)] += count
        # Formality score over tag frequencies; interjections are read from
        # the PTB counts ('UH') since the universal tagset folds them into 'X'.
        Fscore = 0.5 * ((new_dist['NOUN'] + new_dist['ADJ'] + new_dist['ADP'] + new_dist['DET'])
                        - (dist['UH'] + new_dist['VERB'] + new_dist['ADV'] + new_dist['PRON'])
                        + 100)
        neg_count = 0
        pos_count = 0
        suma = 0
        emotion_words = 0
        for review in products[product_id]['reviews']:
            for feature, adjective, score in review['opinions']:
                if score is not None:
                    if score < 0:
                        neg_count += 1
                    else:
                        pos_count += 1
                    suma += score
                    emotion_words += 1
        nwords = len(products[product_id]['all_text'].split())
        eRatio = emotion_words * 1.0 / nwords
        posToAllRatio = pos_count * 1.0 / (pos_count + neg_count)
        return {'Fscore': Fscore,
                'eStrength': suma * 1.0 / emotion_words,
                'eRatio': eRatio,
                'posToAllRatio': posToAllRatio}

    sent_count_test = [emotion_features(product_id) for product_id in test]
    sent_count_train = [emotion_features(product_id) for product_id in train]
    v = DictVectorizer(sparse=False)
    X_sent_train = v.fit_transform(sent_count_train)
    X_sent_test = v.transform(sent_count_test)
    scaler = preprocessing.StandardScaler().fit(X_sent_train)
    X_train = scaler.transform(X_sent_train)
    X_test = scaler.transform(X_sent_test)
    return sent_count_train, sent_count_test, X_train, X_test
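# Note: the 'Fscore' above appears to follow the Heylighen-Dewaele formality
# measure, F = 0.5 * ((noun + adjective + adposition + article)
#                     - (pronoun + verb + adverb + interjection) + 100).
# A minimal, self-contained sketch of just the tag-mapping step (toy input):
from collections import Counter
import nltk
from nltk.tag.mapping import map_tag

ptb_dist = nltk.FreqDist(['NN', 'NNS', 'JJ', 'UH'])  # toy PTB tag counts
universal_dist = Counter()
for ptb_tag, n in ptb_dist.items():
    universal_dist[map_tag('en-ptb', 'universal', ptb_tag)] += n
# universal_dist == Counter({'NOUN': 2, 'ADJ': 1, 'X': 1}); 'UH' folds into 'X'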
def _get_iob_words(self, grid, tagset=None):
    pos_tags = self._get_column(grid, self._colmap['pos'])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    return list(
        zip(self._get_column(grid, self._colmap['words']),
            pos_tags,
            self._get_column(grid, self._colmap['chunk'])))
def _get_chunked_words(self, grid, chunk_types, tagset=None):
    # n.b.: this method is very similar to conllstr2tree.
    words = self._get_column(grid, self._colmap['words'])
    pos_tags = self._get_column(grid, self._colmap['pos'])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    chunk_tags = self._get_column(grid, self._colmap['chunk'])

    stack = [Tree(self._root_label, [])]

    for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
        if chunk_tag == 'O':
            state, chunk_type = 'O', ''
        else:
            (state, chunk_type) = chunk_tag.split('-')

        # If it's a chunk we don't care about, treat it as O.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = 'O'

        # Treat a mismatching I like a B.
        if state == 'I' and chunk_type != stack[-1].label():
            state = 'B'

        # For B or O: close any open chunks.
        if state in 'BO' and len(stack) == 2:
            stack.pop()

        # For B: start a new chunk.
        if state == 'B':
            new_chunk = Tree(chunk_type, [])
            stack[-1].append(new_chunk)
            stack.append(new_chunk)

        # Add the word token.
        stack[-1].append((word, pos_tag))

    return stack[0]
def get_phrase_type(phrase):
    tagged_phrase = nltk.pos_tag(phrase)
    tagged_phrase = [(word, map_tag('en-ptb', 'universal', tag))
                     for word, tag in tagged_phrase]
    result = chunk_parser.parse(tagged_phrase)
    # str(result[0]) renders as '(NP ...', so [1:3] slices out a
    # two-character chunk label such as 'NP'.
    phrase_type = str(result[0])[1:3]
    return phrase_type
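# A usage sketch for get_phrase_type, assuming chunk_parser is an
# nltk.RegexpParser over universal tags (this grammar is hypothetical,
# not from the original source):
import nltk

chunk_parser = nltk.RegexpParser('NP: {<DET>?<ADJ>*<NOUN>+}')
# get_phrase_type(['the', 'quick', 'fox'])  ->  'NP'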
def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
    words = self._get_column(grid, self._colmap['words'])
    pos_tags = self._get_column(grid, self._colmap['pos'])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    parse_tags = self._get_column(grid, self._colmap['tree'])

    treestr = ''
    for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
        if word == '(': word = '-LRB-'
        if word == ')': word = '-RRB-'
        if pos_tag == '(': pos_tag = '-LRB-'
        if pos_tag == ')': pos_tag = '-RRB-'
        (left, right) = parse_tag.split('*')
        right = right.count(')') * ')'  # only keep ')'.
        treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
    try:
        tree = self._tree_class.parse(treestr)
    except (ValueError, IndexError):
        tree = self._tree_class.parse('(%s %s)' % (self._root_label, treestr))

    if not pos_in_tree:
        for subtree in tree.subtrees():
            for i, child in enumerate(subtree):
                if (isinstance(child, Tree) and len(child) == 1 and
                        isinstance(child[0], compat.string_types)):
                    subtree[i] = (child[0], child.label())

    return tree
def tagged_paras(self, fileids=None, tagset=None):
    """
    :return: the given file(s) as a list of paragraphs, each
        encoded as a list of sentences, which are in turn encoded
        as lists of ``(word,tag)`` tuples.
    :rtype: list(list(list(tuple(str,str))))
    """
    if tagset and tagset != self._tagset:
        tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
    else:
        tag_mapping_function = None
    return concat([
        TaggedCorpusView(fileid, enc, True, True, True,
                         self._sep, self._word_tokenizer,
                         self._sent_tokenizer, self._para_block_reader,
                         tag_mapping_function)
        for (fileid, enc) in self.abspaths(fileids, True)
    ])
def getPhraseTranslation1():
    phrase = request.form['phrase']
    language = request.form['language']
    translatedPhrase = get_translation_free(phrase, language)
    res_english = pos_tagger_english.tag(word_tokenize(phrase))
    simplified_pos_tags_english = [(word, map_tag('en-ptb', 'universal', tag))
                                   for word, tag in res_english]
    simplified_pos_tags_translated = []
    if language == "fr":
        res_french = pos_tagger_french.tag(word_tokenize(translatedPhrase))
        print(res_french, res_english)
        simplified_pos_tags_translated = map_french_tag_to_universal(res_french)
    elif language == "es":
        res_spanish = pos_tagger_spanish.tag(word_tokenize(translatedPhrase))
        simplified_pos_tags_translated = map_spanish_tag_to_universal(res_spanish)
    taggedPhrase = ['_'.join(str(i) for i in tup)
                    for tup in simplified_pos_tags_english]
    taggedTranslatedPhrase = ['_'.join(str(i) for i in tup)
                              for tup in simplified_pos_tags_translated]
    taggedPhrase.append("NEWLINE")
    taggedPhrase = taggedPhrase + taggedTranslatedPhrase
    data = {"taggedText": taggedPhrase}
    print(data)
    return jsonify(data)
def tagged_words(self, fileids=None, tagset=None):
    """
    :return: the given file(s) as a list of tagged words and
        punctuation symbols, encoded as tuples ``(word,tag)``.
    :rtype: list(tuple(str,str))
    """
    if tagset and tagset != self._tagset:
        tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
    else:
        tag_mapping_function = None
    return concat([
        TaggedCorpusView(fileid, enc, True, False, False,
                         self._sep, self._word_tokenizer,
                         self._sent_tokenizer, self._para_block_reader,
                         tag_mapping_function)
        for (fileid, enc) in self.abspaths(fileids, True)
    ])
def _tag(self, sent, tagset=None):
    tagged_sent = [(w, t) for (t, w) in TAGWORD.findall(sent)]
    if tagset and tagset != self._tagset:
        tagged_sent = [(w, map_tag(self._tagset, tagset, t))
                       for (w, t) in tagged_sent]
    return tagged_sent
def _elt_to_tagged_words(self, elt, handler, tagset=None):
    tagged_post = [(self._simplify_username(t.attrib['word']), t.attrib['pos'])
                   for t in elt.findall('t')]
    if tagset and tagset != self._tagset:
        tagged_post = [(w, map_tag(self._tagset, tagset, t))
                       for (w, t) in tagged_post]
    return tagged_post
def _tag(self, t, tagset=None):
    tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
    if tagset and tagset != self._tagset:
        tagged_sent = [(w, map_tag(self._tagset, tagset, p))
                       for (w, p) in tagged_sent]
    return tagged_sent
def tag(self, tokens):
    tagged = self.model.tag(tokens)
    if not self.tagmap:
        return tagged
    return [(word, map_tag(self.tagmap, "universal", tag))
            for word, tag in tagged]
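# map_tag at a glance: it translates a single tag between tagsets. With
# NLTK's shipped en-ptb -> universal mapping:
from nltk.tag.mapping import map_tag

map_tag('en-ptb', 'universal', 'NNS')  # -> 'NOUN'
map_tag('en-ptb', 'universal', 'VBD')  # -> 'VERB'
map_tag('en-ptb', 'universal', 'JJ')   # -> 'ADJ'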
def process_review(review):
    tokens = nltk.word_tokenize(review)
    posTagged = pos_tag(tokens)
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                      for word, tag in posTagged]
    simplifiedTags = [i for i in simplifiedTags
                      if i[1] != 'ADP' and i[1] != 'DET']
    return simplifiedTags
def make_data():
    data_conll = list()
    tags = list()
    f = codecs.open('../datasets/data1/testdata_conll.txt', 'r', 'utf-8')
    for s in f:
        s = s.rstrip()
        if s == '':
            data_conll.append(tags)
            tags = list()
        else:
            tags.append(s)
    f.close()
    data = list()
    for sent in data_conll:
        newsent = list()
        for tok in sent:
            toklist = tok.split('\t')
            pos = map_tag('en-ptb', 'universal', toklist[5])
            if pos != '.':
                word = toklist[3]
                attr = get_attribute(tok)
                newsent.append([word, pos, attr])
        data.append(newsent)
    return data
def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
    words = self._get_column(grid, self._colmap['words'])
    pos_tags = self._get_column(grid, self._colmap['pos'])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    parse_tags = self._get_column(grid, self._colmap['tree'])

    treestr = ''
    for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
        if word == '(': word = '-LRB-'
        if word == ')': word = '-RRB-'
        if pos_tag == '(': pos_tag = '-LRB-'
        if pos_tag == ')': pos_tag = '-RRB-'
        (left, right) = parse_tag.split('*')
        right = right.count(')') * ')'  # only keep ')'.
        treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
    try:
        tree = self._tree_class.fromstring(treestr)
    except (ValueError, IndexError):
        tree = self._tree_class.fromstring('(%s %s)' % (self._root_label, treestr))

    if not pos_in_tree:
        for subtree in tree.subtrees():
            for i, child in enumerate(subtree):
                if (isinstance(child, Tree) and len(child) == 1 and
                        isinstance(child[0], string_types)):
                    subtree[i] = (child[0], child.label())

    return tree
def compute_pos_tag(tokens):
    pos_tagged = nltk.pos_tag(tokens)
    simplified_tags = [map_tag('en-ptb', 'universal', tag)
                       for word, tag in pos_tagged]
    # The universal '.' (punctuation) tag is absent from lookup, so
    # punctuation tokens get the all-zeros vector.
    lookup = {
        'VERB': 0, 'NOUN': 1, 'PRON': 2, 'ADJ': 3, 'ADV': 4, 'ADP': 5,
        'CONJ': 6, 'DET': 7, 'NUM': 8, 'PRT': 9, 'X': 10
    }
    vector_output = []
    for word in simplified_tags:
        word_v = numpy.zeros(11)
        if word in lookup:
            word_v[lookup[word]] = 1
        vector_output.append(word_v.tolist())
    return vector_output
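# Usage sketch for compute_pos_tag above (exact tags depend on the tagger
# model, but for a noun + verb input the expected output is two 11-dim
# one-hot vectors):
compute_pos_tag(['dogs', 'bark'])
# -> [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],   # NOUN
#     [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]   # VERB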
def madlib(self, text):
    """Take a sentence and madlibify it, returning the result text."""
    token_text = nltk.tokenize.word_tokenize(text)
    tagged_text = pos_tag(token_text)
    simplified = [(word, map_tag("en-ptb", "universal", tag))
                  for word, tag in tagged_text]
    print(simplified)
    new_text = [simplified[0][0], simplified[1][0]]
    for i in range(2, len(simplified)):
        word, pos = simplified[i]
        if (pos in self.allowed_parts and word not in self.skiplist
                and random.random() <= self.madlib_prob):
            word = self.replace_word(word, pos, simplified[i - 2:i])
        new_text.append(word)
    line = ""
    for word in new_text:
        if is_punctuation(word):
            line += word
        else:
            line += " "
            line += clean_word(word)
    return line.strip()
def get_features(text):
    words = []
    # Same steps to start as before
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = words + nltk.word_tokenize(sentence)

    # part of speech tag each of the words
    pos = pos_tag(words)
    # It's helpful to simplify the tags NLTK returns by default.
    pos = [map_tag('en-ptb', 'universal', tag) for word, tag in pos]

    # Then, convert the words to lowercase like before
    words = [i.lower() for i in words]

    # Grab the trigrams
    trigrams = nltk.trigrams(words)
    # We need to concatenate the trigrams into a single string to process
    trigrams = ["%s/%s/%s" % (i[0], i[1], i[2]) for i in trigrams]
    bigrams = nltk.bigrams(words)
    bigrams = ["%s/%s" % (i[0], i[1]) for i in bigrams]

    # Combine everything into the final feature dict, where each
    # feature maps to True.
    features = words + trigrams + bigrams + pos
    features = dict([(i, True) for i in features])
    return features
def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
    words = self._get_column(grid, self._colmap["words"])
    pos_tags = self._get_column(grid, self._colmap["pos"])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    parse_tags = self._get_column(grid, self._colmap["tree"])

    treestr = ""
    for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
        if word == "(": word = "-LRB-"
        if word == ")": word = "-RRB-"
        if pos_tag == "(": pos_tag = "-LRB-"
        if pos_tag == ")": pos_tag = "-RRB-"
        (left, right) = parse_tag.split("*")
        right = right.count(")") * ")"  # only keep ')'.
        treestr += "%s (%s %s) %s" % (left, pos_tag, word, right)
    try:
        tree = self._tree_class.fromstring(treestr)
    except (ValueError, IndexError):
        tree = self._tree_class.fromstring("(%s %s)" % (self._root_label, treestr))

    if not pos_in_tree:
        for subtree in tree.subtrees():
            for i, child in enumerate(subtree):
                if (isinstance(child, Tree) and len(child) == 1 and
                        isinstance(child[0], str)):
                    subtree[i] = (child[0], child.label())

    return tree
def getPosTag(word):
    token = word_tokenize(word)
    tagged = nltk.pos_tag(token)
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                      for word, tag in tagged]
    # Flatten [(word, tag), ...] into [word, tag, word, tag, ...].
    return list(sum(simplifiedTags, ()))
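# Note: sum(pairs, ()) flattens tuples but is quadratic in the number of
# pairs; itertools.chain does the same flattening in linear time:
from itertools import chain

list(chain.from_iterable([('dogs', 'NOUN'), ('bark', 'VERB')]))
# -> ['dogs', 'NOUN', 'bark', 'VERB']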
def count_ADJ(text):
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag)
                           for (word, tag) in tag_word)
    adj = tag_fd.get('ADJ')
    if adj is None:
        adj = 0
    return adj / len(word_list)
def count_CONJ(text):
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag)
                           for (word, tag) in tag_word)
    conj = tag_fd.get('CONJ')
    if conj is None:
        conj = 0
    return conj / len(word_list)
def count_X(text):
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag)
                           for (word, tag) in tag_word)
    x = tag_fd.get('X')
    if x is None:
        x = 0
    return x / len(word_list)
def tag_input(sentence):
    mystr = sentence
    tok = re.sub(r"[^\w]", " ", mystr).split()
    nltk.download('punkt')  # one-time download; wasteful to repeat on every call
    tagged_input = nltk.pos_tag(tok)
    simplified_tags = [(word, map_tag('en-ptb', 'universal', tag))
                       for word, tag in tagged_input]
    return simplified_tags
def count_PRO(text):
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag)
                           for (word, tag) in tag_word)
    pro = tag_fd.get('PRON')
    if pro is None:
        pro = 0
    return pro / len(word_list)
def _tag(self, t, tagset=None):
    tagged_sent = [(int(o), w, p)
                   for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))]
    tagged_sent.sort()
    if tagset and tagset != self._tagset:
        tagged_sent = [(w, map_tag(self._tagset, tagset, p))
                       for (o, w, p) in tagged_sent]
    else:
        tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
    return tagged_sent
def get_last_word_types(text):
    text = nltk.word_tokenize(text)
    posTagged = pos_tag(text)
    lastword_tag = map_tag("en-ptb", "universal", posTagged[-1][1])
    # known types
    # ['NOUN', 'VERB', 'CONJ', 'PRON', 'ADP', 'PRT', 'DET']
    return lastword_tag
def count_DET(text):
    word_list = nltk.word_tokenize(text)
    tag_word = nltk.pos_tag(word_list)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag)
                           for (word, tag) in tag_word)
    det = tag_fd.get('DET')
    if det is None:
        det = 0
    return det / len(word_list)
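# count_ADJ, count_CONJ, count_X, count_PRO and count_DET all repeat the
# same pattern; a hypothetical generalization (not in the original source):
import nltk
from nltk.tag.mapping import map_tag

def count_tag_ratio(text, universal_tag):
    word_list = nltk.word_tokenize(text)
    tag_fd = nltk.FreqDist(map_tag('en-ptb', 'universal', tag)
                           for (word, tag) in nltk.pos_tag(word_list))
    return tag_fd.get(universal_tag, 0) / len(word_list)

# count_tag_ratio(text, 'ADJ') reproduces count_ADJ(text), and so on.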
def get_list_list_pos_from_list_list_tokens(list_list_tokens):
    list_list_pos = []
    for list_tokens in list_list_tokens:
        # pos tagger needs decoded tokens
        list_tokens_decoded = [x.decode('utf8') for x in list_tokens]
        list_token_pos_tuple = pos_tag(list_tokens_decoded)
        list_universal_pos_tag = [map_tag('en-ptb', 'universal', tag).encode('utf8')
                                  for word, tag in list_token_pos_tuple]
        list_list_pos.append(list_universal_pos_tag)
    return list_list_pos
def tagged_sents(self, fileids=None, tagset=None):
    if tagset and tagset != self._tagset:
        tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t)
    else:
        tag_mapping_function = None
    return concat([IndianCorpusView(fileid, enc, True, True, tag_mapping_function)
                   for (fileid, enc) in self.abspaths(fileids, True)])
def predict(input_ques=None):
    global m, TEXT, BATCHSIZE, k, db_cursor, conn
    if input_ques is None:
        r_json = request.json
        input_ques = r_json['data']
    pp_input_ques = preprocess(input_ques)
    x, x_len = TEXT.process([pp_input_ques])
    x = x.cuda()
    top_word_score, top_word_idx = map(lambda x: x.detach(), m.encoder(x, True))

    sphinx_query = set()
    posTagged = pos_tag(nltk.word_tokenize(input_ques))
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                      for word, tag in posTagged]
    for word, tag in simplifiedTags:
        if (tag == 'VERB' or tag == 'NOUN') and word.lower() not in stopwords.words('english'):
            sphinx_query.add(word.lower())
    for i, e in enumerate(top_word_idx.cpu().numpy()[0]):
        if top_word_score[0][i] < 1 / len(pp_input_ques) or e >= len(pp_input_ques):
            continue
        sphinx_query.add(pp_input_ques[e].lower())
    sphinx_query = ' '.join(sphinx_query)
    print(sphinx_query)

    sphinx_cands, sphinx_titles, sphinx_questions = sphinx_match(sphinx_query)
    t = datetime.datetime.now()
    tag_cands = get_tags(input_ques)
    t2 = datetime.datetime.now()
    print("tag time:", t2 - t)
    print("tags", tag_cands)
    tag_cands, tag_titles, tag_questions = qid_query(tag_cands, k)
    t3 = datetime.datetime.now()
    print("qid time:", t3 - t2)

    titles = sphinx_titles + tag_titles
    questions = sphinx_questions + tag_questions
    cands = sphinx_cands + tag_cands
    y1, _ = TEXT.process([preprocess(t) for t in titles])
    y2, _ = TEXT.process([preprocess(c) for c in questions])
    y1, y2 = y1.cuda(), y2.cuda()
    d = torchIter(BATCHSIZE, x, y1, y2)
    agm = 0
    overall_score = None
    for i, (bx, by1, by2) in enumerate(d):
        # score_title = m(bx, by1).detach()
        score_content = m(bx, by2).detach()
        score = score_content
        overall_score = score if overall_score is None else torch.cat([overall_score, score], 0)
    overall_score = torch.nn.functional.softmax(overall_score, 1)
    top_scores, top_idxs = map(lambda x: x.cpu().numpy(), overall_score[:, 1].topk(1, 0))
    best_p = questions[top_idxs[0]]
    best_a = cands[top_idxs[0]]
    return top_scores[0]
def Get_POS_TAG(s1):
    # print "POS Tagging of : ", s1
    wrds = Preprocess_Sentence(s1)
    # print "After Preprocessing : ", wrds
    Tagged_words1 = pos_tag(wrds)
    # print "Tagged Words are :", Tagged_words1
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                      for word, tag in Tagged_words1]
    return simplifiedTags
def find_keywords(review):
    text = word_tokenize(review.decode("utf8"))
    tagged_review = pos_tag(text)
    simplified_tagged_review = [(word, map_tag('en-ptb', 'universal', tag))
                                for word, tag in tagged_review]
    keywords = []
    for word, tag in simplified_tagged_review:
        if isKeywordTag(tag):
            keywords += [word]
    return keywords
def get_sentiment_score(string, sentiment_dict):
    sentiment = 0
    tokens = tokenize(string)
    parts_of_speech = nltk.pos_tag(tokens)
    simplified_tags = [(word, map_tag('en-ptb', 'universal', tag))
                       for word, tag in parts_of_speech]
    for token in simplified_tags:
        sentiment += lookup_sentiment_score(token[0], token[1], sentiment_dict)
    return sentiment
def _get_iob_words(self, grid, tagset=None, column=ConllCorpusReader.CHUNK):
    pos_tags = self._get_column(grid, self._colmap['pos'])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    return list(
        zip(self._get_column(grid, self._colmap['words']),
            pos_tags,
            self._get_column(grid, self._colmap[column])))
def process_tag(phrase, target):
    text = nltk.word_tokenize(phrase)
    posTagged = pos_tag(text)
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                      for word, tag in posTagged]
    res = ''
    for (word, tag) in simplifiedTags:
        if tag in target:
            res += word + ' '
    return res.strip()
def get_tag_sequence(sent):
    """
    :param sent: A sentence to tag
    :return: Tags in order.
    """
    mytok = nltk.word_tokenize(sent)
    tags = nltk.pos_tag(mytok)
    tags = [map_tag('en-ptb', 'universal', tag) for word, tag in tags]
    return tags
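# Usage sketch for get_tag_sequence above (exact tags depend on the tagger
# model, but a typical result):
get_tag_sequence("The dog barked.")
# -> ['DET', 'NOUN', 'VERB', '.']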
def pos_tag(sent):
    # n.b. shadows nltk.pos_tag; the qualified name is used inside.
    final = []
    for i in range(len(sent)):
        split = nltk.word_tokenize(sent[i])
        pos = nltk.pos_tag(split)
        # uses simpler universal tags
        sim_pos = [(word, map_tag('en-ptb', 'universal', tag))
                   for word, tag in pos]
        pos_arr = [pair[1] for pair in sim_pos]
        final.append(pos_arr)
    return final
def add_paragraph_no_punctuation(paragraph, lexicon):
    """
    Takes in a paragraph and places each sentence into the lexicon.
    Removes punctuation from the string. See add_sentence.
    NOTE: Contractions are an issue with this.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized = tokenizer.tokenize(paragraph)
    tags = nltk.pos_tag(tokenized)
    tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in tags]
    place_tagged_in_lexicon(tags, lexicon)
def _parse_utterance(self, utterance, include_tag, tagset=None):
    m = self._UTTERANCE_RE.match(utterance)
    if m is None:
        raise ValueError('Bad utterance %r' % utterance)
    speaker, id, text = m.groups()
    words = [str2tuple(s, self._SEP) for s in text.split()]
    if not include_tag:
        words = [w for (w, t) in words]
    elif tagset and tagset != self._tagset:
        words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
    return SwitchboardTurn(words, speaker, id)
def get_pos_bigrams():
    train_data_file_line_label_tuple_list = get_training_data()
    train_data_all_file_lines_universal_tag_bigram_counter = Counter()
    train_data_file_line_universal_tag_bigram_counter_list = []
    for train_data_file_line_label_tuple in train_data_file_line_label_tuple_list:
        train_data_file_line_tokens = word_tokenize(
            train_data_file_line_label_tuple[0].decode('utf8').lower())
        train_data_file_line_token_tag_tuple_list = pos_tag(train_data_file_line_tokens)
        train_data_file_line_universal_tag_list = [
            map_tag('en-ptb', 'universal', tag).encode('utf8')
            for word, tag in train_data_file_line_token_tag_tuple_list
        ]
        train_data_file_line_universal_tag_bigram_list = list(
            ngrams(train_data_file_line_universal_tag_list, 2))
        train_data_file_line_universal_tag_bigram_counter = Counter(
            train_data_file_line_universal_tag_bigram_list)
        train_data_file_line_universal_tag_bigram_counter_list.append(
            train_data_file_line_universal_tag_bigram_counter)
        train_data_all_file_lines_universal_tag_bigram_counter = (
            train_data_all_file_lines_universal_tag_bigram_counter
            + train_data_file_line_universal_tag_bigram_counter)
    all_pos_bigrams_list = sorted(
        x[0] for x in train_data_all_file_lines_universal_tag_bigram_counter.most_common())
    train_data_pos_bigram_vector_list = list()
    for train_data_file_line_universal_tag_bigram_counter in \
            train_data_file_line_universal_tag_bigram_counter_list:
        train_data_pos_bigram_vector = list()
        for pos_bigram in all_pos_bigrams_list:
            if train_data_file_line_universal_tag_bigram_counter[pos_bigram]:
                train_data_pos_bigram_vector.append(
                    train_data_file_line_universal_tag_bigram_counter[pos_bigram])
            else:
                train_data_pos_bigram_vector.append(0)
        train_data_pos_bigram_vector_list.append(train_data_pos_bigram_vector)
    outfile1 = "train_data_pos_bigram_vector_10000_most_common"
    fo1 = open(outfile1, 'w')
    for train_data_pos_bigram_vector in train_data_pos_bigram_vector_list:
        for index, train_data_pos_bigram_vector_element in enumerate(train_data_pos_bigram_vector):
            fo1.write(str(train_data_pos_bigram_vector_element))
            if index != len(train_data_pos_bigram_vector) - 1:
                fo1.write(" ")
        fo1.write("\n")
    fo1.close()
def save_brown_tagged(window_pos, leftORright):
    # leftORright tells whether to look on the left or right of the word
    # when generating the POS-tag context.
    train_text = open('../data/brownuntagged.txt').readlines()
    tagged_corpra_list = []
    corpra_dict = {}
    word_list = []
    for count, line in enumerate(train_text):
        text = nltk.word_tokenize(line)
        posTagged = pos_tag(text)
        simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                          for word, tag in posTagged]
        tagged_corpra_list += simplifiedTags
        word_list += text
        print(count)
    if leftORright == 'left':
        for count, (word, tag) in enumerate(tagged_corpra_list[window_pos:]):
            if word not in corpra_dict:
                corpra_dict[word] = {}
            joined_tag = ''
            for i in range(window_pos):
                joined_tag += tagged_corpra_list[count + i][1]
            if joined_tag not in corpra_dict[word]:
                corpra_dict[word][joined_tag] = 0
            corpra_dict[word][joined_tag] += 1
    if leftORright == 'right':
        for count, (word, tag) in enumerate(tagged_corpra_list[:-window_pos]):
            if word not in corpra_dict:
                corpra_dict[word] = {}
            joined_tag = ''
            for i in range(window_pos):
                joined_tag += tagged_corpra_list[count + i + 1][1]
            if joined_tag not in corpra_dict[word]:
                corpra_dict[word][joined_tag] = 0
            corpra_dict[word][joined_tag] += 1
    if leftORright == 'both':
        for count, (word, tag) in enumerate(tagged_corpra_list[window_pos:-window_pos]):
            if word not in corpra_dict:
                corpra_dict[word] = {}
            joined_tag = ''
            for i in range(window_pos):
                joined_tag += tagged_corpra_list[count + i][1]
            temp_count = count + window_pos + 1
            for i in range(window_pos):
                joined_tag += tagged_corpra_list[temp_count + i][1]
            if joined_tag not in corpra_dict[word]:
                corpra_dict[word][joined_tag] = 0
            corpra_dict[word][joined_tag] += 1
    pickle_file = open('../data/brown_tagged_win' + str(window_pos) + '_' + leftORright + '.save', 'wb')
    pickle.dump(corpra_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    pickle_file.close()
def get_word_count_dict(sentence):
    wcount_dict = Counter()
    tagged = [(w, map_tag('en-ptb', 'universal', t))
              for (w, t) in nltk.pos_tag(nltk.word_tokenize(preprocess(sentence)))]
    for (word, tag) in tagged:
        if tag in wntags.keys():
            wcount_dict.update(wn.synsets(word, wntags[tag]))
            wcount_dict.update([word])
    tot_len = sum(wcount_dict.values())
    if tot_len == 0:
        print(sentence)
    return (wcount_dict, tot_len)
def add_sentence(sent, lexicon):
    """
    Takes in a sentence and adds it to the lexicon.

    Example:
    Input:  "These are a bunch of words to be processed"
    Tagged: {'VERB': ['are', 'be', 'processed'], 'ADP': ['of'],
             'NOUN': ['bunch', 'words'], 'DET': ['these', 'a'],
             'PRT': ['to']}

    :param sent: A sentence to be processed, tagged, and placed in the lexicon
    """
    mytok = nltk.word_tokenize(sent)
    tags = nltk.pos_tag(mytok)
    tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in tags]
    place_tagged_in_lexicon(tags, lexicon)
def noun_verb(one_day_news_list):
    dict_sentence_noun_verb = dict()
    prepped = dataprep(one_day_news_list)
    string = titles_string(titles_no_stopwords(prepped))
    for i in range(len(prepped)):
        list_sentence_noun_verb = []
        text = nltk.word_tokenize(string[i])
        posTagged = pos_tag(text)
        simplifiedTags = [(word, map_tag("en-brown", "universal", tag))
                          for word, tag in posTagged]
        for (w, t) in simplifiedTags:
            # After mapping, the universal tags are 'NOUN' and 'VERB';
            # a prefix test like t.startswith("N") would also match 'NUM'.
            if t == "NOUN" or t == "VERB":
                list_sentence_noun_verb.append(w)
        dict_sentence_noun_verb[i] = list_sentence_noun_verb
    return dict_sentence_noun_verb
def printNounCounts(inputfileName, outputfileName):
    file_content = open(inputfileName, "r")
    tokens = []
    for line in file_content.readlines():
        tokens += nltk.word_tokenize(line.lower())
    # list(...) so tokens.count() works under Python 3, where filter()
    # returns an iterator rather than a list.
    tokens = list(filter(lambda a: a != '@' and a != ')', tokens))
    posTagged = nltk.pos_tag(tokens)
    NounTags = [(word, map_tag('en-ptb', 'universal', tag))
                for word, tag in posTagged
                if tag == "NN" or tag == "NNP" or tag == "NNS"]
    nounCountList = []
    outputFile = open(outputfileName, "w")
    for noun in NounTags:
        nounCountList += [(tokens.count(noun[0]), noun[0])]
    nounCountList = sorted(set(nounCountList), reverse=True)
    for nounindex in range(len(nounCountList)):
        outputFile.write(nounCountList[nounindex][1] + " " +
                         str(nounCountList[nounindex][0]) + "\n")
    outputFile.close()