def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    """Convert an IEER tree into CoNLL-style (word, pos, iob) triples.

    tree.pos() flattens the tree into (word, node) pairs where node comes
    from the word's parent node: words inside an entity chunk carry the
    entity label, while words outside any chunk carry the tree's top label.
    """
    words, ents = zip(*tree.pos())
    top_label = tree.label()
    iobs = []
    last_ent = None
    for ent in ents:
        if ent == top_label:
            # Same label as the tree root: the word is outside any chunk.
            iobs.append('O')
            last_ent = None
            continue
        # Continue the current chunk (I-) or begin a new one (B-).
        prefix = 'I' if ent == last_ent else 'B'
        iobs.append('%s-%s' % (prefix, ent))
        last_ent = ent
    # POS-tag the words, then pair each word with its tag and IOB label.
    words, tags = zip(*tag(words))
    return zip(words, tags, iobs)
def load(textfile="text.txt", dictfile="dict.txt", with_rules=True, use_cache=True): print "Loading text..." load_text(textfile) print "Loading dictionary..." load_dict(dictfile) cache_exists = os.path.isfile(translated_file) cache_exists = cache_exists and os.path.isfile(tagged_file) if not use_cache or not cache_exists: print "Doing first translation pass..." translate(with_rules) print "Tagging..." tag() clear_cache() write_cache() else: print "Loading translation and tagged words from cache..." load_cache()
def main():
    """Extract comment segments, POS-tag them, and persist the results."""
    segment_ids, segments = extract_segments()
    update(segment_ids, tag(segments))
def tokenize_tag_and_chunk(data):
    '''Tokenize, tag and then chunk each sentence in data'''
    tagged_sentences = tag(tokenize(data))
    # Chunk every tagged sentence with the module-level grammar.
    return [chunk(sentence, grammar) for sentence in tagged_sentences]
def main():
    """Interactively parse questions, find their subject, and look it up.

    Loops forever: reads a question, maps each token's POS tag onto a
    coarse role (Q_WORD / SUBJECT / Q_DETAIL / NONE), extracts the subject,
    and searches Wikipedia for it; offers a refine/retry menu on failure.
    """
    while True:  # original flag `run` was never cleared, so loop forever
        print()
        text = input(
            'Input a question with a Proper pronoun, question word and question detail'
        )
        # Strip a trailing question mark before normalizing.
        if text.endswith('?'):
            text = text[:-1]
        t = text.lower()
        t_w = t.split(' ')
        tokens = tag(nlp.word_tokenize(t))
        print(tokens)
        # Map each POS tag to one of the coarse question roles.
        tags = []
        for token in tokens:
            pos = token[1]
            if pos in ('WRB', 'WP'):
                role = 'Q_WORD'
            elif pos in ('NNP', 'NN', 'NNS'):
                # This needs more work i.e. countries dont work
                role = 'SUBJECT'
            elif pos in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'):
                role = 'Q_DETAIL'
            else:
                role = 'NONE'
            tags.append(role)
        sub = getSub(tags, t_w)
        print('\nOriginal question :', t_w)
        print('Question Subject :', sub, '\n')
        para = searchWiki(sub)
        if not checkExists(para):
            print('This page cannot be found')
            # Keep prompting until the user picks a valid option.
            while True:
                choice = input(
                    'Do you want me to try and refine your search? (or try again?) [Enter "refine" or "again"]'
                ).lower()
                if choice == 'refine':
                    changeFormat(para, sub)
                    break
                if choice == 'again':
                    break
                print('That is not a valid choice')
        else:
            print(para)
def main():
    """Build a fill-in-the-blank exercise from the Gettysburg Address.

    POS-tags the text, keeps only replaceable word classes (nouns,
    adjectives, adverbs, numbers), and samples roughly 1-in-7 of those
    words to blank out.
    """
    text = (
        'Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in '
        'Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great '
        'civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are '
        'met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final '
        'resting place for those who here gave their lives that that nation might live. It is altogether fitting and '
        'proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we '
        'can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far '
        'above our poor power to add or detract. The world will little note, nor long remember what we say here, but '
        'it can never forget what they did here. It is for us the living, rather, to be dedicated here to the '
        'unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here '
        'dedicated to the great task remaining before us -- that from these honored dead we take increased devotion '
        'to that cause for which they gave the last full measure of devotion -- that we here highly resolve that '
        'these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- '
        'and that government of the people, by the people, for the people, shall not perish from the earth.'
    )
    # Tokenize and POS-tag, keying each token by its position.
    tokens = tok(text)
    text_pos = pd.DataFrame(tag(tokens), columns=['words', 'pos tags'])
    text_pos.insert(0, 'key', list(range(len(text_pos.index))))
    print(text_pos)
    # Lookup table mapping the replaceable POS tags to readable names.
    repl_pos = pd.DataFrame(data={
        'pos tags': [
            'CD', 'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB',
            'RBR', 'RBS'
        ],
        'pos names': [
            'number', 'adjective', 'comparative adjective',
            'superlative adjective', 'singular noun', 'plural noun',
            'proper noun', 'plural proper noun', 'adverb',
            'comparative adverb', 'superlative adverb'
        ],
    })
    print(repl_pos)
    # Inner merge keeps only tokens whose POS tag is replaceable.
    text_repl = pd.merge(text_pos, repl_pos, on='pos tags', how='inner')
    text_repl['key'] = text_repl['key'].astype(int)
    print(text_repl)
    # Blank out roughly one candidate word in every `sparsity`.
    sparsity = 7
    repl_num = floor(len(text_repl.index) / sparsity)
    replace = text_repl.sample(n=repl_num)
    replace.insert(4, 'new words', ['_____'] * repl_num)
    print(replace)
    output = ''
def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    """Flatten an IEER tree into CoNLL-style (word, pos, iob) triples.

    tree.pos() yields (word, node) pairs where node comes from the word's
    parent node, so words inside an entity chunk carry the entity label
    and words outside any chunk carry the tree's top label.

    :param tree: an IEER parse tree providing .pos() and .label()
    :param tag: POS tagger callable taking a word sequence and returning
        (word, tag) pairs; defaults to nltk.tag.pos_tag
    :return: iterator of (word, pos_tag, iob_tag) triples
    """
    words, ents = zip(*tree.pos())
    iobs = []
    prev = None
    for ent in ents:
        # An entity equal to the tree's top label is outside any chunk.
        if ent == tree.label():
            iobs.append('O')
            prev = None
        # Same entity as the previous word: inside the current chunk.
        elif prev == ent:
            iobs.append('I-%s' % ent)
        # A new entity: beginning of a chunk.
        else:
            iobs.append('B-%s' % ent)
            prev = ent
    # POS-tag the words, then pair each with its tag and IOB label.
    words, tags = zip(*tag(words))
    return zip(words, tags, iobs)