Ejemplo n.º 1
0
def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    """Convert a chunk tree into CoNLL-style ``(word, pos, iob)`` triples.

    ``tree.pos()`` flattens the tree and produces ``[(word, node)]`` where
    ``node`` comes from the word's parent tree node. Words in a chunk
    therefore get the chunk label, while words outside a chunk get the same
    label as the tree's top node.

    Args:
        tree: Object exposing ``pos()`` and ``label()`` as described above.
        tag: Callable that receives the sequence of words and returns
            ``(word, pos_tag)`` pairs. Defaults to ``nltk.tag.pos_tag``.

    Returns:
        A list of ``(word, pos_tag, iob_tag)`` tuples. The list is empty when
        the tree contains no words.

    Note:
        Chunk boundaries are inferred only from runs of equal labels, so two
        directly adjacent chunks with the same label come out as one chunk.
    """
    pairs = tree.pos()
    # Unpacking ``zip(*pairs)`` into two names raises ValueError when there
    # are no pairs, so an empty tree is answered directly.
    if not pairs:
        return []

    words, ents = zip(*pairs)
    # The top-node label marks words outside every chunk; it does not change
    # during the loop, so look it up once.
    outside = tree.label()

    iobs = []
    prev = None
    # construct iob tags from entity names
    for ent in ents:
        if ent == outside:
            # same label as the tree's top node: outside any chunk
            iobs.append('O')
            prev = None
        elif prev == ent:
            # previous word had the same entity: still inside that chunk
            iobs.append('I-%s' % ent)
        else:
            # no previous equal entity: this word begins a new chunk
            iobs.append('B-%s' % ent)
            prev = ent

    # get tags for each word, then construct 3-tuples for conll tags
    words, tags = zip(*tag(words))
    # Materialise the result: on Python 3 a bare zip() is a one-shot iterator
    # that cannot be indexed, measured with len(), or iterated twice.
    return list(zip(words, tags, iobs))
Ejemplo n.º 2
0
def load(textfile="text.txt", dictfile="dict.txt", with_rules=True, use_cache=True):
    """Load the text and dictionary, then translate and tag, or reuse the cache.

    Args:
        textfile: Path handed to ``load_text``.
        dictfile: Path handed to ``load_dict``.
        with_rules: Forwarded to ``translate`` when a fresh pass is run.
        use_cache: When true and both cache files exist, the cached results
            are loaded instead of re-running translation and tagging.
    """
    # Single-argument, parenthesised print calls produce the same output on
    # Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
    print("Loading text...")
    load_text(textfile)
    print("Loading dictionary...")
    load_dict(dictfile)

    # The cache is only usable when both of its files are present.
    cache_exists = os.path.isfile(translated_file) and os.path.isfile(tagged_file)

    if use_cache and cache_exists:
        print("Loading translation and tagged words from cache...")
        load_cache()
        return

    # Caching disabled or cache incomplete: recompute and rewrite the cache.
    print("Doing first translation pass...")
    translate(with_rules)
    print("Tagging...")
    tag()
    clear_cache()
    write_cache()
Ejemplo n.º 3
0
def main():
    """Run the pipeline: extract segments, tag them, pass the result to update().

    ``extract_segments()`` yields a pair ``(ids, segments)``; the segments are
    run through ``tag`` and the ids are handed to ``update`` together with the
    tagged output.
    """
    # NOTE(review): commented-out code that used to sit here required exactly
    # one command-line argument ('usage: python extract_segments.py output')
    # and executed 'alter table irony_commentsegment add column tag text'.
    # Presumably that column must already exist before update() runs — confirm.
    segment_ids, raw_segments = extract_segments()
    update(segment_ids, tag(raw_segments))
Ejemplo n.º 4
0
def main():
    """Extract segments, tag them, and forward ids plus tags to update().

    The first element returned by ``extract_segments()`` is passed through
    unchanged; the second is replaced by the output of ``tag`` before both
    are given to ``update``.
    """
    # NOTE(review): this function previously carried disabled code for an
    # argument-count check ('usage: python extract_segments.py output') and
    # for 'alter table irony_commentsegment add column tag text'. Presumably
    # the `tag` column is expected to exist already — confirm.
    extracted = extract_segments()
    row_ids, texts = extracted
    labelled = tag(texts)
    update(row_ids, labelled)
Ejemplo n.º 5
0
def tokenize_tag_and_chunk(data):
  """Tokenize ``data``, tag the tokens, then chunk each tagged sentence.

  Every item produced by ``tag`` is passed to ``chunk`` together with the
  ``grammar`` object defined outside this function.

  Returns:
    A list holding one ``chunk`` result per tagged sentence, in order.
  """
  tagged_sentences = tag(tokenize(data))
  return [chunk(sentence, grammar) for sentence in tagged_sentences]
Ejemplo n.º 6
0
def _question_role(pos):
    """Map a part-of-speech tag to the coarse role label used by ``main``."""
    if pos in ('WRB', 'WP'):
        return 'Q_WORD'
    # This needs more work, i.e. countries don't work.
    if pos in ('NNP', 'NN', 'NNS'):
        return 'SUBJECT'
    if pos in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'):
        return 'Q_DETAIL'
    return 'NONE'


def main():
    """Interactive loop: read a question, pick out its subject, look it up.

    Each round reads a line from standard input, strips one trailing '?',
    lower-cases it, tags the tokens, and reduces every tag to one of
    'Q_WORD', 'SUBJECT', 'Q_DETAIL' or 'NONE'. The role list and the
    space-split words go to ``getSub``; the subject it returns is passed to
    ``searchWiki``. If ``checkExists`` rejects the result, the user can ask
    for a refined search (``changeFormat``) or start over; otherwise the
    result is printed.
    """
    # There is no exit condition: the prompt repeats until the process is
    # interrupted.
    while True:
        print()
        text = input(
            'Input a question with a Proper pronoun, queston word and question detail'
        )
        if text.endswith('?'):
            text = text[0:-1]

        t = text.lower()
        t_w = t.split(' ')
        tokens = tag(nlp.word_tokenize(t))
        print(tokens)
        # Each token is indexable; position 1 holds its part-of-speech tag.
        tags = [_question_role(token[1]) for token in tokens]
        sub = getSub(tags, t_w)
        print('\nOriginal question :', t_w)
        print('Question Subject :', sub, '\n')
        para = searchWiki(sub)
        # Exact comparison kept on purpose: checkExists' return type is not
        # visible here, and `not checkExists(...)` would also match None.
        if checkExists(para) == False:
            print('This page cannot be found')
            # Keep asking until one of the two valid answers is given.
            while True:
                choice = input(
                    'Do you want me to try and refine your search? (or try again?) [Enter "refine" or "again"]'
                ).lower()
                if choice == 'refine':
                    changeFormat(para, sub)
                    break
                if choice == 'again':
                    break
                print('That is not a valid choice')
        else:
            print(para)
Ejemplo n.º 7
0
def main():
    """Tag a hard-coded passage and pick random words to blank out.

    The passage is tokenised with ``tok`` and tagged with ``tag``. Words whose
    tag appears in a small lookup table (numbers, adjectives, nouns, adverbs)
    are kept, and roughly one in ``sparsity`` of them is sampled at random and
    paired with a '_____' placeholder. Intermediate frames are printed.

    NOTE(review): ``output`` is initialised at the end but never used in the
    visible body; the function looks cut off — confirm against the original.
    """
    text = 'Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in ' \
           'Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great ' \
           'civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are ' \
           'met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final ' \
           'resting place for those who here gave their lives that that nation might live. It is altogether fitting and ' \
           'proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we ' \
           'can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far ' \
           'above our poor power to add or detract. The world will little note, nor long remember what we say here, but ' \
           'it can never forget what they did here. It is for us the living, rather, to be dedicated here to the ' \
           'unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here ' \
           'dedicated to the great task remaining before us -- that from these honored dead we take increased devotion ' \
           'to that cause for which they gave the last full measure of devotion -- that we here highly resolve that ' \
           'these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- ' \
           'and that government of the people, by the people, for the people, shall not perish from the earth.'
    text_tok = tok(text)
    # One row per (word, tag) pair returned by tag().
    text_pos = pd.DataFrame(tag(text_tok), columns=['words', 'pos tags'])
    # Leading 'key' column numbering the rows 0..n-1.
    text_pos.insert(0, 'key', list(range(len(text_pos.index))))
    print(text_pos)

    # Lookup table: the tags eligible for replacement and a readable name for
    # each (the two lists are parallel).
    repl_pos = {
        'pos tags': [
            'CD', 'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR',
            'RBS'
        ],
        'pos names': [
            'number', 'adjective', 'comparative adjective',
            'superlative adjective', 'singular noun', 'plural noun',
            'proper noun', 'plural proper noun', 'adverb',
            'comparative adverb', 'superlative adverb'
        ]
    }
    repl_pos = pd.DataFrame(data=repl_pos)

    print(repl_pos)

    # Inner join: only words whose tag is listed in repl_pos survive, and each
    # gains the matching 'pos names' value.
    text_repl = pd.merge(text_pos, repl_pos, on='pos tags', how='inner')
    text_repl['key'] = text_repl['key'].astype(int)

    print(text_repl)

    # Replace about one in every `sparsity` eligible words.
    sparsity = 7
    repl_num = floor(len(text_repl.index) / sparsity)

    # Random selection of repl_num rows, each given a '_____' placeholder.
    replace = text_repl.sample(n=repl_num)
    new_words = ['_____'] * repl_num
    # Position 4 is right after the four existing columns
    # (key, words, pos tags, pos names).
    replace.insert(4, 'new words', new_words)

    print(replace)

    output = ''
Ejemplo n.º 8
0
def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    """Convert a chunk tree into CoNLL-style ``(word, pos, iob)`` triples.

    ``tree.pos()`` is unpacked into parallel words and labels. A label equal
    to ``tree.label()`` marks a word outside every chunk ('O'); any other
    label opens a chunk ('B-<label>') or, when it repeats the previous
    word's label, continues it ('I-<label>').

    Args:
        tree: Object exposing ``pos()`` (an iterable of ``(word, label)``
            pairs) and ``label()``.
        tag: Callable that receives the sequence of words and returns
            ``(word, pos_tag)`` pairs. Defaults to ``nltk.tag.pos_tag``.

    Returns:
        A list of ``(word, pos_tag, iob_tag)`` tuples. The list is empty when
        the tree contains no words.

    Note:
        Chunk boundaries are inferred only from runs of equal labels, so two
        directly adjacent chunks with the same label come out as one chunk.
    """
    pairs = tree.pos()
    # Unpacking ``zip(*pairs)`` into two names raises ValueError when there
    # are no pairs, so an empty tree is answered directly.
    if not pairs:
        return []

    words, ents = zip(*pairs)
    # Loop-invariant: the label that marks "outside any chunk".
    outside = tree.label()

    iobs = []
    prev = None
    for ent in ents:
        if ent == outside:
            iobs.append('O')
            prev = None
        elif prev == ent:
            iobs.append('I-%s' % ent)
        else:
            iobs.append('B-%s' % ent)
            prev = ent

    words, tags = zip(*tag(words))
    # Materialise the result: on Python 3 a bare zip() is a one-shot iterator
    # that cannot be indexed, measured with len(), or iterated twice.
    return list(zip(words, tags, iobs))
Ejemplo n.º 9
0
def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    """Convert a chunk tree into CoNLL-style ``(word, pos, iob)`` triples.

    ``tree.pos()`` is unpacked into parallel words and labels. A label equal
    to ``tree.label()`` marks a word outside every chunk ('O'); any other
    label opens a chunk ('B-<label>') or, when it repeats the previous
    word's label, continues it ('I-<label>').

    Args:
        tree: Object exposing ``pos()`` (an iterable of ``(word, label)``
            pairs) and ``label()``.
        tag: Callable that receives the sequence of words and returns
            ``(word, pos_tag)`` pairs. Defaults to ``nltk.tag.pos_tag``.

    Returns:
        A list of ``(word, pos_tag, iob_tag)`` tuples. The list is empty when
        the tree contains no words.

    Note:
        Chunk boundaries are inferred only from runs of equal labels, so two
        directly adjacent chunks with the same label come out as one chunk.
    """
    pairs = tree.pos()
    # Unpacking ``zip(*pairs)`` into two names raises ValueError when there
    # are no pairs, so an empty tree is answered directly.
    if not pairs:
        return []

    words, ents = zip(*pairs)
    # Loop-invariant: the label that marks "outside any chunk".
    outside = tree.label()

    iobs = []
    prev = None
    for ent in ents:
        if ent == outside:
            iobs.append('O')
            prev = None
        elif prev == ent:
            iobs.append('I-%s' % ent)
        else:
            iobs.append('B-%s' % ent)
            prev = ent

    words, tags = zip(*tag(words))
    # Materialise the result: on Python 3 a bare zip() is a one-shot iterator
    # that cannot be indexed, measured with len(), or iterated twice.
    return list(zip(words, tags, iobs))