Example #1
from nltk.corpus import brown, conll2000


def read_corpus(self):
    """
    Assign training and test corpora.
    """
    length = len(brown.tagged_sents(tagset='universal'))
    self.train_sents = brown.tagged_sents(
        tagset='universal')[0:int(length * 0.9)]
    self.test_sents = brown.tagged_sents(
        tagset='universal')[int(length * 0.9):]
    # Read data from files
    self.conll_train = conll2000.sents(
        'train.txt')  # only needed when building the semantic space
    self.conll_test = conll2000.sents(
        'test.txt')  # only needed when building the semantic space
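
# A minimal follow-up sketch (not part of the original snippet): the same
# 90/10 Brown split can be used to train and score a simple unigram tagger.
import nltk
from nltk.corpus import brown

tagged = brown.tagged_sents(tagset='universal')
split = int(len(tagged) * 0.9)
unigram_tagger = nltk.UnigramTagger(tagged[:split])
print(unigram_tagger.evaluate(tagged[split:]))
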
import nltk
from nltk.corpus import conll2000


def get_noun_phrases_and_named_entities(file_name, start_index, end_index):

    sentences = conll2000.sents(file_name)
    noun_phrase_sentences = conll2000.chunked_sents(file_name, chunk_types=['NP'])
    pos_tagged_sentences = conll2000.tagged_sents(file_name)

    sentences = sentences[start_index:end_index]
    pos_tagged_sentences = pos_tagged_sentences[start_index:end_index]
    noun_phrase_sentences = noun_phrase_sentences[start_index:end_index]

    # Extracting mentions.
    words = []
    cnt = 0
    for sent in sentences:
        cnt += 1
        for word in sent:
            words.append((word, cnt))

    noun_phrases = []
    for sent in noun_phrase_sentences:
        noun_phrases += nltk.chunk.tree2conlltags(sent)

    named_entities = []
    for tagged_sent in pos_tagged_sentences:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    return (words, noun_phrases, named_entities)
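
# A minimal usage sketch (an assumed call, not part of the original snippet):
# extract mentions, NP chunks and named entities from the first 50 sentences
# of the CoNLL-2000 training file.
words, noun_phrases, named_entities = get_noun_phrases_and_named_entities(
    'train.txt', 0, 50)
print(len(words), len(noun_phrases), len(named_entities))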
from sklearn.feature_extraction.text import TfidfVectorizer


def cosine_sim(text1, text2):
    # Only the final return line appeared in the original snippet; the
    # TfidfVectorizer-based body below is an assumed reconstruction of the
    # usual tf-idf cosine-similarity pattern.
    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0, 1]


main = open("tfidf-1.txt")
main_file = main.read()

import glob
file_list = glob.glob("/Users/Gourhari/Desktop/TF_IDF/*")

for filename in file_list:
    print("Similarity between " + str(main.name) + " and " + str(filename) +
          " is : " + str(cosine_sim(main_file,
                                    open(filename).read())))

from nltk.corpus import conll2000, conll2002
print(conll2000.sents())
for tree in conll2000.chunked_sents()[:5]:
    print(tree)

# Install: pip install spacy && python -m spacy download en_core_web_sm
import spacy

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load('en_core_web_sm')

# Process whole documents
text = open('tfidf-1.txt').read()
doc = nlp(text)

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)


# books_raw is assumed to hold the raw text of the book corpus read earlier
# in the original script (the reading step is not shown in this snippet).
print("Corpus is now {0} characters long".format(len(books_raw)))
print()
books_raw = books_raw.lower()

import nltk

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(books_raw)
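
# sentence_to_wordlist is not defined in this snippet; the helper below is a
# minimal sketch of what such a function typically does (an assumption, not
# the original implementation): strip non-letter characters and tokenize.
import re

def sentence_to_wordlist(raw_sentence):
    # Keep letters only, then split on whitespace.
    clean = re.sub('[^a-zA-Z]', ' ', raw_sentence)
    return clean.lower().split()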

book_sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        book_sentences.append(sentence_to_wordlist(raw_sentence))

#print(raw_sentences[5])
#print(book_sentences[5])

from nltk.corpus import (conll2000, conll2002, conll2007, inaugural, abc,
                         genesis, state_union)
from nltk.corpus import framenet as fn

conll2000_corp_sents = conll2000.sents()
print("conll2000 to sents")
conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")

conll2007_corp_sents = conll2007.sents()
print("conll2007 to sents")
inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")
abc_corp_sents = abc.sents()
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import conll2000, conll2002
print(conll2000.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE


# SEMCOR
    
from nltk.corpus import semcor
print(semcor.words())
print(semcor.chunks())
print(semcor.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
list(map(str, semcor.tagged_chunks(tag='both')[:3]))
[[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]    
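
# The 'sem'-tagged chunks carry WordNet sense annotations; a minimal sketch
# (assumed usage, not part of the original snippet) that prints the annotated
# chunks among the first few entries:
from nltk.tree import Tree

for chunk in semcor.tagged_chunks(tag='sem')[:20]:
    if isinstance(chunk, Tree):
        # chunk.label() is usually a WordNet Lemma, e.g. Lemma('group.n.01.group')
        print(chunk.label(), chunk.leaves())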

# IEER

from nltk.corpus import ieer
ieer.fileids() # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
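
# A minimal relation-extraction sketch over the parsed IEER documents,
# adapted from the standard NLTK relextract example (the regex pattern and
# entity types are assumptions): find ORG-in-LOC pairs in APW_19980314.
import re
import nltk

IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in docs:
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        print(nltk.sem.rtuple(rel))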
Example #6
# 	res.write("\n")
	
# 	amount -= 1
# 	amount2 -= 1
# 	if amount2 == 0:
# 		break


#!/usr/bin/env python3
tagged_sentences = []

from nltk.corpus import conll2000 as corpus
tagged_sentences += corpus.tagged_sents(tagset='universal')

import nltk

untagged_sentences = []
untagged_sentences += corpus.sents()

data = open("testdata.txt", "wt")
res = open("answer.txt", "wt")
control = open("control.txt", "wt")

tru_amount = len(tagged_sentences)
amount = tru_amount
print("[Data] Extracted {} out of {} ({:.2f}%)".format(amount, tru_amount, amount/tru_amount*100))


for sentences, untagged in zip(tagged_sentences, untagged_sentences):
	# if "" in [a[0] for a in sentences]:
	# 	continue
	tagged = nltk.pos_tag(untagged, tagset='universal')
	for (word, tag), (word2, tag2) in zip(sentences, tagged):

from nltk.corpus import brown, nps_chat, conll2000, treebank

# Tagged word lists for each corpus (assumption: the opening lines of this
# list were cut from the snippet; reconstructed to parallel the lists below).
corp_words_tagged = [
    brown.tagged_words(tagset=CONST_tagset),
    nps_chat.tagged_words(tagset=CONST_tagset),
    conll2000.tagged_words(tagset=CONST_tagset),
    treebank.tagged_words(tagset=CONST_tagset)
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
corp_sents_untagged = [
    brown.sents(),
    nps_chat.posts(),
    conll2000.sents(),
    treebank.sents()
]

# language tool spell checker
import language_check

lt_check = language_check.LanguageTool('en-US')

# pyenchant spell checker
# pe_check = enchant.Dict('en_US')
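
# A minimal usage sketch of the language_check API (assumed usage, not part
# of the original snippet): check a sentence and report how many issues the
# tool flags.
sample_text = "This are a example sentence."
matches = lt_check.check(sample_text)
print(len(matches), "possible issues found")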

universal_tagset = [
    '.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT',
    'VERB', 'X'
]
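
# A minimal sketch (assumption: CONST_tagset == 'universal', matching the
# universal_tagset list above) that tallies how often each universal tag
# occurs across the tagged corpora collected in corp_sents_tagged:
import nltk

tag_freq = nltk.FreqDist(
    tag
    for corpus_sents in corp_sents_tagged
    for sent in corpus_sents
    for (_, tag) in sent
)
for tag in universal_tagset:
    print(tag, tag_freq[tag])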