from nltk.corpus import brown, conll2000


def read_corpus(self):
    """Assign training and test corpora."""
    length = len(brown.tagged_sents(tagset='universal'))
    # Use the first 90% of the Brown corpus for training, the rest for testing.
    self.train_sents = brown.tagged_sents(tagset='universal')[0:int(length * 0.9)]
    self.test_sents = brown.tagged_sents(tagset='universal')[int(length * 0.9):]
    # Read data from the CoNLL-2000 files (only when building the semantic space).
    self.conll_train = conll2000.sents('train.txt')
    self.conll_test = conll2000.sents('test.txt')
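# A minimal usage sketch (assumed, not from the original source): the 90/10
# split above is the kind of data an NLTK tagger consumes. Here a plain
# UnigramTagger stands in, just to show how train/test splits could be used.
import nltk
from nltk.corpus import brown

length = len(brown.tagged_sents(tagset='universal'))
train_sents = brown.tagged_sents(tagset='universal')[:int(length * 0.9)]
test_sents = brown.tagged_sents(tagset='universal')[int(length * 0.9):]

tagger = nltk.UnigramTagger(train_sents)
print(tagger.evaluate(test_sents))  # accuracy on the held-out 10%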
import nltk
from nltk.corpus import conll2000


def get_noun_phrases_and_named_entities(file_name, start_index, end_index):
    sentences = conll2000.sents(file_name)
    noun_phrase_sentences = conll2000.chunked_sents(file_name, chunk_types=['NP'])
    pos_tagged_sentences = conll2000.tagged_sents(file_name)

    sentences = sentences[start_index:end_index]
    pos_tagged_sentences = pos_tagged_sentences[start_index:end_index]
    noun_phrase_sentences = noun_phrase_sentences[start_index:end_index]

    # Extracting mentions.
    words = []
    cnt = 0
    for sent in sentences:
        cnt += 1
        for word in sent:
            words.append((word, cnt))

    # NP chunks as (word, POS, IOB) triples.
    noun_phrases = []
    for sent in noun_phrase_sentences:
        noun_phrases += nltk.chunk.tree2conlltags(sent)

    # Named entities from NLTK's ne_chunk, also as IOB triples.
    named_entities = []
    for tagged_sent in pos_tagged_sentences:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    return (words, noun_phrases, named_entities)
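# Hypothetical call (not in the original source): run the helper above on the
# first 50 sentences of the CoNLL-2000 training file and peek at the output.
words, noun_phrases, named_entities = get_noun_phrases_and_named_entities('train.txt', 0, 50)
print(noun_phrases[:3])    # (word, POS, IOB) triples such as ('Confidence', 'NN', 'B-NP')
print(named_entities[:3])  # IOB triples produced by nltk.chunk.ne_chunk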
# (The line below is the tail of a cosine_sim helper; its definition is not
# shown in this excerpt.)
    return ((tfidf * tfidf.T).A)[0, 1]


main = open("tfidf-1.txt")
main_file = main.read()

import glob
file_list = glob.glob("/Users/Gourhari/Desktop/TF_IDF/*")
for filename in file_list:
    print("Similarity between " + str(main.name) + " and " + str(filename)
          + " is : " + str(cosine_sim(main_file, open(filename).read())))

from nltk.corpus import conll2000, conll2002
print(conll2000.sents())
for tree in conll2000.chunked_sents()[:5]:
    print(tree)

# Install: pip install spacy && python -m spacy download en
import spacy

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load('en')

# Process whole documents
text = open('tfidf-1.txt').read()
doc = nlp(text)

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)
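# A hedged reconstruction (not from the original source) of the cosine_sim
# helper whose final line opens the snippet above. It assumes scikit-learn's
# TfidfVectorizer; with the default L2 normalisation, the dot product of the
# two TF-IDF rows is their cosine similarity.
from sklearn.feature_extraction.text import TfidfVectorizer


def cosine_sim(text1, text2):
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([text1, text2])
    # (tfidf * tfidf.T) holds all pairwise similarities; [0, 1] is the pair we want.
    return ((tfidf * tfidf.T).A)[0, 1]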
print("Corpus is now {0} characters long".format(len(books_raw))) print() books_raw = books_raw.lower() tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') raw_sentences = tokenizer.tokenize(books_raw) book_sentences = [] for raw_sentence in raw_sentences: if len(raw_sentence) > 0: book_sentences.append(sentence_to_wordlist(raw_sentence)) #print(raw_sentences[5]) #print(book_sentences[5]) conll2000_corp_sents = conll2000.sents() print("condll2000 to sents") conll2002_corp_sents = conll2002.sents() print("conll2002 to sents") conll2007_corp_sents = conll2007.sents() print("condll2007 to sents") inaugural_corp_sents = inaugural.sents() print("inaugural to sents") abc_corp_sents = abc.sents() print("ABC to sentences") genesis_corp_sents = genesis.sents() print("Genesis to sents") frame_net_corp_sents = fn.sents() print("Frame_net to sents") state_union_corp_sents = state_union.sents()
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import conll2000, conll2002

print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# SEMCOR
from nltk.corpus import semcor
print(semcor.words())
print(semcor.chunks())
print(semcor.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
list(map(str, semcor.tagged_chunks(tag='both')[:3]))
[[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]

# IEER
from nltk.corpus import ieer
ieer.fileids()  # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
#!/usr/bin/env python3
import nltk
from nltk.corpus import conll2000 as corpus

tagged_sentences = []
tagged_sentences += corpus.tagged_sents(tagset='universal')
untagged_sentences = []
untagged_sentences += corpus.sents()

data = open("testdata.txt", "wt")
res = open("answer.txt", "wt")
control = open("control.txt", "wt")

tru_amount = len(tagged_sentences)
amount = tru_amount
print("[Data] Extracted {} out of {} ({:.2f}%)".format(
    amount, tru_amount, amount / tru_amount * 100))

for sentences, untagged in zip(tagged_sentences, untagged_sentences):
    # if "" in [a[0] for a in sentences]:
    #     continue
    tagged = nltk.pos_tag(untagged, tagset='universal')
    for (word, tag), (word2, tag2) in zip(sentences, tagged):
        # Assumed continuation (the original snippet is truncated here):
        # compare the gold tag with nltk.pos_tag's prediction.
        if tag != tag2:
            print(word, tag, tag2)
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
corp_sents_untagged = [
    brown.sents(),
    nps_chat.posts(),
    conll2000.sents(),
    treebank.sents()
]

# language tool spell checker
lt_check = language_check.LanguageTool('en-US')
# pyenchant spell checker
# pe_check = enchant.Dict('en_US')

universal_tagset = [
    '.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
    'PRON', 'PRT', 'VERB', 'X'
]
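# A minimal, self-contained sketch (assumed usage, not from the original
# source): tally how often each tag from the universal tagset above occurs,
# shown here for two of the tagged corpora listed in the snippet.
from collections import Counter
from nltk.corpus import brown, conll2000

universal_tagset = ['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
                    'PRON', 'PRT', 'VERB', 'X']
for name, tagged_sents in [('brown', brown.tagged_sents(tagset='universal')),
                           ('conll2000', conll2000.tagged_sents(tagset='universal'))]:
    tag_counts = Counter(tag for sent in tagged_sents for (_word, tag) in sent)
    print(name, [(tag, tag_counts[tag]) for tag in universal_tagset])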