# Build a categorized NLTK corpus reader over the rt-polarity movie-review
# data and train a Naive Bayes sentiment classifier on it.
#
# NOTE: `fileids_` below is a regular expression matched against file names
# relative to the corpus root; it is not a glob on the absolute path
# (i.e. not corpus_dir + '/rt-polarity*').
corpus_dir = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/data/rt-polaritydata'

# Maps every corpus file to the list of category labels it belongs to.
cat_map_ = {'rt-polarity.pos': ['pos'], 'rt-polarity.neg': ['neg']}

# `corpus_treatment` is defined elsewhere.
# NOTE(review): presumably it writes re-encoded copies of the raw files into
# <corpus_dir>/encoded_data, which is the directory read below -- confirm.
corpus_treatment(corpus_dir)
encoded_corpus_dir = os.path.join(corpus_dir, 'encoded_data')
fileids_ = '^rt-polarity.*'

categorized_plaintext_corpusreader = CategorizedPlaintextCorpusReader(
    root=encoded_corpus_dir,
    cat_map=cat_map_,  # fixed: was the undefined name `t_map_`
    fileids=fileids_,
)

pos_words = categorized_plaintext_corpusreader.words(categories=['pos'])
pos_sents = categorized_plaintext_corpusreader.sents(categories=['pos'])
pos_paras = categorized_plaintext_corpusreader.paras(categories=['pos'])

# fixed: the negative word view was being read from the 'pos' category.
neg_words = categorized_plaintext_corpusreader.words(categories=['neg'])
neg_sents = categorized_plaintext_corpusreader.sents(categories=['neg'])
neg_paras = categorized_plaintext_corpusreader.paras(categories=['neg'])
# NOTE: para views are not working to be looked into later


# classification
def _bag_of_words(tokens):
    """Return a presence-only featureset ``{token: True}`` for *tokens*."""
    return {token: True for token in tokens}


# NaiveBayesClassifier.train expects an iterable of (featureset, label)
# pairs, not a flat word list, and it needs examples from every class.
# Each sentence is treated as one training document.
train = [(_bag_of_words(sent), 'pos') for sent in pos_sents]
train += [(_bag_of_words(sent), 'neg') for sent in neg_sents]
classifier = NaiveBayesClassifier.train(train)
import spacy

# `path`, `DOC_PATTERN` and `CAT_PATTERN` are defined outside this section.
corpus = CategorizedPlaintextCorpusReader(path, DOC_PATTERN, cat_pattern=CAT_PATTERN)


def tag_corpus(corpus):
    """Return the part-of-speech tagging of every sentence in *corpus*.

    The result is a list with one entry per sentence; each entry is whatever
    ``nltk.pos_tag`` returns for that tokenized sentence.
    """
    return [nltk.pos_tag(sent) for sent in corpus.sents()]


tagged_corpus = tag_corpus(corpus)

# NOTE(review): the 'en' shortcut name was removed in spaCy 3; on newer
# installs this needs the full package name (e.g. 'en_core_web_sm').
nlp = spacy.load('en')


def _spacy_entities(tokenized_sent):
    """Return every named entity in *tokenized_sent* as ``(text, label)``.

    The tokens are re-joined with single spaces because the spaCy pipeline
    is called on a raw string here, not on a token list.
    """
    doc = nlp(' '.join(tokenized_sent))
    return [(ent.text, ent.label_) for ent in doc.ents]


def spacy_ner(tokenized_sent):
    """Return the first named entity of *tokenized_sent*, or ``None``.

    The entity is a ``(text, label)`` tuple. Only the first entity is
    reported, which is the historical behaviour of this function; use
    ``_spacy_entities`` to get all of them.
    """
    entities = _spacy_entities(tokenized_sent)
    return entities[0] if entities else None


# Collect *all* entities of every sentence (the earlier version kept only the
# first entity per sentence), de-duplicate them through a set, and sort so the
# printed order is reproducible between runs.
spacy_named_entities = sorted(
    {
        entity
        for sent in corpus.sents()
        for entity in _spacy_entities(sent)
    }
)
print(spacy_named_entities)