def create_spacy_corpus(text_corpus: PlaintextCorpusReader, lang: Language) -> Corpus:
    # Pair each file's raw text with its file id as metadata, then build the corpus.
    data = ((text_corpus.raw(fid), {'fileid': fid}) for fid in text_corpus.fileids())
    corpus = Corpus(lang, data)
    return corpus
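A usage sketch follows; it assumes the `Corpus` and `Language` types come from textacy and spaCy (the snippet itself does not say which libraries they are), and the corpus root and model name are placeholders.

```python
# Illustrative only: paths, model name, and the textacy/spaCy types are assumptions.
import spacy
from textacy import Corpus
from nltk.corpus.reader import PlaintextCorpusReader

reader = PlaintextCorpusReader('corpus_root', r'.*\.txt')  # hypothetical corpus root
nlp = spacy.load('en_core_web_sm')                         # assumes this model is installed
corpus = create_spacy_corpus(reader, nlp)
print(len(corpus))                                         # number of documents loaded
```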
corpus_xml = CHILDESCorpusReader(corpus_root_xml, '.*.xml')
corpus_plain = PlaintextCorpusReader(corpus_root_plain, '.*.cha')

# get all the words spoken by a child
all_words = [w.lower() for w in corpus_xml.words(speaker=['CHI'])]

# init wordnet and language model
corpus_ic = wn.ic(corpus_xml, True, 1.0)
lm = LanguageModel(all_words)

# collect all the features for each corpus
for j in range(len(corpus_xml.fileids())):
    current_features = []  # init empty list to store features

    # Text initialization
    text_xml = corpus_xml.fileids()[j]
    text_plain = corpus_plain.fileids()[j]

    # list of words spoken by the child in lowercase
    child_words_xml = [
        w.lower() for w in corpus_xml.words(text_xml, speaker=['CHI'])
    ]

    # list of words spoken by the child in lowercase with replaced words
    child_words_replaced_xml = [
        w.lower() for w in corpus_xml.words(text_xml, speaker=['CHI'], replace=True)
    ]

    # list of words spoken by the child in lowercase with the stemmed words
    child_words_stemmed_xml = [
        w.lower()
## Word Segmentation

- Try two methods: `ckiptagger` vs. `jieba` (a `jieba` sketch follows the `ckiptagger` code below)

from ckiptagger import WS

````{margin}
```{note}
Please remember to download the CKIP model files and change the path accordingly.
```
````

ws = WS("/Users/Alvin/Dropbox/Corpus/CKIP_WordSeg/data")

## Print first 200 chars of file 13
print(twp.raw(fileids=twp.fileids()[13])[:200])

# word-seg the raw text and return a long string
def tokenize_raw1(raw):
    word_tok = [' '.join(para) for para in ws(nltk.regexp_tokenize(raw, r'[^\s]+'))]  # para-like units
    raw_tok = ' '.join(word_tok)
    return raw_tok

# word-seg the raw text and return a list of words
def tokenize_raw2(raw):
    para_list = nltk.regexp_tokenize(raw, r'[^\s]+')  # para-like units
    word_list = sum(ws(para_list), [])
    return word_list

def tokenize_raw3(raw):
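For comparison, here is a minimal sketch of the second method with `jieba`. The exact call pattern used in the original comparison is not shown in the excerpt; `jieba.lcut` is one standard way to segment a string, and the function name below is made up for illustration.

```python
# Sketch only: jieba needs no model path; lcut() returns a list of tokens.
import jieba
import nltk

def tokenize_raw_jieba(raw):
    para_list = nltk.regexp_tokenize(raw, r'[^\s]+')  # para-like units, as above
    word_list = [w for para in para_list for w in jieba.lcut(para)]
    return word_list
```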
def training_data(paths=None, file_count=0):
    """
    Use the general pattern of a tag <ENAMEX\sTYPE=".*?">.*?</ENAMEX> to extract the bits of
    text containing the relevant information and group them into a list, then reduce each
    element to a tuple of the entity type and the entity name.
    :param paths: the paths towards the files containing the training data
    :param file_count: the number of files to read
    :return: a dictionary of known entities grouped by type and first character, plus
             organization, name, and location lookup dictionaries keyed by first character
    """
    # extract training data from WSJ
    # pattern : the general pattern of a tag
    # snd_pattern : the approximate pattern of the desired information from the tag
    pattern = re.compile(r'<.*?TYPE=".*?">.*?</.*?>', re.ASCII)
    snd_pattern = re.compile(r'[>"].*?[<"]', re.ASCII)

    # the strings representing the tags extracted from the files
    text = PlaintextCorpusReader(paths[0], r'.*\.txt')
    data = []
    for fid in text.fileids():
        data = data + pattern.findall(text.raw(fileids=fid))

    # from every tag in the list, take the two sub-strings that match snd_pattern
    # (the entity type and the entity name); use a set to eliminate redundancy
    raw_entities = set()
    for tag in data:
        entity_type, entity_name = [m[1:-1] for m in snd_pattern.findall(tag)][:2]
        raw_entities.add((entity_type, entity_name.lower()))
    raw_entities = list(raw_entities)

    # extract data from the names folders
    del data
    data = PlaintextCorpusReader(paths[1], '.*')
    name_data = data.words('names.male') + data.words('names.female') + data.words('names.family')

    # extract the most common 350 organization tokens
    organization_words = [word_tokenize(name) for etype, name in raw_entities if etype == 'ORGANIZATION']
    organization_specific_tokens = []
    for wl in organization_words:
        organization_specific_tokens += wl
    organization_specific_tokens = [f[0] for f in FreqDist(organization_specific_tokens).most_common(350)]

    # extract the most common 350 location tokens
    location_words = [word_tokenize(name) for etype, name in raw_entities if etype == 'LOCATION']
    location_specific_tokens = []
    for wl in location_words:
        location_specific_tokens += wl
    location_specific_tokens = [f[0] for f in FreqDist(location_specific_tokens).most_common(350)]

    # put the names in a dictionary for quicker access
    name_dict = {}
    for n in list(set(name_data + names.words())):
        if n.lower()[0] in name_dict:
            name_dict[n.lower()[0]] += [n.lower()]
        else:
            name_dict[n.lower()[0]] = [n.lower()]

    # put the location data in a dictionary for quicker access
    loc_dict = {}
    for l in location_specific_tokens[1:]:
        if l[0] in loc_dict:
            loc_dict[l[0]] += [l]
        else:
            loc_dict[l[0]] = [l]

    # put the organization data in a dictionary for quicker access
    org_dict = {}
    for o in organization_specific_tokens:
        if o[0] in org_dict:
            org_dict[o[0]] += [o]
        else:
            org_dict[o[0]] = [o]

    # group the raw entities by type, then index each group by first character
    entity_dict1 = {
        'PERSON': [name for etype, name in raw_entities if etype == 'PERSON'],
        'LOCATION': [name for etype, name in raw_entities if etype == 'LOCATION'],
        'ORGANIZATION': [name for etype, name in raw_entities if etype == 'ORGANIZATION'],
    }
    entity_dict2 = {}
    for label in ['PERSON', 'ORGANIZATION', 'LOCATION']:
        entity_dict2[label] = {}
        for e in entity_dict1[label]:
            if e[0] in entity_dict2[label]:
                entity_dict2[label][e[0]] += [e]
            else:
                entity_dict2[label][e[0]] = [e]

    return entity_dict2, org_dict, name_dict, loc_dict
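A brief usage sketch; the two directory paths are placeholders, not the original locations of the WSJ and names data.

```python
# Hypothetical paths: paths[0] holds the ENAMEX-tagged WSJ files, paths[1] the names lists.
entity_dict, org_dict, name_dict, loc_dict = training_data(
    paths=['data/wsj_tagged', 'data/names'])
print(len(entity_dict['PERSON']))  # number of first-character buckets for PERSON entities
```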
file.write(status.full_text)
file.close()

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus",
    r'tweets_.*\.txt',
    cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stopword_reader = PlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/twitterstopwords/",
    r'.*\.txt',
    encoding='latin-1')

stop_words = set(['“', '”', '’', ",", "#", "—", "__", "_", "___"])
for file in stopword_reader.fileids():
    stops = stopword_reader.raw(file).replace("\n", ",").split(",")
    for word in stops:
        stop_words.add(word)

# text wrangling functions:
def remove_emoji(string):  # github https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
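Separately from the truncated emoji pattern above, a minimal sketch of applying the assembled `stop_words` set to a tokenized tweet; the example text and the choice of `word_tokenize` are assumptions, not part of the original.

```python
# Illustrative only: filter a tokenized tweet against the stop_words built above.
from nltk.tokenize import word_tokenize

tweet = "Just landed in Oslo, what a view! #travel"  # made-up example text
tokens = [t.lower() for t in word_tokenize(tweet)]
filtered = [t for t in tokens if t not in stop_words]
print(filtered)
```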
import nltk
from nltk.tokenize import SpaceTokenizer, sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import gutenberg
from nltk.corpus.reader import PlaintextCorpusReader, TaggedCorpusReader

## Corpus example ############################
sample = gutenberg.raw("bible-kjv.txt")
sent = sent_tokenize(sample)
for x in range(5):
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from text files ############
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg", r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', word_tokenizer=SpaceTokenizer())
print(reader.words())
print(reader.sents())
class NLTKReader(object):

  ERROR = 0
  WARN = 1
  INFO = 2
  DEBUG = 3

  def __init__(self, input=None, cache_dir='/tmp/nupic_nlp', verbosity=0):
    # Create the cache directory if necessary.
    if not os.path.exists(cache_dir):
      os.mkdir(cache_dir)
    self.cache_dir = cache_dir
    self._verbosity = verbosity
    if input is not None:
      self.input_reader = PlaintextCorpusReader(input, r'.*\.txt')
    else:
      self.input_reader = None

  def _log(self, lvl, msg):
    if lvl <= self._verbosity:
      print(msg)

  def _is_noun(self, word):
    synonyms = len(wn.synsets(word, NOUN))
    self._log(self.DEBUG, 'found %i noun synonyms for %s' % (synonyms, word))
    return synonyms > 0

  def _get_cache_file(self, cache_name):
    return os.path.join(self.cache_dir, cache_name)

  def _write_cache(self, cache_name, data):
    cache_file = self._get_cache_file(cache_name)
    self._log(self.INFO, 'writing cache to %s' % cache_file)
    with open(cache_file, 'w') as f:
      f.write(data)

  def _cache_exists(self, cache_name):
    cache_file = self._get_cache_file(cache_name)
    return os.path.exists(cache_file)

  def _read_cache(self, cache_name):
    cache_file = self._get_cache_file(cache_name)
    self._log(self.INFO, 'reading cache from %s' % cache_file)
    with open(cache_file, 'r') as f:
      return f.read()

  def _check_text_availability(self, text_name):
    if text_name not in self.available_texts():
      raise Exception('No corpus available named "%s".' % text_name)

  def _get_reader_for(self, text_name):
    if text_name in gutenberg.fileids():
      return gutenberg
    else:
      return self.input_reader

  def available_texts(self):
    available = gutenberg.fileids()
    if self.input_reader is not None:
      available = available + self.input_reader.fileids()
    return available

  def text_report(self):
    print('%40s %10s %10s' % ('text', 'words', 'sentences'))
    for txt in self.available_texts():
      word_count = len(self.get_words(txt))
      sent_count = len(self.get_sentences(txt))
      print('%40s %10i %10i' % (txt, word_count, sent_count))

  def get_words_from_text(self, text_name):
    self._check_text_availability(text_name)
    words_with_punctuation = self.get_words(text_name)
    # Strip punctuation and make lower case.
    words = [w.lower() for w in words_with_punctuation
             if w not in string.punctuation and len(w) > 3]
    # Remove duplicates.
    words = list(set(words))
    self._log(self.INFO, 'Found %i unique words from %s' % (len(words), text_name))
    return words

  def get_nouns_from_text(self, text_name):
    self._log(self.INFO, '\nGetting nouns from %s' % text_name)
    cache_name = 'nouns_' + text_name
    if self._cache_exists(cache_name):
      nouns = self._read_cache(cache_name).split(',')
    else:
      words = self.get_words_from_text(text_name)
      self._log(self.WARN, 'Noun identification beginning. This might take awhile...')
      self._log(self.INFO, 'Tagging part of speech for %i words...' % len(words))
      tagged_words = pos_tag(words)
      self._log(self.INFO, 'Extracting all non-nouns based on POS tag...')
      nouns = [word for word, pos in tagged_words if len(word) > 2 and pos == 'NN']
      self._log(self.INFO, '\t%i left' % len(nouns))
      self._log(self.INFO, 'Extracting further non-nouns based on Wordnet synonyms...')
      nouns = [noun for noun in nouns if self._is_noun(noun)]
      self._log(self.INFO, '\t%i left' % len(nouns))
      self._write_cache(cache_name, ','.join(nouns))
    self._log(self.INFO, 'Found %i total nouns from %s' % (len(nouns), text_name))
    return nouns

  def get_noun_pairs_from_all_texts(self):
    """Retrieves all nouns from the NLTK corpus of texts."""
    singulars = []
    for text in self.available_texts():
      singulars += self.get_nouns_from_text(text)
    singulars = list(set(singulars))
    return [(singular, plural(singular)) for singular in singulars]

  def get_words(self, text_name):
    self._check_text_availability(text_name)
    return self._get_reader_for(text_name).words(text_name)

  def get_sentences(self, text_name):
    self._check_text_availability(text_name)
    return self._get_reader_for(text_name).sents(text_name)

  def get_tagged_sentences(self, text_name, exclude_punctuation=False):
    for sent in self.get_sentences(text_name):
      if exclude_punctuation:
        sent = [word for word in sent if not is_punctuation(word)]
      yield pos_tag(sent)

  def get_parts_of_speech(self, text_name, exclude_punctuation=False):
    self._log(self.INFO, 'Parts of speech extraction beginning. This might take awhile...')
    pos = set()
    for sent in self.get_tagged_sentences(text_name, exclude_punctuation=exclude_punctuation):
      words, parts = zip(*sent)
      pos.update(parts)
    # Strip blanks (not sure why there are blanks, but there are sometimes).
    return sorted([p for p in pos if p != ''])

  def get_tag_descriptions(self):
    return tag_descriptions

  def describe_tag(self, tag):
    if tag not in tag_descriptions.keys():
      # Return original tag if we don't know it
      return (tag, tag)
    return tag_descriptions[tag]
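A brief usage sketch of the class; the corpus directory is a placeholder, and `plural`, `is_punctuation`, and `tag_descriptions` are helpers assumed to be defined elsewhere in the project.

```python
# Illustrative only: point the reader at a directory of .txt files and pull nouns.
reader = NLTKReader(input='/path/to/texts', verbosity=NLTKReader.INFO)
reader.text_report()                                   # word/sentence counts per text
nouns = reader.get_nouns_from_text('austen-emma.txt')  # cached after the first run
print(nouns[:10])
```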
import numpy as np
import nltk
import pandas as pd
from nltk.corpus.reader import PlaintextCorpusReader
from sklearn.feature_extraction.text import CountVectorizer

mycorpus = PlaintextCorpusReader(r"CSI58100TextFiles", r".*\.txt")

vec = CountVectorizer()
indx = 0
lst = []
for i in mycorpus.fileids():
    nlst = mycorpus.raw(i)
    indx = indx + 1
    lst.append(nlst)
corpus = np.array(lst)

# -----------Stop Words---------
vec = CountVectorizer(stop_words="english")
vec.fit(corpus)

# Sparse matrix
X = vec.transform(corpus)
bM = pd.DataFrame(X.toarray(),
                  columns=vec.get_feature_names(),
                  index=mycorpus.fileids()).T

print(type(corpus))
print(corpus)
print(bM)
# bM.to_csv('booleanMatrix.csv')

# # Jaccards similarity
# from sklearn.metrics import jaccard_score
# similarity = []
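The commented-out lines point at a Jaccard-similarity step; below is a minimal sketch of one way to finish it. Binarizing the count matrix and comparing document pairs is an assumption about the intent, not the original code.

```python
# Sketch only: binarize term counts, then compare each pair of documents.
from sklearn.metrics import jaccard_score

binary = (X.toarray() > 0).astype(int)  # term presence/absence per document
fileids = mycorpus.fileids()
similarity = []
for a in range(len(fileids)):
    for b in range(a + 1, len(fileids)):
        score = jaccard_score(binary[a], binary[b])
        similarity.append((fileids[a], fileids[b], score))
print(similarity[:5])
```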
import os

directory = "D:/Eigene Dateien_rklein/z_Forschung/_Konferenzen/_79_ICFCA - Dresden - Concept Analysis/Data/"
input_directory = directory + "Input/_Product_Management/"
output_directory = directory + "1_POS/"

if not os.path.exists(output_directory):
    os.mkdir(output_directory)

# reading stuff
file_list = os.listdir(input_directory)
print(file_list)

# just for testing, create a corpus reader
from nltk.corpus.reader import PlaintextCorpusReader
reader = PlaintextCorpusReader(input_directory, r'.*\.txt')
reader.fileids()
reader.raw()
reader.sents()
reader.words()

## default POS tagger from NLTK ##
import nltk
# import pprint
# sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

pos = "nltk"
path = output_directory + pos
if not os.path.exists(path):
    os.mkdir(path)

for i in range(len(file_list)):
    # posting = []
    output = path + "/" + str(file_list[i])
    jfile = open(output, "w")
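The excerpt stops just after the output file is opened. As a separate, hedged illustration of the announced default NLTK tagging step (not the original continuation), tagging each file with `nltk.pos_tag` and writing word/tag pairs could look like this:

```python
# Sketch only: tag each input file and write "word/TAG" lines to the output folder.
for fid in reader.fileids():
    tagged = [nltk.pos_tag(sent) for sent in reader.sents(fid)]
    with open(os.path.join(path, fid), "w") as out:
        for sent in tagged:
            out.write(" ".join("%s/%s" % (w, t) for w, t in sent) + "\n")
```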
            x[i, t] = word2idx(w)
        y[i] = word2idx(next_word_list[index % len(next_word_list)])
        index = index + 1
    yield x, y

if __name__ == "__main__":
    directory = 'F:/Minhaz/GitHubRepo/News_Gen/Minhaz_Shahadat/Code/Bengali_Word2Vec_LSTM/'
    corpus_dir = directory + 'corpus/'
    examples = directory + 'examples.txt'
    vocabulary = directory + 'vocab.txt'

    w_t = RegexpTokenizer("[\u0980-\u09FF']+")
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt', word_tokenizer=w_t)

    text_in_words = []
    files = corpus.fileids()
    for f in files:
        words_in_doc = corpus.words(f)
        text_in_words.append(words_in_doc)

    # replace every digit sequence with a <number> token
    text_in_words = [[re.sub(r'\d+', '<number>', word) for word in document]
                     for document in text_in_words]

    words = []
    for doc in text_in_words:
        for word in doc:
            words.append(word)
    words = sorted(set(words))
    print_vocabulary(vocabulary, words)

    if not os.path.isdir(directory + 'checkpoints/'):
        os.makedirs(directory + 'checkpoints/')