def create_spacy_corpus(text_corpus: PlaintextCorpusReader, lang: Language) -> Corpus:
    # pair each file's raw text with a metadata dict recording its file id
    data = ((text_corpus.raw(fid), {'fileid': fid}) for fid in text_corpus.fileids())
    corpus = Corpus(lang, data)
    return corpus
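# --- Usage sketch (assumption): the snippet above omits its imports; it appears to
# target textacy's Corpus and spaCy's Language. The directory "corpus_txt" and the
# model name "en_core_web_sm" below are placeholders, not from the original.
import spacy
from spacy.language import Language
from textacy import Corpus
from nltk.corpus.reader import PlaintextCorpusReader

nlp = spacy.load("en_core_web_sm")
reader = PlaintextCorpusReader("corpus_txt", r".*\.txt")
corpus = create_spacy_corpus(reader, nlp)
print(corpus.n_docs, corpus.n_tokens)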
DEMO_DATA_ROOT = "../../../RepositoryData/data"

## Loading Corpus Raw Texts

import nltk
from nltk.corpus.reader import PlaintextCorpusReader
import numpy as np
import jieba, re

jieba.set_dictionary(DEMO_DATA_ROOT + "/jiaba/dict.txt.big.txt")

corpus_dir = DEMO_DATA_ROOT + "/TaiwanPresidentialInaugarationSpeech_en"
twp = PlaintextCorpusReader(corpus_dir, r".*\.txt")
len(twp.raw())

## Word Segmentation - Try two methods: `ckiptagger` vs. `jieba`

from ckiptagger import WS

```{margin}
```{note}
Please remember to download the CKIP model files and change the path accordingly.
```
```

ws = WS("/Users/Alvin/Dropbox/Corpus/CKIP_WordSeg/data")
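# --- Comparison sketch (assumption): a minimal illustration of the two segmenters
# named above. The sample sentence is made up, and `ws` requires the downloaded
# CKIP model data noted in the margin.
sample = "總統就職演說強調團結與民主"
ckip_tokens = ws([sample])[0]       # ckiptagger segments a list of sentences
jieba_tokens = jieba.lcut(sample)   # jieba segments a single string
print(ckip_tokens)
print(jieba_tokens)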
# imports this function relies on
import re
from nltk import FreqDist, word_tokenize
from nltk.corpus import names
from nltk.corpus.reader import PlaintextCorpusReader


def training_data(paths=None, file_count=0):
    """
    Use the general pattern of a tag, <ENAMEX TYPE="...">...</ENAMEX>, to extract
    the bits of text containing the relevant information and group them into a list,
    then chunk the elements of the list, leaving only a tuple of the entity type and
    its name.
    :param paths: the paths towards the files containing the training data
    :param file_count: the number of files to read
    :return: entity_dict2 (entities grouped by type and first letter), plus the
             organization, name, and location lookup dictionaries
    """
    # extract training data from WSJ
    # pattern     : the general pattern of a tag
    # snd_pattern : the approximate pattern of the desired information from the tag
    pattern = re.compile(r'<.*?TYPE=".*?">.*?</.*?>', re.ASCII)
    snd_pattern = re.compile(r'[>"].*?[<"]', re.ASCII)

    # the strings representing the tags extracted from the files
    text = PlaintextCorpusReader(paths[0], r'.*\.txt')
    data = []
    for fid in text.fileids():
        data = data + pattern.findall(text.raw(fileids=fid))

    # from every tag in the list find the two sub-strings that correspond to
    # snd_pattern; use sets to eliminate redundancy
    raw_entities = list(set(list(map(
        lambda e: (e[0], e[1].lower()),
        list(map(lambda x: (x[0], x[1]),
                 [list(map(lambda s: (s[:len(s) - 1])[1:], l))
                  for l in (re.findall(snd_pattern, tag) for tag in data)]))))))

    # extract data from the names folders
    del data
    data = PlaintextCorpusReader(paths[1], '.*')
    name_data = data.words('names.male') + data.words('names.female') + data.words('names.family')

    # extract the most common 350 organization tokens
    organization_words = list(map(lambda o: word_tokenize(o[1]),
                                  list(filter(lambda x: x[0] == 'ORGANIZATION', raw_entities))))
    organization_specific_tokens = []
    for wl in organization_words:
        organization_specific_tokens += wl
    organization_specific_tokens = list(map(lambda f: f[0],
                                            FreqDist(organization_specific_tokens).most_common(350)))

    # extract the most common 350 location tokens
    location_words = list(map(lambda o: word_tokenize(o[1]),
                              list(filter(lambda x: x[0] == 'LOCATION', raw_entities))))
    location_specific_tokens = []
    for wl in location_words:
        location_specific_tokens += wl
    location_specific_tokens = list(map(lambda f: f[0],
                                        FreqDist(location_specific_tokens).most_common(350)))

    # put the names in a dictionary for quicker access
    name_dict = {}
    for n in list(set(name_data + names.words())):
        if n.lower()[0] in name_dict:
            name_dict[n.lower()[0]] += [n.lower()]
        else:
            name_dict[n.lower()[0]] = [n.lower()]

    # put the location data in a dictionary for quicker access
    loc_dict = {}
    for l in location_specific_tokens[1:]:
        if l[0] in loc_dict:
            loc_dict[l[0]] += [l]
        else:
            loc_dict[l[0]] = [l]

    # put the organization data in a dictionary for quicker access
    org_dict = {}
    for o in organization_specific_tokens:
        if o[0] in org_dict:
            org_dict[o[0]] += [o]
        else:
            org_dict[o[0]] = [o]

    entity_dict1 = {
        'PERSON': list(map(lambda p: p[1], list(filter(lambda e: e[0] == 'PERSON', raw_entities)))),
        'LOCATION': list(map(lambda l: l[1], list(filter(lambda e: e[0] == 'LOCATION', raw_entities)))),
        'ORGANIZATION': list(map(lambda o: o[1], list(filter(lambda e: e[0] == 'ORGANIZATION', raw_entities))))
    }
    entity_dict2 = {}
    for l in ['PERSON', 'ORGANIZATION', 'LOCATION']:
        entity_dict2[l] = {}
        for e in entity_dict1[l]:
            if e[0] in entity_dict2[l]:
                entity_dict2[l][e[0]] += [e]
            else:
                entity_dict2[l][e[0]] = [e]
    return entity_dict2, org_dict, name_dict, loc_dict
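# --- Extraction sketch (assumption): a made-up ENAMEX-style snippet showing what
# `pattern` and `snd_pattern` above pull out of the WSJ-style markup.
import re

sample = ('Yesterday <ENAMEX TYPE="PERSON">John Smith</ENAMEX> joined '
          '<ENAMEX TYPE="ORGANIZATION">Acme Corp</ENAMEX>.')
pattern = re.compile(r'<.*?TYPE=".*?">.*?</.*?>', re.ASCII)
snd_pattern = re.compile(r'[>"].*?[<"]', re.ASCII)

for tag in pattern.findall(sample):
    # each tag yields two matches: the quoted type and the bracketed name
    etype, ename = [s[1:-1] for s in snd_pattern.findall(tag)]
    print(etype, '->', ename.lower())   # PERSON -> john smith, ORGANIZATION -> acme corp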
file.close()

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus",
    r'tweets_.*\.txt',
    cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stopword_reader = PlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/twitterstopwords/",
    r'.*\.txt',
    encoding='latin-1')

stop_words = set(['“', '”', '’', ",", "#", "—", "__", "_", "___"])
for file in stopword_reader.fileids():
    stops = stopword_reader.raw(file).replace("\n", ",").split(",")
    for word in stops:
        stop_words.add(word)

# text wrangling functions:
def remove_emoji(string):
    # github https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
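# --- Usage sketch (assumption): illustrates how the categorized reader defined
# above can be queried; the category name "positive" is hypothetical and depends
# on the actual tweets_*.txt file names in the corpus directory.
print(reader.categories())                       # categories inferred from cat_pattern
print(reader.fileids(categories='positive'))     # files belonging to one category
print(reader.words(categories='positive')[:20])  # first tokens of that category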
import numpy as np
import nltk
import pandas as pd
from nltk.corpus.reader import PlaintextCorpusReader
from sklearn.feature_extraction.text import CountVectorizer

mycorpus = PlaintextCorpusReader(r"CSI58100TextFiles", r".*\.txt")
vec = CountVectorizer()

indx = 0
lst = []
for i in mycorpus.fileids():
    nlst = mycorpus.raw(i)
    indx = indx + 1
    lst.append(nlst)
corpus = np.array(lst)

# -----------Stop Words---------
vec = CountVectorizer(stop_words="english")
vec.fit(corpus)

# Sparse matrix
X = vec.transform(corpus)

# note: on scikit-learn >= 1.2 use vec.get_feature_names_out() instead
bM = pd.DataFrame(X.toarray(),
                  columns=vec.get_feature_names(),
                  index=mycorpus.fileids()).T

print(type(corpus))
print(corpus)
print(bM)
# bM.to_csv('booleanMatrix.csv')

# # Jaccard similarity
# from sklearn.metrics import jaccard_score
# similarity = []
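# --- Sketch (assumption): one way to finish the commented-out Jaccard idea above,
# binarising the count matrix and comparing document pairs with jaccard_score.
from sklearn.metrics import jaccard_score

boolean = (X.toarray() > 0).astype(int)   # term presence/absence per document
fileids = mycorpus.fileids()
similarity = []
for a in range(len(fileids)):
    for b in range(a + 1, len(fileids)):
        score = jaccard_score(boolean[a], boolean[b])
        similarity.append((fileids[a], fileids[b], score))
print(similarity)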
import os

directory = "D:/Eigene Dateien_rklein/z_Forschung/_Konferenzen/_79_ICFCA - Dresden - Concept Analysis/Data/"
input_directory = directory + "Input/_Product_Management/"
output_directory = directory + "1_POS/"
if not os.path.exists(output_directory):
    os.mkdir(output_directory)

# reading stuff
file_list = os.listdir(input_directory)
print(file_list)

# just for testing create a corpus reader
from nltk.corpus.reader import PlaintextCorpusReader
reader = PlaintextCorpusReader(input_directory, r'.*\.txt')
reader.fileids()
reader.raw()
reader.sents()
reader.words()

## default POS tagger from NLTK ##
import nltk
# import pprint
# sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

pos = "nltk"
path = output_directory + pos
if not os.path.exists(path):
    os.mkdir(path)

for i in range(len(file_list)):
    # posting = []
    output = path + "/" + str(file_list[i])
    jfile = open(output, "w")
    reader = PlaintextCorpusReader(input_directory, str(file_list[i]))
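    # --- Continuation sketch (assumption): the loop body above is cut off here; a
    # typical continuation with NLTK's default tagger might tag each sentence and
    # write word/tag pairs to the output file.
    for sent in reader.sents():
        tagged = nltk.pos_tag(sent)  # default NLTK POS tagger
        jfile.write(" ".join("%s/%s" % (w, t) for w, t in tagged) + "\n")
    jfile.close()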
english_stops = set(stopwords.words('english'))
english_stops_nopunct = {stopword.translate(table) for stopword in english_stops}

# Load the insect wordlist of stems
insect_words = WordListCorpusReader('.', ['wordlists/insect-wordstems.txt'])

# A list to hold the frequency data
freq_data = []

count = 1
# Read each file in turn
for file in files:
    text = reader.raw(file)
    print(f'{count}: TOKENISING {file}')
    # Tokenise and normalise to lowercase
    tokens = word_tokenize(text.lower())
    # Remove all punctuation marks
    tokens_nopunct = [token.translate(table) for token in tokens]
    # Remove all tokens that are only numbers (or punctuation marks if there were any left)
    words = [word for word in tokens_nopunct if word.isalpha()]
    # Remove stopwords from the tokens
    words_nostops = [
        word for word in words if word not in english_stops_nopunct