import re

from nltk import FreqDist, word_tokenize
from nltk.corpus import names
from nltk.corpus.reader import PlaintextCorpusReader


def training_data(paths=None, file_count=0):
    """
    Use the general pattern of a tag, <ENAMEX TYPE=".*?">.*?</ENAMEX>, to extract the bits of text
    containing the relevant information and group them into a list, then reduce every tag to a
    tuple made of the entity type and the entity name.
    :param paths: the paths to the folders containing the training data
                  (paths[0]: the tagged WSJ files, paths[1]: the name lists)
    :param file_count: the number of files to read
    :return: a dictionary of entities grouped by type and by first letter, plus the organization,
             name and location lookup dictionaries keyed by first letter
    """
    # extract training data from WSJ
    # pattern     : the general pattern of a tag
    # snd_pattern : the approximate pattern of the desired information inside the tag
    pattern = re.compile(r'<.*?TYPE=".*?">.*?</.*?>', re.ASCII)
    snd_pattern = re.compile(r'[>"].*?[<"]', re.ASCII)

    # the strings representing the tags extracted from the files
    text = PlaintextCorpusReader(paths[0], r'.*\.txt')
    data = []
    for fid in text.fileids():
        data = data + pattern.findall(text.raw(fileids=fid))

    # from every tag in the list find the two sub-strings that match snd_pattern
    # (the quoted type and the tagged name), strip the surrounding delimiters,
    # and use a set to eliminate redundancy
    raw_entities = list({
        (matches[0][1:-1], matches[1][1:-1].lower())
        for matches in (snd_pattern.findall(tag) for tag in data)
    })

    # extract data from the names folders
    del data
    data = PlaintextCorpusReader(paths[1], '.*')
    name_data = data.words('names.male') + data.words('names.female') + data.words('names.family')

    # extract the most common 350 organization tokens
    organization_words = [word_tokenize(o[1]) for o in raw_entities if o[0] == 'ORGANIZATION']
    organization_specific_tokens = []
    for wl in organization_words:
        organization_specific_tokens += wl
    organization_specific_tokens = [f[0] for f in FreqDist(organization_specific_tokens).most_common(350)]

    # extract the most common 350 location tokens
    location_words = [word_tokenize(o[1]) for o in raw_entities if o[0] == 'LOCATION']
    location_specific_tokens = []
    for wl in location_words:
        location_specific_tokens += wl
    location_specific_tokens = [f[0] for f in FreqDist(location_specific_tokens).most_common(350)]

    # put the names in a dictionary, keyed by first letter, for quicker access
    name_dict = {}
    for n in set(name_data + names.words()):
        if n.lower()[0] in name_dict:
            name_dict[n.lower()[0]] += [n.lower()]
        else:
            name_dict[n.lower()[0]] = [n.lower()]

    # put the location data in a dictionary for quicker access
    loc_dict = {}
    for l in location_specific_tokens[1:]:
        if l[0] in loc_dict:
            loc_dict[l[0]] += [l]
        else:
            loc_dict[l[0]] = [l]

    # put the organization data in a dictionary for quicker access
    org_dict = {}
    for o in organization_specific_tokens:
        if o[0] in org_dict:
            org_dict[o[0]] += [o]
        else:
            org_dict[o[0]] = [o]

    # group the raw entities by type ...
    entity_dict1 = {
        'PERSON': [p[1] for p in raw_entities if p[0] == 'PERSON'],
        'LOCATION': [l[1] for l in raw_entities if l[0] == 'LOCATION'],
        'ORGANIZATION': [o[1] for o in raw_entities if o[0] == 'ORGANIZATION']
    }

    # ... and, within each type, by the first letter of the entity name
    entity_dict2 = {}
    for l in ['PERSON', 'ORGANIZATION', 'LOCATION']:
        entity_dict2[l] = {}
        for e in entity_dict1[l]:
            if e[0] in entity_dict2[l]:
                entity_dict2[l][e[0]] += [e]
            else:
                entity_dict2[l][e[0]] = [e]

    return entity_dict2, org_dict, name_dict, loc_dict
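# Hedged usage sketch (not from the original source): the folder paths below are
# hypothetical placeholders. training_data() expects the tagged WSJ folder first and
# the names folder second, and returns the four lookup dictionaries built above.
if __name__ == '__main__':
    entity_dict, org_dict, name_dict, loc_dict = training_data(
        paths=['corpora/wsj_tagged/', 'corpora/names/'])  # hypothetical paths
    # entity_dict is keyed by entity type, then by the first letter of the entity name
    print(entity_dict['PERSON'].get('a', [])[:5])
    # org_dict / name_dict / loc_dict map a first letter to known tokens of that kind
    print(org_dict.get('c', [])[:5])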
import nltk
import pyodbc
from random import randint
from nltk.corpus.reader import PlaintextCorpusReader

# pull the FAQ messages out of SQL Server and tokenize them
cnxn = pyodbc.connect(r'DRIVER={SQL Server};SERVER=R0224576\RYANSQLSERVER;DATABASE=FAQ;UID=m097654;Trusted_Connection=yes')
cursor = cnxn.cursor()
data = cursor.execute('select msg from FACT').fetchall()
tokens = nltk.word_tokenize(str(data))
text = nltk.Text(tokens)

# keep only alphabetic tokens, lower-cased
nwords = [w.lower() for w in text if w.isalpha()]
text = nltk.Text(nwords)

# load the positive/negative word lists from a local corpus
corpus_root = r'C:\Python_workspace\FAQ Scripts\corpus'
newcorpus = PlaintextCorpusReader(corpus_root, '.*')
postxt = newcorpus.words('positive-words.txt')
negtxt = newcorpus.words('negative-words.txt')

# label every lexicon word with its polarity
neglist = []
poslist = []
for i in range(0, len(negtxt)):
    neglist.append('negative')
for i in range(0, len(postxt)):
    poslist.append('positive')
postagged = list(zip(postxt, poslist))
negtagged = list(zip(negtxt, neglist))
tagged = postagged + negtagged
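# Hedged sketch (an assumption, not part of the original script): one simple way to use
# the (word, polarity) pairs built above is a dictionary lookup that scores the tokenized
# FAQ messages; score_message is a hypothetical helper.
polarity = dict(tagged)

def score_message(message_tokens):
    # count positive vs. negative lexicon hits; unknown words contribute nothing
    pos_hits = sum(1 for w in message_tokens if polarity.get(w.lower()) == 'positive')
    neg_hits = sum(1 for w in message_tokens if polarity.get(w.lower()) == 'negative')
    return pos_hits - neg_hits

print(score_message(nwords))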
import os
import nltk

# 'directory' is assumed to be set earlier in the script
input_directory = directory + "Input/_Product_Management/"
output_directory = directory + "1_POS/"
if not os.path.exists(output_directory):
    os.mkdir(output_directory)

# read the input files
file_list = os.listdir(input_directory)
print(file_list)  # just for testing

# create a corpus reader
from nltk.corpus.reader import PlaintextCorpusReader
reader = PlaintextCorpusReader(input_directory, '.*.txt')
reader.fileids()
reader.raw()
reader.sents()
reader.words()

## default POS tagger from NLTK ##
# import pprint
# sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
pos = "nltk"
path = output_directory + pos
if not os.path.exists(path):
    os.mkdir(path)

for i in range(len(file_list)):
    # posting = []
    output = path + "/" + str(file_list[i])
    jfile = open(output, "w")
    reader = PlaintextCorpusReader(input_directory, str(file_list[i]))
    text = str(reader.raw())
    sents = nltk.sent_tokenize(text)
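    # Hedged continuation sketch (an assumption, not from the original file): the loop
    # presumably POS-tags each sentence with nltk.pos_tag and writes word/tag pairs into
    # jfile, matching the "1_POS/nltk" output directory created above.
    for sent in sents:
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
        jfile.write(" ".join("%s/%s" % (tok, tag) for tok, tag in tagged_tokens) + "\n")
    jfile.close()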
import nltk
from nltk import sent_tokenize
from nltk.corpus import gutenberg
from nltk.corpus.reader import PlaintextCorpusReader, TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer

## Corpus example ############################
sample = gutenberg.raw("bible-kjv.txt")
sent = sent_tokenize(sample)
for x in range(5):
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from text files ############
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg", r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', word_tokenizer=SpaceTokenizer())
print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print(
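## Hedged add-on example (an assumption, not in the original listing): PlaintextCorpusReader
## carries no categories, but a CategorizedPlaintextCorpusReader can attach them through a
## fileid pattern; the folder and file layout below are hypothetical.
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

cat_reader = CategorizedPlaintextCorpusReader(
    '/Users/atul/nltk_data/corpora/reviews',   # hypothetical folder
    r'(pos|neg)/.*\.txt',
    cat_pattern=r'(pos|neg)/.*')               # category taken from the subfolder name
print(cat_reader.categories())
print(cat_reader.fileids(categories='pos')[:3])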
        index = index + 1
        yield x, y


if __name__ == "__main__":
    directory = 'F:/Minhaz/GitHubRepo/News_Gen/Minhaz_Shahadat/Code/Bengali_Word2Vec_LSTM/'
    corpus_dir = directory + 'corpus/'
    examples = directory + 'examples.txt'
    vocabulary = directory + 'vocab.txt'

    # read every .txt file in the corpus with a Bengali-aware word tokenizer
    w_t = RegexpTokenizer("[\u0980-\u09FF']+")
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt', word_tokenizer=w_t)

    text_in_words = []
    files = corpus.fileids()
    for f in files:
        words_in_doc = corpus.words(f)
        text_in_words.append(words_in_doc)

    # replace every digit sequence with a <number> placeholder token
    text_in_words = [[re.sub(r'\d+', '<number>', word) for word in document]
                     for document in text_in_words]

    # build and save the sorted vocabulary
    words = []
    for doc in text_in_words:
        for word in doc:
            words.append(word)
    words = sorted(set(words))
    print_vocabulary(vocabulary, words)

    if not os.path.isdir(directory + 'checkpoints/'):
        os.makedirs(directory + 'checkpoints/')

    # Try different window sizes
    # (size/min_count/window follow the pre-4.0 gensim Word2Vec keyword names)
    vector_model = word2vec.Word2Vec(text_in_words, size=500, min_count=1, window=7)
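    # Hedged follow-up sketch (an assumption, not part of the original script): once trained,
    # the word vectors can be saved next to the checkpoints and queried for the nearest
    # neighbours of any vocabulary word; the model file name below is hypothetical.
    vector_model.save(directory + 'checkpoints/bengali_word2vec.model')
    print(vector_model.wv.most_similar(words[0], topn=10))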