def categorize_input_query(self, input_query):
    query_category = OrderedDict([])
    input_query = self.replace_punctuation_in_query_string(input_query)
    phrasal_not_tokenizer = RegexpTokenizer(r'![\"]+(\w+[-]*(\w+)*(\s*)(\w)*)*[\"]')
    word_not_tokenizer = RegexpTokenizer(r'!(\w+[-]*(\w)*)')
    not_queries_set = set(word_not_tokenizer.tokenize(input_query))
    not_queries_set = not_queries_set.union(set(phrasal_not_tokenizer.tokenize(input_query)))
    string_copy = input_query
    string_copy = re.sub(r"\".*?\"", "", string_copy)
    string_copy = re.sub(r"!.*?(\s|$)", "", string_copy)
    modified_not_words = []
    for words in not_queries_set:
        # removing the not words
        modified_not_words.append(words[1:])
    phrase_tokenizer = RegexpTokenizer(r'[\"]+(\w+[-]*(\w+)*(\s*)(\w)*)*[\"]')
    phrase_queries_set = set(phrase_tokenizer.tokenize(input_query))
    phrase_queries_set = phrase_queries_set.difference(set(modified_not_words))
    query_category["PHRASE"] = phrase_queries_set
    query_category["NOT"] = modified_not_words
    normal_words = string_copy.split()
    normal_word_set = set(normal_words)
    query_category["WORD"] = normal_word_set
    return query_category
class HashtagMatch:

    def __init__(self, name_matcher):
        from nltk.tokenize import RegexpTokenizer
        self._name_matcher = name_matcher
        self._hashtag_extract = RegexpTokenizer('(#[A-Za-z][A-Za-z0-9-_]+)')
        self._at_extract = RegexpTokenizer('(@[A-Za-z][A-Za-z0-9-_]+)')

    def extract_hashtag(self, text):
        return self._hashtag_extract.tokenize(text)

    def extract_at(self, text):
        return self._at_extract.tokenize(text)

    def match(self, text):
        segs = [' '.join(seg) for seg in self.segment(text[1:])]
        entities = map(self._name_matcher.exact_match, segs)
        return [e for e in entities if e]

    def segment(self, text):
        n = len(text) - 1
        count = 2 ** n
        sequences = map(lambda x: bin(x)[2:].zfill(n), range(count))
        segmentations = []
        for s in sequences:
            segmentation = []
            begin = 0
            for i in range(n):
                end = i + 1
                if s[i] == '1':
                    segmentation.append(''.join(text[begin:end]))
                    begin = end
            segmentation.append(''.join(text[begin:end + 1]))
            segmentations.append(segmentation)
        return segmentations
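# A small usage sketch for HashtagMatch.segment() above. name_matcher is whatever
# exact-match lookup object the surrounding project provides; None is passed here
# only to exercise the segmentation logic, which enumerates every binary split point.
hm = HashtagMatch(name_matcher=None)
print(hm.segment("abc"))
# -> [['abc'], ['ab', 'c'], ['a', 'bc'], ['a', 'b', 'c']]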
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                          'have', 'are', 'were', 'and', 'very', '.', ','])
    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
    # Try to tokenize so that abbreviations, monetary amounts, email
    # addresses, URLs are single tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer('([\w.@:/])+|\w+|\$[\d.]+')

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        self.text_words = set(lemmatize(token) for token in self.text_tokens)
        self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
class StringSpellchecksFinder(object):
    """
    Compares two strings, finding words that have been corrected
    """

    def __init__(self, similarity=0.7):
        self.tokenizer = RegexpTokenizer('[\w-]+')
        self.similarity = similarity

    def find(self, text_before, text_after):
        """
        Finds all spellchecks tuple(mistake, correction) in the given text
        """
        spellchecks = []
        text_before_tokens = [x.lower() for x in self.tokenizer.tokenize(text_before)]
        text_after_tokens = [x.lower() for x in self.tokenizer.tokenize(text_after)]
        diff_matching = SequenceMatcher(None, text_before_tokens, text_after_tokens)
        for difference in filter(lambda x: x[0] == 'replace', diff_matching.get_opcodes()):
            sequence_before = text_before_tokens[difference[1]:difference[2]]
            sequence_after = text_after_tokens[difference[3]:difference[4]]
            spellchecks += self.find_best_match(sequence_before, sequence_after)
        return spellchecks

    def find_best_match(self, sequence_before, sequence_after):
        """
        Finds the best matching of elements pairs that are most probable pairs
        """
        pairs = []
        possibilities = [[(element1, element2, SequenceMatcher(None, element1, element2).ratio())
                          for element2 in sequence_after]
                         for element1 in sequence_before]
        for possibility in possibilities:
            possibility = [p for p in possibility if p[2] >= self.similarity]
            if possibility:
                possibility.sort(key=lambda p: p[2], reverse=True)
                pairs.append((possibility[0][0], possibility[0][1]))
        return pairs
def parse_questions(self):
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    for questions_key in self.rawSamples:
        # Stem the question text
        question_text = self.rawSamples[questions_key][0]
        words_array = tokenizer.tokenize(question_text)
        question_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                question_text += (word + " ")
        self.rawSamples[questions_key][0] = question_text

        # Stem the topic names
        topics_text = self.rawSamples[questions_key][2]
        words_array = tokenizer.tokenize(topics_text)
        topics_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                topics_text += (word + " ")
        self.rawSamples[questions_key][2] = topics_text
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    @param rtepair: a L{RTEPair} from which features should be extracted
    @param stop: if C{True}, stopwords are thrown away.
    @type stop: C{bool}
    """
    self.stop = stop
    self.stopwords = set(
        ["a", "the", "it", "they", "of", "in", "to", "have", "is",
         "are", "were", "and", "very", ".", ","]
    )
    self.negwords = set(["no", "not", "never", "failed", "rejected", "denied"])
    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer("([A-Z]\.)+|\w+|\$[\d\.]+")

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        self.text_words = set([lemmatize(token) for token in self.text_tokens])
        self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
def getData():
    tokenizer = RegexpTokenizer(r'\w+')

    f = open("msr_paraphrase_train.txt", "r")
    f.readline()
    trainInput = []
    trainClass = [0] * 8160
    i = 0
    while i < 8160:
        tokens = f.readline().strip().split('\t')
        trainClass[i] = trainClass[i + 1] = int(tokens[0])
        i += 2
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix1 = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix2 = sentenceToMatrix(S)
        trainInput.append([np.transpose(Smatrix1 + Smatrix2)])
        trainInput.append([np.transpose(Smatrix2 + Smatrix1)])
    f.close()

    f = open("msr_paraphrase_test.txt", "r")
    f.readline()
    testInput = []
    testClass = [0] * 1725
    for i in range(0, 1725):
        tokens = f.readline().strip().split('\t')
        testClass[i] = int(tokens[0])
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix.extend(sentenceToMatrix(S))
        testInput.append([np.transpose(Smatrix)])
    f.close()

    return trainInput, trainClass, testInput, testClass
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    @param rtepair: a L{RTEPair} from which features should be extracted
    @param stop: if C{True}, stopwords are thrown away.
    @type stop: C{bool}
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to',
                          'have', 'is', 'are', 'were', 'and', 'very', '.', ','])
    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        self.text_words = set([lemmatize(token) for token in self.text_tokens])
        self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
def get_outbreak_countries(disease=all):
    tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')
    countries = []
    if disease == all:
        for location in Location.objects.all():
            country = tokenizer.tokenize(location.name)
            country = country[len(country) - 1]
            if country not in countries:
                countries.append(str(country))
    else:
        for tweet in Tweet.objects.filter(disease_type__contains=disease):
            if tweet.location:
                country = tokenizer.tokenize(tweet.location.name)
                country = country[len(country) - 1]
                country_disease_count = [str(country),
                                         len(Tweet.objects.filter(disease_type__contains=disease,
                                                                  location_string__contains=country)),
                                         disease]
                if country_disease_count not in countries:
                    countries.append(country_disease_count)
    return countries
def demo():
    # from nltk.corpus import brown
    # from nltk.probability import LidstoneProbDist, WittenBellProbDist
    # estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer("[\w']+")
    lm = NgramcModel(5)
    print(lm)

    sent = "Like a bridge over troubled water, I will lay it down."
    print(sent)
    words = tokenizer.tokenize(sent)
    print("Entropy: ", lm.entropy(words))

    sent = "over twenty year and he"
    print(sent)
    words = tokenizer.tokenize(sent)
    print("Entropy: ", lm.entropy(words))

    sent = "over twenty years and he"
    print(sent)
    words = tokenizer.tokenize(sent)
    print("Entropy: ", lm.entropy(words))

    print(lm.getBetter(["men", "are", "imporant", "for", "the"],
                       ["men", "are", "important", "for", "the"]))
def stopWordRemoval():
    f = open('repos', 'r')
    strn = f.read()
    lst = strn.split('\n')
    i = 0
    while i < (len(lst) - 1):
        name = lst[i].split("/")
        dummyFile = 'filteredData/' + name[1] + '/dummy.txt'
        dr = os.path.dirname(dummyFile)
        if not os.path.exists(dr):
            os.makedirs(dr)

        ft = open('data/' + name[1] + '/title.txt')
        st = ft.read().lower()
        fd = open('data/' + name[1] + '/description.txt')
        sd = fd.read().lower()
        fc = open('data/' + name[1] + '/content.txt')
        sc = fc.read().lower()

        tokenizer = RegexpTokenizer(r'\w+')
        wordArrTitle = tokenizer.tokenize(st)
        wordArrDesc = tokenizer.tokenize(sd)
        wordArrData = tokenizer.tokenize(sc)

        filteredWordsTitle = [w for w in wordArrTitle if not w in stopwords.words('english')]
        filteredWordsDesc = [w for w in wordArrDesc if not w in stopwords.words('english')]
        filteredWordsData = [w for w in wordArrData if not w in stopwords.words('english')]

        wordnet_lem = WordNetLemmatizer()

        ftf = open('filteredData/' + name[1] + '/title.lst', 'w')
        for w in filteredWordsTitle:
            # print(w)
            ftf.write(wordnet_lem.lemmatize(w) + '\n')

        fdf = open('filteredData/' + name[1] + '/description.lst', 'w')
        for w in filteredWordsDesc:
            # print(w)
            fdf.write(wordnet_lem.lemmatize(w) + '\n')

        fcf = open('filteredData/' + name[1] + '/content.lst', 'w')
        for w in filteredWordsData:
            print(w + '\n')
            fcf.write(wordnet_lem.lemmatize(w) + '\n')

        i = i + 2
def average_sentence_length(text):
    tokenizer = RegexpTokenizer(r' ([A-Z][^\.!?]*[\.!?])')
    sentences = tokenizer.tokenize(text)
    s = np.zeros(len(sentences))
    for inds, sentence in enumerate(sentences):
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(sentence)
        s[inds] = len(tokens)
    return s, np.mean(s), np.std(s)
def _generate_answer_question_pair(self, question, article, X_train_words,
                                   Y_train_words, max_seqlen, max_queslen):
    tokenizer = RegexpTokenizer(r'\w+')
    answer = re.split(r'\t+', question)[1]
    question_txt = tokenizer.tokenize(question)[1:-2]
    ref = int(re.split(r'\t+', question)[-1]) - 1
    seq = tokenizer.tokenize(article[ref])[1:] + question_txt
    if len(seq) > max_seqlen:
        max_seqlen = len(seq)
    X_train_words.append(seq)
    Y_train_words.append(answer)
    return max_seqlen, max_queslen
def calculate_freqs(data, toExclude):
    # lemmatizer = WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words("english")
    sents = nltk.tokenize.sent_tokenize(data)
    tokenizer = RegexpTokenizer(r"\w+\'?\w+")
    # tagged_sentences = [w for s in sents for w in nltk.pos_tag(word_tokenize(s))]
    # words = [lemmatizer.lemmatize(w[0].lower(), get_wordnet_pos(w[1])) for w in tagged_sentences]
    #          if w.lower() not in stopwords]
    if toExclude:
        words = [w for s in sents for w in tokenizer.tokenize(s) if w.lower() not in stopwords]
    else:
        words = [w for s in sents for w in tokenizer.tokenize(s)]
    return words
def parse_document(filename, query):
    myfile = codecs.open(filename, "r", "utf-8")
    raw = myfile.read()
    sentences = sent_tokenize(raw)
    tokenizer = RegexpTokenizer(r'\w+')
    # tokenizer.tokenize(sentences[0])
    stop = stopwords.words('english')
    sents = [[token.lower() for token in tokenizer.tokenize(sentence)
              if not (token in stop or token.isdigit())]
             for sentence in sentences]
    query_t = [token for token in tokenizer.tokenize(query)
               if not (token in stop or token.isdigit())]
    cloud = " ".join(list(itertools.chain(*sents)))
    return cloud, query_t
def clean_data(data):
    punctuations = list(string.punctuation)
    data = data.replace("\n", " ").replace(":", " ").replace(",", "").replace(".", "").replace("'s", "").replace("?", "")
    stemmer = PorterStemmer()
    stemmer2 = SnowballStemmer('english')
    tokenizer = RegexpTokenizer(r'\w+')
    tokenizer.tokenize(data)  # note: result is discarded; the splitting below uses mysplit()
    ndata1 = list(mysplit(data))
    ndata1 = [[stemmer.stem(xi) for xi in y.split(" ")] for y in ndata1]
    ndata1 = [[stemmer2.stem(xi) for xi in y] for y in ndata1]
    ndata = [x for x in ndata1 if not x == ":"]
    ndata = [list(filter(None, x)) for x in ndata]
    ndata = [x for x in ndata if x != []]
    return ndata
def map(self):
    mc = MongoClient('ec2-52-0-148-244.compute-1.amazonaws.com', 27017)
    dbmc = mc.genid
    idoc = dbmc.gentable.find_one_and_update(filter={}, update={"$inc": {"score": 1}}, upsert=True)
    k = Key(self.bucket)
    y = stopwords.words('english')
    i = 1
    strx = str(int(idoc['score']))
    strz = None
    filestring = ""
    for line in sys.stdin:
        line = unicode(line, "utf-8", "ignore")
        pattern = re.compile(r'\b(' + r'|'.join(y) + r')\b\s*')
        line = pattern.sub('', line)
        tokenizer = RegexpTokenizer(r'\w+')
        words = tokenizer.tokenize(line)
        strz = strx + 'a' + str(i)
        k.key = strz
        filestring = line + '\n'
        k.set_contents_from_string(filestring)
        for word in words:
            word = word.encode(encoding='UTF-8', errors='ignore')
            print '%s\t%s' % (word.strip(), strz)
        i += 1
def generate_stemmed_tokens(page_content):
    lowered = page_content.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(lowered)
    stems = create_stems(tokens)
    return stems
def createLDAModel(texts, n_topics, n_passes):
    """Generates a LDA model from an array of texts"""
    tokenizer = RegexpTokenizer(r'\w+')
    # Create EN stop words list
    en_stop = get_stop_words('en')
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    texts_ = []
    # loop through document list
    for i in texts:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts_.append(stemmed_tokens)

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts_)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts_]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics,
                                               id2word=dictionary, passes=n_passes)
    return ldamodel
def tokenize(self, doc):
    '''
    use NLTK RegexpTokenizer
    '''
    tokenizer = RegexpTokenizer("\w{3,}")
    return [self.stemmer.stem(x) for x in tokenizer.tokenize(doc)]
def lda(data):
    data = get_only_text(data)
    only_tweet = data
    length = len(only_tweet)
    length = min(20, length)
    for i in range(0, length):
        print(i)
        print(only_tweet[i])
    # early return left in by the author: the LDA modelling below is not executed
    return

    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()

    length = len(only_tweet)
    length = min(20, length)
    total_texts = []
    for i in range(0, length):
        print(only_tweet[i])
        print()
        to_lower = only_tweet[i].lower()
        tokens = tokenizer.tokenize(to_lower)
        stopped_tokens = [k for k in tokens if not k in en_stop]
        texts = [p_stemmer.stem(k) for k in stopped_tokens]
        total_texts.append(texts)

    dictionary = corpora.Dictionary(total_texts)
    corpus = [dictionary.doc2bow(text) for text in total_texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
    result = ldamodel.print_topics(num_topics=2, num_words=1)
    for i in result:
        print(i)
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()
    # create English stop words list
    en_stop = get_stop_words('en')
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens

    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
def run(self, data):
    results = []
    tokenizer = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)
    for corpus in data:
        corpus.contents = " ".join(tokenizer.tokenize(corpus.contents))
        results.append(corpus)
    return results
def trainMarkovChain(self, n=1):
    self.ngram_degree = n
    self.markov_model = defaultdict(lambda: defaultdict(int))

    sentences = self.corpus_sentences
    if sentences is None:
        sentences = self.sentenceTokenizeCorpus()

    print("Training markov model on corpus.")
    word_tokenizer = RegexpTokenizer(r"\w+")

    for sentence in sentences:
        words = word_tokenizer.tokenize(sentence)
        last_word_list = ["#"] * n
        for word in words:
            last_token = " ".join(last_word_list)
            self.markov_model[last_token][word] += 1
            last_word_list.append(word)
            last_word_list = last_word_list[1:]
        last_token = " ".join(last_word_list)
        self.markov_model[last_token]["#"] += 1
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns the cleaned text as a single space-joined string
    '''
    if pd.isnull(text):
        return []
    # Tokenize
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    # Removing any stopwords
    text_processed = [word.lower() for word in text_processed
                      if word.lower() not in stopwords.words('english')]
    # Stemming
    porterStemmer = PorterStemmer()
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    try:
        text_processed.remove('b')
    except ValueError:
        pass
    return " ".join(text_processed)
def parse_raw_data(self, new_art):
    self.startClass = default_timer()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(new_art.body)
    stemmer = LancasterStemmer()
    article_dic = new_art.words
    global_dic = self.raw_dictionary
    for word in tokens:
        word = word.lower()
        if (False == self.is_stop_word(word) and word.isnumeric() == False):
            s_word = stemmer.stem(word)
            # s_word = word
            ## it is not a stop word, check if the word
            ## is already part of the article dictionary.
            ## if yes, increment the count else add it.
            ## If you are adding check if it is part of
            ## the big corpus, if yes increment the count
            ## of number of articles with that word.
            self.globalWordCount += 1
            new_art.doc_len = new_art.doc_len + 1
            if (s_word in article_dic):
                article_dic[s_word].wrd_count += 1
                global_dic[s_word].wrd_count += 1
            else:
                article_dic[s_word] = local_word_attributes(1)
                if (s_word in global_dic):
                    global_dic[s_word].art_count += 1
                    global_dic[s_word].wrd_count += 1
                else:
                    global_dic[s_word] = global_word_attributes(1, 1, 1, 0)
class Categorizer:

    def __init__(self, pathToModel, features):
        # initialize categorizer with model.
        self.tokenizer = RegexpTokenizer('[A-Za-z]\w+')
        fp = open(pathToModel, "rb")
        fpf = open(features, "rb")
        model = pickle.load(fp)
        self.features = pickle.load(fpf)
        fp.close()
        fpf.close()
        self.classifierNB = model

    def classify(self, text):
        featureSet = self.naiveFeatures(text)
        # print(featureSet)
        labels = self.classifierNB.classify(featureSet)
        labelsProbDist = self.classifierNB.prob_classify(featureSet)
        return labels

    def naiveFeatures(self, vid, train=False):
        vidTokens = self.tokenizer.tokenize(vid.lower().strip())
        vid = set(vidTokens)
        # print(vid)
        features = {}
        if train:
            for word in self.features:
                features[word] = (word in vid)
        else:
            for word in vid:
                features[word] = (word in self.features)
        return features
def write_summary(texts, ofile):
    word_tokenizer = RegexpTokenizer(r"\w+")
    with codecs.open(ofile, u"w", u"utf-8") as f:
        for text in texts:
            f.write(u" ".join([w.lower() for w in word_tokenizer.tokenize(text)]))
            f.write(u"\n")
        f.flush()
def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword = [
        'дом', 'город', "дорог", "час", "ноч", "слов", "утр", "стран",
        "пут", "путешеств", "мест", 'нов', "друз", "добр"
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens if not i in stop_w and r.match(i) and not i in badword]
    return tokens
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in stopwords.words('english')]
    # filtered_words = filter(lambda token: token not in stopwords.words('english'))
    return " ".join(filtered_words)
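# A minimal usage sketch of the preprocess() helper above (assumes the NLTK
# 'stopwords' corpus has been downloaded, e.g. via nltk.download('stopwords')):
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

if __name__ == "__main__":
    cleaned = preprocess("The quick brown fox jumps over the lazy dog")
    print(cleaned)  # -> "quick brown fox jumps lazy dog"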
#!/usr/bin/env python
import sys
import pickle

from nltk.tokenize import RegexpTokenizer

# xzcat dev-0/in.tsv.xz | python3 ./predict.py > dev-0/out.tsv

weights, word_to_index_mapping = pickle.load(open('model.pkl', 'rb'))
tokenizer = RegexpTokenizer(r'\b[^\d\W]+\b')

for line in sys.stdin:
    document = line.rstrip()
    terms = tokenizer.tokenize(document)
    y_p = weights[0]
    for term in terms:
        if term in word_to_index_mapping:
            y_p += weights[word_to_index_mapping[term]]
    print(y_p)
def get_answer(question, story): """ :param question: dict :param story: dict :return: str question is a dictionary with keys: dep -- A list of dependency graphs for the question sentence. par -- A list of constituency parses for the question sentence. text -- The raw text of story. sid -- The story id. difficulty -- easy, medium, or hard type -- whether you need to use the 'sch' or 'story' versions of the . qid -- The id of the question. story is a dictionary with keys: story_dep -- list of dependency graphs for each sentence of the story version. sch_dep -- list of dependency graphs for each sentence of the sch version. sch_par -- list of constituency parses for each sentence of the sch version. story_par -- list of constituency parses for each sentence of the story version. sch -- the raw text for the sch version. text -- the raw text for the story version. sid -- the story id """ ### Your Code Goes Here ### # Our tools stemmer = SnowballStemmer("english") chunker = nltk.RegexpParser(GRAMMAR) lmtzr = WordNetLemmatizer() driver = QABase() # question["qid"] returns the form: "fables-04-7" q = driver.get_question(question["qid"]) current_story = driver.get_story(q["sid"]) ############################################# # if question["qid"] == 'blogs-03-1': # print(question["text"]) # print(sent_tokenized_text[0]) # print("++++++++++++++++++++++++++++++++++++++++++++++") ############################################ stopwords = set(nltk.corpus.stopwords.words("english")) if (question["difficulty"] == 'Easy'): if question["type"] != 'Story': sentences = get_sentences(current_story["sch"]) text = story["sch"] text = nltk.sent_tokenize(text) else: sentences = get_sentences(current_story["text"]) text = story["text"] text = nltk.sent_tokenize(text) Q = nltk.word_tokenize(question["text"].lower()) # print(Q) all_stemmed_sentences = [] for sent in sentences: temp_sent = [] for w, pos in sent: temp_sent.append((stemmer.stem(w), pos)) all_stemmed_sentences.append(temp_sent) stop_words = set(nltk.corpus.stopwords.words("english")) qbow = get_bow(get_sentences(question["text"])[0], stopwords) stemmed_qbow = [] for w in qbow: stemmed_qbow.append(stemmer.stem(w)) stemmed_qbow = set(stemmed_qbow) best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences, stop_words, question) # print(question["qid"], best_idx) # tokenize questions, also removing punctuations to extract keywords tokenizer = RegexpTokenizer(r'\w+') tokenized_question_text = tokenizer.tokenize(question["text"]) tagged_tokenized_question_text = nltk.pos_tag(tokenized_question_text) # remove stopwords tagged_keywords_list = [] for word, tag in tagged_tokenized_question_text: if word not in stopwords: tagged_keywords_list.append((word, tag)) # lemmatize keywords lemmatized_keywords_list = [] for keyword, tag in tagged_keywords_list: lemmatized_keywords_list.append(stemmer.stem(keyword)) ##################################################### # if question["qid"] == 'fables-04-6': # print("text:", text) # print("best index:", best_idx) # print("qid:", question["qid"]) # print(text[best_idx]) # print("==============================") # print(get_sentences("".join(text))) ##################################################### best_sent = get_sentences(text[best_idx]) # Find the sentences that have all of our keywords in them # Last time, 2nd arg is sentences = get_sentences(text) which returns tuple of each word target_sentences = find_sentences(lemmatized_keywords_list, best_sent) # Extract the candidate locations from these sentences 
candidates_forest = find_candidates(target_sentences, chunker, question["text"]) if len(candidates_forest) == 0: answer = doBaseline(question, story) else: possible_answers_list = [] # locations is a list of trees for candidate in candidates_forest: # candidate.draw() possible_answers_list.append(" ".join([token[0] for token in candidate.leaves()])) answer = " ".join(possible_answers_list) ########################################### # currently, possible_answer contains the actual needed answer, # plus some garbage words around it from chunking, # we might be able to filter this out SOMEHOW # possible_answer is a list of strings ########################################### elif question["difficulty"] == 'Medium': if question["type"] != 'Story': sentences = get_sentences(current_story["sch"]) else: sentences = get_sentences(current_story["text"]) Q = nltk.word_tokenize(question["text"].lower()) # print(Q) all_stemmed_sentences = [] for sent in sentences: temp_sent = [] for w, pos in sent: temp_sent.append((stemmer.stem(w), pos)) all_stemmed_sentences.append(temp_sent) stop_words = set(nltk.corpus.stopwords.words("english")) qbow = get_bow(get_sentences(question["text"])[0], stopwords) stemmed_qbow = [] for w in qbow: stemmed_qbow.append(stemmer.stem(w)) stemmed_qbow = set(stemmed_qbow) best_idx = best_overlap_index(stemmed_qbow, all_stemmed_sentences, stop_words, question) # print(question["qid"], best_idx) if question["type"] != 'Story': tree = current_story["sch_par"][best_idx] else: tree = current_story["story_par"][best_idx] ############################################# # if question["qid"] == 'blogs-03-13': # print(Q) # print(tree) # print("++++++++++++++++++++++++++++++++++++++++++++++") ############################################ # print(tree) # Create our pattern ######################################### # MAKE PATTERN FIT FOR TYPE OF QUESTION # ######################################### # print(Q[0]) if Q[0] == 'where' or Q[0] == 'when': pattern = nltk.ParentedTree.fromstring("(VP (*) (PP))") elif Q[0] == 'who': pattern = nltk.ParentedTree.fromstring("(NP)") elif Q[0] == 'what': pattern = nltk.ParentedTree.fromstring("(NP)") elif Q[0] == 'why': pattern = nltk.ParentedTree.fromstring("(SBAR)") elif Q[0] == 'how': pattern = nltk.ParentedTree.fromstring("(RB)") # don't know how to deal with 'did' questions elif Q[0] == 'did': pattern = nltk.ParentedTree.fromstring("(S)") subtree1 = pattern_matcher(pattern, tree) ############################################ # if question["qid"] == 'blogs-03-13': # print("subtree1") # print(subtree1) ############################################ if subtree1 == None: ####################################### answer = doBaseline(question, story) # answer = "doBaseline" ####################################### else: # create a new pattern to match a smaller subset of subtrees if Q[0] == 'where' or Q[0] == 'when': pattern = nltk.ParentedTree.fromstring("(VP)") elif Q[0] == 'who': pattern = nltk.ParentedTree.fromstring("(NP)") elif Q[0] == 'what': pattern = nltk.ParentedTree.fromstring("(NP)") elif Q[0] == 'why': pattern = nltk.ParentedTree.fromstring("(SBAR (IN) (S))") elif Q[0] == 'how': pattern = nltk.ParentedTree.fromstring("(RB)") # don't know how to deal with 'did' questions elif Q[0] == 'did': pattern = nltk.ParentedTree.fromstring("(S)") # Find and make the answer # print(subtree) subtree2 = pattern_matcher(pattern, subtree1) if subtree2 == None: ####################################### answer = doBaseline(question, story) # answer = "doBaseline" 
####################################### else: answer = " ".join(subtree2.leaves()) ############################################ # if question["qid"] == 'mc500.train.18.18': # print("subtree2") # print(subtree2) ############################################ # cheat for dealing with 'did' questions if Q[0] == 'did': answer = "yes" elif question["difficulty"] == 'Hard': answer = "h" elif question["difficulty"] == 'Discourse': answer = "h" else: ######################################### answer = doBaseline(question, story) # answer = "doBaseline" ######################################### ### End of Your Code ### return answer
def tfidf(article, articles): txt_file = open(r"C:\Users\Bratislav\Desktop\petnica projekat\data\train set.txt", "r+", encoding= "utf-8-sig") article = txt_file.readlines()[7] article = article[article.find(" ")+11:] tokenizer = RegexpTokenizer(r'\w+') corpus = nltk.corpus.stopwords.words('english') stemmer = PorterStemmer() sentences = nltk.sent_tokenize(article) # print(words) lemmatizer = WordNetLemmatizer() # tagger = WordNetTagger() syns = [] no_lemmas = [] lemmas = [] a = 0 b = 0 c = 0 pos = 0 neg = 0 obj = 0 for sentence in sentences: # print(sentence) pos1, neg1, obj1 = 0, 0, 0 syns = [] words = tokenizer.tokenize(sentence) no_stop = no_stopwords_func(words) # print(no_stop) # print(ws) tokens = nltk.pos_tag(no_stop) # print(token) for token in tokens: lemma = lemmatizer.lemmatize(token[0]) stem = stemmer.stem(lemma) stemmed # if obj1 == max(pos1, neg1, obj1): # print("a") # if obj1 == pos1: # pos += 1 # print("p") # elif obj1 == neg1: # neg += 1 # print("n") # else: # obj += 1 # elif pos1 == max(pos1, neg1, obj1): # print("b") # if pos1 == neg1: # obj += 1 # print("o") # else: # pos +=1 # elif neg1 == max(pos1, neg1, obj1): # print("c") # if neg1 == pos1: # obj += 1 # print("o") # else: # neg += 1 # print("n") # print(pos, neg, obj) # for synset in syns: # pos, neg, obj = 0, 0, 0 # pos += synset.pos_score() # neg += synset.neg_score() # obj += synset.obj_score() # print(obj, pos, neg) # print(pos, neg, obj) # # print(syns) # print(no_lemmas) # print(a, b, c) # # print(swn.senti_synset("Bad.n.01"))
def essay_grader(f_name, data, topic): wrong = 0 sent_count = 0 sentences = sent_tokenize(data) result = "" for sentence in sentences: sent_count += 1 from nltk.tokenize import word_tokenize from nltk.tokenize import RegexpTokenizer spelling_error = 0 # Regexptokenizer is used for tokenizing effectively tokenizer = RegexpTokenizer('[A-Za-z0-9\']+') tk1 = tokenizer.tokenize(data) result = "" for token in tk1: result += "[" + token + "] " ########################### b. Spelling mistakes ################################## from nltk import pos_tag # Making use of two dictionaries using pyenchant to compare spellings d_US = enchant.Dict("en_US") d_UK = enchant.Dict("en_UK") tagged_tokens = pos_tag(tk1) result = "" spelling_error = 0 serror = [] # This is done to make sure that the proper noun is not considered as a spelling error crosscheck = ['NNP', 'NNPS'] # Checking the spelling error for each word in the essay for token in tagged_tokens: result += '[' + token[0] + '/' + token[1] + ']' flag = 0 for ind, tag_val in enumerate(crosscheck): if (token[1] == crosscheck[ind]): flag = 1 if flag != 1: val_US = d_US.check(token[0]) val_UK = d_UK.check(token[0]) if (val_US == False and val_UK == False): serror.append(token[0]) spelling_error += 1 ########################## c.(i) Subject verb agreement ######################## # Here we check for the most common type of mistake which is the mistake of this and these gramm_mist = 0 for k in sentences: tokens = word_tokenize(k) for i, j in enumerate( tokens): #for a given set of tokens in a given sentence if j == 'this': list_temp = nltk.tag.pos_tag([tokens[i + 1]]) for tag in list_temp: if (tag[1] == 'NNS'): gramm_mist += 1 if j == 'these': list_temp1 = nltk.tag.pos_tag([tokens[i + 1]]) for tag in list_temp1: if (tag[1] == 'NN'): gramm_mist += 1 tokens = word_tokenize(data) result = "" for token in tokens: result += "[" + token + "] " from nltk import pos_tag tagged_tokens = pos_tag(tokens) result = "" # In this case we check for subject verb agreement using different pairs of tags # We are detecting whether the user has entered comma or not by using pairs of tags that cannot come together without a comma # We also found that two determiners cannot be together crosscheck = [ 'NNP VBP', 'MD VBN', 'DT DT', 'DT VBP', 'DT VB', 'DT PRP', 'MD VBD', 'JJS PRP' ] previousTag = '.' previousWord = '' pairs_mtake = 0 for token in tagged_tokens: result += '[' + token[0] + '/' + token[1] + ']' previousTag_tag = previousTag + ' ' + token[1] previousTag = token[1] previousWord_word = previousWord + ' ' + token[0] previousWord = token[0] # The bigram pos pairs are checked with the pairs in the crosschecked list flag = 0 for ind, tag_val in enumerate(crosscheck): if (previousTag_tag == crosscheck[ind]): flag = 1 pairs_mtake += 1 pos_gramm_mistakes = pairs_mtake + gramm_mist pos_mist.append(pos_gramm_mistakes) ################# c.(ii) - Detecting missing verbs and tense mistakes ########### verb_mist = 0 #individual sentences in the list for k in sentences: doc = nlp(k) str = "" #tokenize individual senteneces for token in doc: str = str + token.pos_ + " " if str.find("VERB") == -1: verb_mist += 1 # In this case we check tense mistakes and missing verbs by making doing a crosscheck of pairs crosscheck = ['NNP VBP', 'NNS VBZ', 'VBZ NNP', 'VBP NNP'] previousTag = '.' 
previousWord = '' tense_mtake = 0 # Pairs of tokens are checked each time by making use of crosscheck array in order to find mistakes in pairs for token in tagged_tokens: result += '[' + token[0] + '/' + token[1] + ']' previousTag_tag = previousTag + ' ' + token[1] previousTag = token[1] previousWord_word = previousWord + ' ' + token[0] previousWord = token[0] flag = 0 for ind, tag_val in enumerate(crosscheck): if (previousTag_tag == crosscheck[ind]): flag = 1 tense_mtake += 1 verb_tensemist = verb_mist + tense_mtake vb_mist.append(verb_tensemist) ########################## c.(iii) Sentence Formation ###################################### error_frag = 0 for k in sentences: output = nlp1.annotate(k, properties={ 'annotators': 'tokenize,ssplit,pos,depparse,parse', 'outputFormat': 'json' }) sbar_flag = 0 s_flag = 0 if (count <= 83): for i, p in enumerate([ s['parse'] for s in output['sentences'] ]): #Returns a parse tree for a particular sentence index_s = p.find('(S') if (p[index_s + 2] == '\n' or p[index_s + 2] == ' '): s_flag = 1 index_sbar = p.find('SBAR') if (p[index_sbar + 4] == " " or p[index_sbar + 4] == "\n"): sbar_flag = 1 if "FRAG" in p: if (sbar_flag == 1 and s_flag == 0): #print(p) error_frag += 1 ############################ d.(i) Is the essay coherent? ####################################### tokenizer = RegexpTokenizer('[A-Za-z0-9\']+') sentences = sent_tokenize(data) prev_sent = "" for ind, s in enumerate(sentences): if (ind != 0): tk1 = tokenizer.tokenize(s) tagged_tokens = pos_tag(tk1) for token in tagged_tokens: if (token[1] == 'PRP' or token[1] == 'PRP$'): #Looking for pronouns in 3rd person if (token[0].casefold() not in list1 and token[0] not in f_list1): f_list1.append(token[0]) prev_sent = sentences[ind - 1] utterances = s #Current sentence context = prev_sent #Previous sentence for conflict resolution clusters = coref.one_shot_coref(utterances, context) most = coref.get_most_representative( ) #Generates links between context and utterance most1 = repr(most) for x in f_list1: if x not in most1: #print("%s\n" %context) #print("%s\n" %utterances) #print("%s\n\n\n" %x) wrong += 1 break f_list1.clear() c_list.append(wrong) coref_mist = wrong wrong = 0 ############################ d.(ii) Does the essay stay on topic ####################################### tk1 = tokenize(topic) tagged_tokens = pos_tag(tk1) new_top = '' #Here I check for all the noun occurences in the essay for token in tagged_tokens: if (token[1] == 'NNS' or token[1] == 'NN' or token[1] == 'NNP' or token[1] == 'NNPS'): new_top = new_top + token[0] + " " nouns = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('n')} dic = {} #I use wordnet to find the main words in the topic which are used later to find similar words in essay for i, k in enumerate(topic.split()): synonyms = [] for syn in wordnet.synsets(k): for l in syn.lemmas(): synonyms.append(l.name()) dic.update({i: set(synonyms)}) word_set = set() #Now using the synonyms of the words in the topics I find the match in the essay for k in data.split(' '): #Each of the words in essay for val in dic: if (k in dic[val] and k in nouns): word_set.add(k) for i, k in enumerate(new_top.split()): synonyms = [] for syn in wordnet.synsets(k): for l in syn.lemmas(): synonyms.append(l.name()) dic.update({i: set(synonyms)}) for k in data.split(' '): #Each of the words in essay for j in new_top.split(' '): if (k == j): word_set.add(k) new_set = set() for k in word_set: if (k != ''): new_set.add(lemmatizer.lemmatize(k)) #print(len(new_set)) #The length gives the 
number of words that are related to the topic ess_coher = len(new_set) coher.append(len(new_set)) ################################################################################ scores(f_name, sent_count, spelling_error, serror, pos_gramm_mistakes, verb_tensemist, error_frag, ess_coher, coref_mist)
visited = []
# import ntpath
import os
import glob

os.chdir('stateoftheunionaddresses')
for fil in glob.glob("*.txt"):
    v.append(fil)

corpus_root = ('stateoftheunionaddresses')
for stateoftheunionaddresses in os.listdir(corpus_root):
    file = open(os.path.join(corpus_root, stateoftheunionaddresses), "r")
    doc = file.read()
    b = sorted((stopwords.words('english')))
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    tokens = tokenizer.tokenize(doc)
    tokens = [w for w in tokens if not w.lower() in b]
    # print(v[f])
    for p in range(len(tokens)):
        stemmer = PorterStemmer()
        tokens[p] = stemmer.stem(tokens[p])
    for j in range(len(tokens)):
        fx.append(tokens[j])
    jk.append(len(tokens))
    count = Counter(tokens)
def loadData(filename): global uniqueWords, wordcodes, wordcounts override = True if override: #... for debugging purposes, reloading input file and tokenizing is quite slow #... >> simply reload the completed objects. Instantaneous. fullrec = pickle.load(open("w2v_fullrec.p", "rb")) wordcodes = pickle.load(open("w2v_wordcodes.p", "rb")) uniqueWords = pickle.load(open("w2v_uniqueWords.p", "rb")) wordcounts = pickle.load(open("w2v_wordcounts.p", "rb")) logging.debug("len_unk: {} code: {}".format(wordcounts['UNK'], wordcodes['UNK'])) return fullrec #... load in the unlabeled data file. You can load in a subset for debugging purposes. handle = open(filename, "r", encoding="utf8") fullconts = handle.read().split("\n") fullconts = [ entry.split("\t")[1].replace("<br />", "") for entry in fullconts[1:(len(fullconts) - 1)] ] #... apply simple tokenization (whitespace and lowercase) fullconts = [" ".join(fullconts).lower()] print("Generating token stream...") #... (TASK) populate fullrec as one-dimension array of all tokens in the order they appear. #... ignore stopwords in this process #... for simplicity, you may use nltk.word_tokenize() to split fullconts. #... keep track of the frequency counts of tokens in origcounts. stop_words = set(stopwords.words('english')) tokenizer = RegexpTokenizer(r'\w+') words = tokenizer.tokenize(fullconts[0]) fullrec = list(filter(lambda x: x not in stop_words, words)) logging.debug("fullrec: {}".format(fullrec[:100])) min_count = 50 origcounts = Counter(fullrec) print("Performing minimum thresholding..") #... (TASK) populate array fullrec_filtered to include terms as-is that appeared at least min_count times #... replace other terms with <UNK> token. #... update frequency count of each token in dict wordcounts where: wordcounts[token] = freq(token) fullrec_filtered = list( map(lambda x: x if origcounts[x] >= min_count else 'UNK', fullrec)) logging.debug("fullrec_filtered: {}".format(fullrec_filtered[:100])) #... after filling in fullrec_filtered, replace the original fullrec with this one. fullrec = fullrec_filtered wordcounts = Counter(fullrec) print("Producing one-hot indicies") #... (TASK) sort the unique tokens into array uniqueWords #... produce their one-hot indices in dict wordcodes where wordcodes[token] = onehot_index(token) #... replace all word tokens in fullrec with their corresponding one-hot indices. uniqueWords = list(set(fullrec_filtered)) wordcodes = {w: i for i, w in enumerate(uniqueWords)} #logging.debug("wordcodes: {}".format(wordcodes)) logging.debug("len_unk: {} code: {}".format(wordcounts['UNK'], wordcodes['UNK'])) fullrec = list(map(lambda x: wordcodes[x], fullrec)) #logging.debug("fullrec to indices: {}".format(fullrec)) #... close input file handle handle.close() #... store these objects for later. #... for debugging, don't keep re-tokenizing same data in same way. #... just reload the already-processed input data with pickles. #... NOTE: you have to reload data from scratch if you change the min_count, tokenization or number of input rows pickle.dump(fullrec, open("w2v_fullrec.p", "wb+")) pickle.dump(wordcodes, open("w2v_wordcodes.p", "wb+")) pickle.dump(uniqueWords, open("w2v_uniqueWords.p", "wb+")) pickle.dump(dict(wordcounts), open("w2v_wordcounts.p", "wb+")) #... output fullrec should be sequence of tokens, each represented as their one-hot index from wordcodes. return fullrec
def getfeature(self, tweet): text = tweet["text"] feature = [] words = nltk.word_tokenize(text) tokenizer = RegexpTokenizer(r'\w+') word_nopunc = tokenizer.tokenize(text) word_nopunc = [i for i in word_nopunc if i not in stop] # top 20 features using word2vec for i in word_nopunc: if i in model.wv: feat_list = model.wv[i].tolist() feature.extend(feat_list[:20]) #append 0 if no feature found if (len(feature) < 100): for i in range(len(feature), 101): feature.append(0) feature = feature[:100] # Has question marks if text.find('?') > 0: feature.append(1) else: feature.append(0) # has ! if text.find('!') > 0: feature.append(1) else: feature.append(0) # has hastag if (len(tweet['entities']['hashtags']) > 0): # feature.append(len(tweet['entities']['hashtags'])) feature.append(1) else: feature.append(0) # has usermention if (len(tweet['entities']['user_mentions']) > 0): # feature.append(len(tweet['entities']['user_mentions'])) feature.append(1) else: feature.append(0) # has url if (len(tweet['entities']['urls']) > 0): # feature.append(len(tweet['entities']['urls'])) feature.append(1) else: feature.append(0) # has media if ('media' in tweet['entities']): # feature.append(len(tweet['entities']['media'])) feature.append(1) else: feature.append(0) # sentiment analysis clean_tweet = ' '.join( re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split()) analysis = TextBlob(clean_tweet) if analysis.sentiment.polarity > 0: feature.append(1) else: feature.append(0) # # has poll # if ('polls' in tweet['entities']): # # feature.append(len(tweet['entities']['media'])) # feature.append(1) # else: # feature.append(0) # Likes # if ((tweet['favorite_count']) > 0): # # feature.append(len(tweet['entities']['media'])) # feature.append((tweet['favorite_count'])) # else: # feature.append(0) # # Retweets # if ((tweet['retweet_count']) > 0): # # feature.append(len(tweet['entities']['media'])) # feature.append((tweet['retweet_count'])) # else: # feature.append(0) # favourited # if ('favourited' in tweet and tweet['favourited']): # feature.append(1) # else: # feature.append(0) # # Retweeted # if ('retweeted' in tweet and tweet['retweeted']): # feature.append(1) # else: # feature.append(0) # is source # if (source == ) # Capital to lower case ratio uppers = [l for l in text if l.isupper()] capitalratio = len(uppers) / len(text) feature.append(capitalratio) count_punct = 0 # negative words list neg_words = [ "not", "no", "nobody", "none", "never", "neither", "nor", "nowhere", "hardly", "scarcely", "barely", "don't", "isn't", "wasn't", "shouldn't", "wouldn't", "couldn't", "doesn't" ] count_neg_words = 0 # count number of punctuations and negative words for i in words: if (i in (string.punctuation)): count_punct += 1 if (i in neg_words): count_neg_words += 1 feature.append(count_punct) feature.append(count_neg_words) swearwords = [] with open('badwords.txt', 'r') as f: for line in f: swearwords.append(line.strip().lower()) hasswearwords = 0 for token in word_nopunc: if token in swearwords: hasswearwords += 1 feature.append(hasswearwords) return feature
def visualize(e_visualization, s_visualization, file_name): file_handler = open(file_name,"r").read() toker = RegexpTokenizer(r'\w+') words = toker.tokenize(file_handler) allowed_types = ["JJ", "JJR", "JJS", "NN", "NNS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"] filtered_words = [] #stopwords removal for w in words: if w not in stop_words: filtered_words.append(w) pos = nltk.pos_tag(filtered_words) allowed_words = [] #print(allowed_words) for p in pos: if p[1] in allowed_types: allowed_words.append(p[0].lower()) e_pos = [] e_neg = [] s_pos = [] s_neg = [] for d in e_visualization: #length = len(d) #if(length == 8): if('anger' in d and d['anger'] == 0): words = findAnger(allowed_words) e_neg.append(words) else: words = findAnger(allowed_words) e_pos.append(words) if('ant' in d and d['ant'] == 0): words = findAnt(allowed_words) e_neg.append(words) else: words = findAnt(allowed_words) e_pos.append(words) if('disgust' in d and d['disgust'] == 0): words = findDisgust(allowed_words) e_neg.append(words) else: words = findDisgust(allowed_words) e_pos.append(words) if('fear' in d and d['fear'] == 0): words = findFear(allowed_words) e_neg.append(words) else: words = findFear(allowed_words) e_pos.append(words) if('joy' in d and d['joy'] == 0): words = findJoy(allowed_words) e_neg.append(words) else: words = findJoy(allowed_words) e_pos.append(words) if('sadness' in d and d['sadness'] == 0): words = findSadness(allowed_words) e_neg.append(words) else: words = findSadness(allowed_words) e_pos.append(words) if('surprise' in d and d['surprise'] == 0): words = findSurprise(allowed_words) e_neg.append(words) else: words = findSurprise(allowed_words) e_pos.append(words) if('trust' in d and d['trust'] == 0): words = findTrust(allowed_words) e_neg.append(words) else: words = findTrust(allowed_words) e_pos.append(words) #if(length == 2): if('positivity' in d and d['positivity'] == 0): words = findPos(allowed_words) e_neg.append(words) else: words = findPos(allowed_words) e_pos.append(words) if('negativity' in d and d['negativity'] == 0): words = findNeg(allowed_words) e_neg.append(words) else: words = findNeg(allowed_words) e_pos.append(words) for d in s_visualization: #length = len(d) #if(length == 8): if('anger' in d and d['anger'] == 0): words = findAnger(allowed_words) s_neg.append(words) else: words = findAnger(allowed_words) s_pos.append(words) if('ant' in d and d['ant'] == 0): words = findAnt(allowed_words) s_neg.append(words) else: words = findAnt(allowed_words) s_pos.append(words) if('disgust' in d and d['disgust'] == 0): words = findDisgust(allowed_words) s_neg.append(words) else: words = findDisgust(allowed_words) s_pos.append(words) if('fear' in d and d['fear'] == 0): words = findFear(allowed_words) s_neg.append(words) else: words = findFear(allowed_words) s_pos.append(words) if('joy' in d and d['joy'] == 0): words = findJoy(allowed_words) s_neg.append(words) else: words = findJoy(allowed_words) s_pos.append(words) if('sadness' in d and d['sadness'] == 0): words = findSadness(allowed_words) s_neg.append(words) else: words = findSadness(allowed_words) s_pos.append(words) if('surprise' in d and d['surprise'] == 0): words = findSurprise(allowed_words) s_neg.append(words) else: words = findSurprise(allowed_words) s_pos.append(words) if('trust' in d and d['trust'] == 0): words = findTrust(allowed_words) s_neg.append(words) else: words = findTrust(allowed_words) s_pos.append(words) #if(length == 2): if('positivity' in d and d['positivity'] == 0): words = findPos(allowed_words) 
s_neg.append(words) else: words = findPos(allowed_words) s_pos.append(words) if('negativity' in d and d['negativity'] == 0): words = findNeg(allowed_words) s_neg.append(words) else: words = findNeg(allowed_words) s_pos.append(words) final_visualization = {'e_pos':e_pos, 'e_neg':e_neg, 's_pos':s_pos, 's_neg':s_neg} return final_visualization
# Create dataframe and store the data from IMDB_Dataset.csv
data = pd.DataFrame()
data = pd.read_csv('IMDB_Dataset.csv', encoding='utf-8')
data.head()

# create empty list
review_data_list = list()
indv_lines = data['review'].values.tolist()
for line in indv_lines:
    # create word tokens as well as remove punctuation in one go
    rem_tok_punc = RegexpTokenizer(r'\w+')
    tokens = rem_tok_punc.tokenize(line)
    # convert the words to lower case
    words = [w.lower() for w in tokens]
    # Invoke all the english stopwords
    stop_word_list = set(stopwords.words('english'))
    # Remove stop words
    words = [w for w in words if not w in stop_word_list]
    # Append words in the review_data_list list.
    review_data_list.append(words)

len(review_data_list)

# Train a Word2Vec model using Gensim
    # Replacing or removing emojis.
    text = demoji.replace(text, " ")
    # Lowercasing.
    text = text.lower()
    # Removing punctuation.
    text = rm_punctuation(text)
    # Removing stopwords - i.e. the, a, an, he.
    text = rm_stopwords(text)
    # TODO: some negation handling - how to keep the negation meaning?
    # Tokenization.
    text = tokenizer.tokenize(text)
    # Removing repeating letters (i.e. awesooome to awesome, *)
    # text = correct_text(text)
    # Lemmatizing the words.
    text = lemmatize_text(text)
    # Puts the cleaned, tokenized text data back into the data frame.
    data['text'].loc[i] = text

# Removing all the rows that are empty in the text column after cleaning
data = data.replace("", np.nan).replace([], np.nan).dropna()

########################################### ADDING COLUMNS TO DATA ###########################################

data['states'] = data['place_full_name'].apply(lambda row: get_state(row))
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer  # Importing module for tokenisation
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
tokenizer = RegexpTokenizer("[\w']+")  # Tokeniser built from a regular expression
lemmatizer = WordNetLemmatizer()

sentence = "You would need to add materials that you need to use. You also would want to know how much vinegar you should pour in the cups. You should also say what you should label the container with if it should be the sample or letters like A"

arr = []
arr1 = []
sentence = tokenizer.tokenize(sentence)
for i in sentence:
    j = stemmer.stem(i)
    arr.append(j)
    k = lemmatizer.lemmatize(i)
    arr1.append(k)

print(sentence)
print('\n')
print("--------------------------------------------------------")
print('\n')
print(arr)
print('\n')
print("*********************************************************")
print('\n')
print(arr1)
# create English stop words list
en_stop = stopwords.words('english')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)

# print(texts)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# print(dictionary.token2id)
def preprocessor(self, text, max_words=0):
    env = Environment()
    t_start = timer()
    text2 = text.lower()
    env.debug(1, ['Analyzer', 'preprocessor', 'START Preprocessing:'])
    tokenizer = RegexpTokenizer(self.word_tokenizers_custom())
    tokens_words = tokenizer.tokenize(text2)  # words of the text
    tokens_sent = sent_tokenize(text2)  # sentences - not used in our project yet
    n_words_count = len(tokens_words)  # number of words in the text
    n_sent_count = len(tokens_sent)  # number of sentences in the text
    n_sent_len_mean = n_words_count / n_sent_count  # mean sentence length in words

    # Split the text into parts - chunks
    awords = []  # result array
    # If the document is large, split it into several parts (chunks) and compute
    # the statistics for each part separately.
    # This lets us train the model correctly from a small number of large documents.
    if (max_words > 0):
        n_sent_chunk = int(max_words // n_sent_len_mean)  # how many sentences fit into one chunk of max_words
        print('n_sent_chunk', n_sent_chunk)
        # adjust so that the text is split evenly
        i_chunks = 1
        tmp_sent_chunk = n_sent_count
        while tmp_sent_chunk > n_sent_chunk:
            i_chunks = i_chunks + 1
            tmp_sent_chunk = int(math.ceil(n_sent_count // i_chunks) + (n_sent_count % i_chunks))
        n = 0
        n_sent_chunk = tmp_sent_chunk  # final number of sentences that go into a chunk
        print('tmp_sent_chunk', tmp_sent_chunk)
        while n < n_sent_count:
            # print(n, n_sent_chunk)
            asents = tokens_sent[n:n + n_sent_chunk]  # sentences from n to n+chunk
            # print(asents)
            a_sent_words = []  # words of the current group of sentences
            for sent in asents:
                words = tokenizer.tokenize(sent)
                a_sent_words.extend(words)
            # print(a_sent_words)
            awords.append([
                n_sent_count, n_words_count,
                len(a_sent_words) / len(asents),
                len(asents),
                len(a_sent_words), a_sent_words
            ])
            n = n + n_sent_chunk
    else:
        awords.append([
            n_sent_count, n_words_count, n_sent_len_mean,
            len(tokens_sent),
            len(tokens_words), tokens_words
        ])
    # print(awords)
    t_end = timer()
    env.debug(1, ['Preprocessed:', 'time:', env.job_time(t_start, t_end)])
    return awords  # array with words and statistics
def querydocsim(query, filename): qt = [] ch = [] ft = [] fv = [] qd = [] etq = [] qdv = [] et = [] et1 = [] eb = [] qdv = [] qry = query tokenizer = RegexpTokenizer(r'[a-zA-Z]+') tokns = tokenizer.tokenize(qry) ch = sorted((stopwords.words('english'))) tokns = [w for w in tokns if not w.lower() in ch] for p in range(len(tokns)): stemmer = PorterStemmer() tokns[p] = stemmer.stem(tokns[p]) cin = Counter(tokns) qt = list(cin.keys()) for i in range(len(qt)): et.append(qt[i]) for i in range(len(et)): et1.append(0) qd = list(cin.values()) for i in range(len(qd)): eb.append(qd[i]) #idf of particular word for l in range(len(visited)): for j in range((len(et))): if et[j] == visited[l]: et1[j] = vm[l] #tf idf product for i in range(len(qd)): sq = et1[i] * (1 + math.log10(eb[i])) etq.append(sq) qv = 0 #vector normalization of query for j in range(len(eb)): qv = qv + math.pow(etq[j], 2) for j in range(len(eb)): cq = etq[j] / math.sqrt(qv) qdv.append(cq) fl = filename if fl in v: for j in range(len(v)): if fl == v[j]: h = j else: return (0) rc = [] rt = cx[h - 1] ru = cx[h] for i in range(rt, ru - 1): ft.append(q[i]) fv.append(cdv[i]) for i in range(len(et)): fry = et[i] if fry in ft: for j in range(len(ft)): if fry == ft[j]: rc.append(qdv[i] * fv[j]) else: rc.append(0) so = 0 cs = 0 for pr in range(len(rc)): so = rc.pop() cs = cs + so return (cs)
def gen_example(wordtoix, algo, imsize, image_transform, norm, data_dir):
    '''generate images from example sentences'''
    from nltk.tokenize import RegexpTokenizer
    filepath = '%s/example_filenames.txt' % (cfg.DATA_DIR)
    data_dic = {}
    with open(filepath, "r") as f:
        filenames = f.read().split('\n')
        for name in filenames:
            if len(name) == 0:
                continue
            flip = random.rand() > 0.5
            new_w = new_h = int(256 * 76 / 64)
            x = random.randint(0, np.maximum(0, new_w - 256))
            y = random.randint(0, np.maximum(0, new_h - 256))
            img_name = name.replace("text", "images")
            img_path = '%s/%s.jpg' % (data_dir, img_name)
            imgs = get_imgs(img_path, imsize, flip, x, y, None,
                            image_transform, norm)
            real_imgs = []
            for i in range(len(imgs)):
                if cfg.CUDA:
                    real_imgs.append(Variable(imgs[i]).cuda())
                else:
                    real_imgs.append(Variable(imgs[i]))
            filepath = '%s/%s.txt' % (cfg.DATA_DIR, name)
            with open(filepath, "r") as f:
                print('Load from:', name)
                sentences = f.read().split('\n')
                # a list of indices for a sentence
                captions = []
                cap_lens = []
                for sent in sentences:
                    if len(sent) == 0:
                        continue
                    sent = sent.replace("\ufffd\ufffd", " ")
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(sent.lower())
                    if len(tokens) == 0:
                        print('sent', sent)
                        continue
                    rev = []
                    for t in tokens:
                        t = t.encode('ascii', 'ignore').decode('ascii')
                        if len(t) > 0 and t in wordtoix:
                            rev.append(wordtoix[t])
                    captions.append(rev)
                    cap_lens.append(len(rev))
            max_len = np.max(cap_lens)
            sorted_indices = np.argsort(cap_lens)[::-1]
            cap_lens = np.asarray(cap_lens)
            cap_lens = cap_lens[sorted_indices]
            cap_array = np.zeros((len(captions), max_len), dtype='int64')
            for i in range(len(captions)):
                idx = sorted_indices[i]
                cap = captions[idx]
                c_len = len(cap)
                cap_array[i, :c_len] = cap
            key = name[(name.rfind('/') + 1):]
            data_dic[key] = [cap_array, cap_lens, sorted_indices, real_imgs]
    algo.gen_example(data_dic)
import io
import math
import pickle as pickle
import re

import numpy as np
import scipy
import scipy.sparse
from scipy.sparse.linalg import svds
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec, KeyedVectors

# I will be using 1 late day
# mp3564

# Preprocessing
with io.open('data/brown.txt', 'r') as dataset:
    data = dataset.readlines()

tokenizer = RegexpTokenizer(r'\w+')
sent_all = [tokenizer.tokenize(datapoint.lower()) for datapoint in data]
unique_set = set(word for sent in sent_all for word in sent)
unique = sorted(list(unique_set))
vocab_size = len(unique_set)

# Word2Vec Model
model = Word2Vec(sent_all, size=300, window=2, negative=5)
word_vectors = model.wv
word_vectors.save("modelswv.kv")
word_vectors = KeyedVectors.load("modelswv.kv")

# SVD Helper Functions
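The "# SVD Helper Functions" header above is left dangling in the snippet. A minimal sketch of one such helper is shown below; it assumes a hypothetical sparse word-context count matrix `cooc` of shape (vocab_size, vocab_size), which is not built in the original code.

# Hedged sketch (not from the original file): one way an SVD helper could
# produce dense word embeddings from a sparse co-occurrence matrix.
# `cooc` is an assumed scipy.sparse matrix of shape (vocab_size, vocab_size).
def svd_embeddings(cooc, dim=300):
    # svds computes a truncated SVD; scaling the left singular vectors by the
    # singular values gives one common choice of word vectors.
    u, s, vt = svds(cooc.asfptype(), k=dim)
    return u * s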
# ignore captions with manually selected bad words
if not check_bad_words(lemmatized_list):
    continue

# analyse word composition
unigram_list, bigram_list = update_ngram_freq(lemmatized_list)

# ignore meaningless captions
if len(unigram_list) == 0 or len(bigram_list) == 0:
    continue

sentence = {}
# sentence['raw'] = cap  # raw caption
sentence['clean'] = reduced_cap  # cleaned caption
sentence['tokens'] = tokenizer.tokenize(reduced_cap)  # tokens
sentence['unigrams'] = unigram_list
sentence['bigrams'] = bigram_list
img["sentences"].append(sentence)

img["filename"] = imgID  # filename of image
img["url"] = raw_data[imgID]["image_url"]  # download url of image
image_list.append(img)

# print most common unigrams and bigrams with their frequency
# print('\n'.join([i + '\t' + str(j) for i, j in Counter(unigram_dict).most_common()]))
# print('\n'.join([i + '\t' + str(j) for i, j in Counter(bigram_dict).most_common()]))
from nltk.tokenize import TreebankWordTokenizer

tbwt = TreebankWordTokenizer()
print(tbwt.tokenize(english_text))

complex_text = "This is a free country, isn't it?"
print(tbwt.tokenize(complex_text))

# Regexp tokenizer
from nltk.tokenize import RegexpTokenizer

reg = RegexpTokenizer("[a-zA-Z0-9\']+")
print(reg.tokenize(complex_text))

# Stop words removal
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))
print(sw)
out = [
    word.lower() for word in reg.tokenize(complex_text)
    if word.lower() not in sw
]
print(out)

# Stemming
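The "# Stemming" header above has no code following it in the snippet. A minimal continuation, assuming NLTK's PorterStemmer applied to the filtered tokens `out`, could be:

# Hedged continuation of the dangling "# Stemming" section: stem the
# stop-word-filtered tokens with NLTK's PorterStemmer.
from nltk.stem import PorterStemmer

ps = PorterStemmer()
stemmed = [ps.stem(word) for word in out]
print(stemmed)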
train, test = train_test_split(df, test_size=0.33, random_state=42)

columnsData = df.loc[:, 'comment_text']
columnsDataTrain = train.loc[:, 'comment_text']
columnsDataTest = test.loc[:, 'comment_text']

# Total column data
yo = columnsData.unique()
setyo = set(yo)
mylist = list(setyo)
str1 = ''.join(mylist)
tokenizer = RegexpTokenizer(r'\w+')

# Length of column data
wordLength = len(tokenizer.tokenize(str1))
wordList = tokenizer.tokenize(str1)
unique = set(wordList)
uniqueWordList = list(unique)
uniqueLength = len(uniqueWordList)

# Total train data
yo1 = columnsDataTrain.unique()
setyo1 = set(yo1)
mylist1 = list(setyo1)
str2 = ''.join(mylist1)
tokenizer1 = RegexpTokenizer(r'\w+')

# Length of train data
wordLength1 = len(tokenizer1.tokenize(str2))
wordList1 = tokenizer1.tokenize(str2)
p_stemmer = PorterStemmer()

# Create English stop words
stopset = stopwords.words('english')

# check if a word contains digits
def isContainPorD(s):
    return re.search(r'(\d)', s)

# loop through document list
for index, docu in enumerate(data):
    if index > 4000:
        break
    content = docu["Content"].lower()
    # remove stop words, digits and punctuation
    removed_tokens = [
        i for i in tokenizer.tokenize(content)
        if i not in stopset and not isContainPorD(i)
    ]
    # stem tokens
    tokens = [p_stemmer.stem(i) for i in removed_tokens]
    # preprocessed texts
    texts.append(tokens)

num_of_testcase = 50  # number of test cases
test_data = texts[:num_of_testcase]
input_data = []
extra_data = []
for text in test_data:
    # random.shuffle(text)
    # integer division so the slice index is an int under Python 3
    input_data.append(text[:len(text) // 2])
    extra_data.append(text[len(text) // 2:])
response = requests.get(url)
raw = response.text

# lowercase the text
raw = raw.lower()
print("type of text", type(raw))
print("length of text:", len(raw))
print("first 100 characters:", raw[:100])
print("==========")

# create a version of the text without punctuation
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
raw_nonpunct = tokenizer.tokenize(raw)

### want to replace contractions before tokenizing
# replacer = RegexpReplacer()
# raw_replaced = nltk.word_tokenize(replacer.replace(raw))

### Now, to tokenize the text
sent_tokens = nltk.sent_tokenize(raw)
word_tokens = nltk.word_tokenize(raw)

# filter the text for stopwords
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
word_tokens_clean = [word for word in raw_nonpunct if word not in english_stops]

COMC_text = nltk.Text(word_tokens_clean)
raw_text = nltk.Text(word_tokens)
with open('data/twitter_tweets_pruned.json') as data:
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    nltk.download('stopwords')
    nltk.download('punkt')
    tokenizer = RegexpTokenizer(r'\w+')
    data = json.load(data)
    copy = {}
    stop_words = set(stopwords.words('english'))
    i = 0
    porter = PorterStemmer()

    # stop word removal
    for key, values in data.items():
        copy[key] = []
        for val in values:
            tweet = tokenizer.tokenize(val)
            filtered_tweet = []
            for w in tweet:
                if w not in stop_words:
                    w = porter.stem(w)
                    filtered_tweet.append(w)
            tmp = " ".join(str(x) for x in filtered_tweet)
            tmp2 = tmp.encode('ascii', 'ignore').decode("utf-8")
            lang = (identifier.classify(tmp2))[0]
            if lang == "en":
                copy[key].append(tmp)
            i = i + 1
            print(i)

with open('data/twitter_tweets_no_unicode_eng.json', 'w') as output1:
    json.dump(copy, output1)
porter = PorterStemmer()  # stemming of the words using Porter's algorithm
stop_words = set(stopwords.words('english'))
inverted_index = {}
file_name_ID = {}
# path to the document collection
path = 'D:/MTECH/SEM 2/Information Retrieval/Assignments/Assignment 1/20_newsgroups'
i = 0
words_list = []

for root, dirs, files in os.walk(path, topdown=False):
    for name in files:
        directory = root.split("\\", 1)[1]
        file_name_ID[i] = directory + '/' + name
        path_file = os.path.join(root, name)
        header_processed_string = header_preprocess(path_file)
        # tokenize to strip punctuation marks, commas etc. and form tokens
        words = tokenizer.tokenize(header_processed_string)
        for word in words:
            if word not in stop_words:  # removing stop words
                word = word.lower()     # normalization
                # porter.stem already returns a str in Python 3, so the
                # original Python 2 unicode(...) conversion is not needed
                stemmed_word = porter.stem(word)
                words_list.append(stemmed_word)
                if stemmed_word not in inverted_index:
                    inverted_index[stemmed_word] = []
                if i not in inverted_index[stemmed_word]:
                    inverted_index[stemmed_word].append(i)
        i += 1
def prepare_data(filepath, num_data_points=40000, vocab_size=4000, max_length=500):
    train_set_proportion = 0.9
    train_size = int(num_data_points * train_set_proportion)
    print("Preparing Data...")

    current_file = open(filepath, "rb")
    x = current_file.read()
    current_file.close()
    x = x.decode("utf-8")
    x = x.splitlines()
    random.shuffle(x)
    x = x[:num_data_points]

    labels = []
    reviews = []
    reTokenizer = RegexpTokenizer(r'\w+')
    for i in x:
        separated = i.split(" ", 1)
        labels.append(separated[0])
        reviews.append(separated[1])
    for i in range(len(labels)):
        labels[i] = int(labels[i] == '__label__1')

    all_words = []
    for i in range(len(reviews)):
        tokens = reTokenizer.tokenize(reviews[i])
        reviews[i] = []
        for word in tokens:
            word = word.lower()
            all_words.append(word)
            reviews[i].append(word)

    vocab_pickle_location = os.path.join(vocab_directory, "all_words.pkl")
    if not os.path.isdir(vocab_directory):
        print("Error: vocab_directory doesn't exist!")
    else:
        all_words = pickle.load(open(vocab_pickle_location, 'rb'))
    all_words = all_words[:vocab_size]
    word2int = {all_words[i][0]: i + 1 for i in range(vocab_size)}
    # int2word = {x: y for y, x in word2int.items()}
    # dict_as_list = list(word2int)

    def review2intlist(rev_text):
        int_list = []
        for word in rev_text:
            if word in word2int.keys():
                int_list.append(word2int[word])
        return int_list

    X = []
    for i in range(len(reviews)):
        X.append(review2intlist(reviews[i]))
    X = sequence.pad_sequences(X, maxlen=max_length)

    LSTM_inputs = np.zeros(shape=(max_length, num_data_points), dtype=np.float32)
    for i in range(num_data_points):
        LSTM_inputs[:, i] = X[i]
    LSTM_inputs = LSTM_inputs.T

    LSTM_outputs = np.zeros(shape=num_data_points)
    for i in range(num_data_points):
        LSTM_outputs[i] = labels[i]

    x_train, y_train = LSTM_inputs[:train_size], LSTM_outputs[:train_size]
    x_test, y_test = LSTM_inputs[train_size:], LSTM_outputs[train_size:]
    half_test_size = int(len(y_test) / 2)
    x_valid = x_test[:half_test_size]
    y_valid = y_test[:half_test_size]
    x_test = x_test[half_test_size:]
    y_test = y_test[half_test_size:]

    print("Finished preparing data...")
    return x_train, y_train, x_test, y_test, x_valid, y_valid
import nltk
from nltk.tokenize import RegexpTokenizer

puncts_exceptapostrophe = '!"#$%&()*+,-./:;<=>?@[\]^`{|}~'
TOKENIZE_PATTERN = fr"[{puncts_exceptapostrophe}]|\w+|['\w]+"
regex_tokenizer = RegexpTokenizer(pattern=TOKENIZE_PATTERN)

output_file = open("output.txt", "w")
with open('tr.txt', "r") as reader:
    lines = reader.readlines()
    for line in lines:
        line = line.lower()
        tokens = regex_tokenizer.tokenize(line)
        sentence = " ".join(tokens)
        output_file.write(sentence + "\n")
output_file.close()
def dataset():
    global tokens_triples
    global num_sentences
    global included_sentences
    global processed_facts
    global num_triples
    global num_tokens
    global num_annotations
    global unique_predicates
    global predicates
    global dictionary

    if os.path.exists(exp_dir + 'experiment.csv'):
        os.remove(exp_dir + 'experiment.csv')
    tokenizer = RegexpTokenizer(r'\w+')

    with open(exp_dir + 'experiment.csv', 'w', newline='') as exp_file:
        writer = csv.DictWriter(exp_file, fieldnames=['Sentence'])
        writer.writeheader()

        def include_sentence(filename, row, sent_node, bucket):
            """Store one sentence and its triples in the given length bucket
            and update the corpus statistics."""
            global processed_facts, num_annotations
            dictionary.append({
                'annotated_sentence': sent_node[2].text,
                'value': sent_node[0].text,
                'triples': [],
                'simplification': ''
            })
            for triple in range(0, len(sent_node[5])):
                dictionary[sum(included_sentences)]['triples'].append(
                    sent_node[5][triple].text)
            included_sentences[bucket] = included_sentences[bucket] + 1
            writer.writerow({'Sentence': row[0]})
            num_annotations = num_annotations + get_annotations(
                filename.rsplit('/', 1)[-1], sent_node[2].text)
            for triple in range(0, len(sent_node[5])):
                predicate = get_predicate(sent_node[5][triple].text)
                if predicate not in unique_predicates:
                    unique_predicates.append(predicate)
                    predicates[predicate] = 1
                else:
                    predicates[predicate] = predicates[predicate] + 1
            num_triples.append(len(sent_node[5]))
            n_tokens = len(tokenizer.tokenize(sent_node[0].text))
            num_tokens.append(n_tokens)
            tokens_triples.append(n_tokens / len(sent_node[5]))
            processed_facts = processed_facts + len(sent_node[5])

        for filename in glob.glob(os.path.join(csv_dir, '*.csv')):
            # print(filename)
            with open(filename, 'r') as csv_file:
                csv_reader = csv.reader(csv_file)
                next(csv_reader)  # skip the header row
                rows_flag = 0
                with open(filename.replace('CSV', 'XML').replace('csv', 'xml'),
                          'r') as xml_file:
                    sentence_number = -1
                    xml = xml_file.read()
                    xml = xml.encode('ascii', errors='ignore').decode('ascii')  # strip non-ASCII
                    root = ET.fromstring(xml)
                    for row in csv_reader:
                        sentence_number = sentence_number + 1
                        num_sentences = num_sentences + 1
                        sent_node = root[sentence_number]
                        # only declarative sentences with at least one triple
                        if len(sent_node[5]) < 1 or row[0].find('?') != -1:
                            continue
                        ratio = len(tokenizer.tokenize(sent_node[0].text)) / len(sent_node[5])
                        # long sentences: at least 20 tokens per triple
                        if ratio >= 20 and included_sentences[2] < 200:
                            rows_flag = rows_flag + 1
                            include_sentence(filename, row, sent_node, 2)
                        # medium sentences: between 10 and 20 tokens per triple
                        elif 10 <= ratio < 20 and included_sentences[1] < 200:
                            rows_flag = rows_flag + 1
                            include_sentence(filename, row, sent_node, 1)
                        # short sentences: between 5 and 10 tokens per triple
                        elif 5 <= ratio < 10 and included_sentences[0] < 200:
                            rows_flag = rows_flag + 1
                            include_sentence(filename, row, sent_node, 0)

    # It sorts the dictionary of predicates according to the times of occurrence.
    # print(sorted(predicates.items(), key=lambda x: x[1], reverse=True))
    print('%d out of the total %d sentences have been included.' %
          (sum(included_sentences), num_sentences))
    print('Total number of facts-triples that have been included: %d' %
          (processed_facts))
    print('Total number of tokens of the sentences that have been included: %d' %
          (sum(num_tokens)))
    print('Total number of arguments of the sentences that have been included: %d' %
          (num_annotations))
    print('Total number of unique predicates of the sentences that have been included: %d' %
          (len(unique_predicates)))
###########################################################
#### Preprocess the tweets to get sets of word indices ####
#### a table of words to word indices, and its reverse ####
###########################################################
tokenizer = RegexpTokenizer(r"@?(\w+'\w+)|(\w+)")
tweets = []
word_indices = {}
all_words = []
curr_word_index = 0

with open("tweetset.txt", "r") as tweets_file:
    for line in tweets_file:
        words = tokenizer.tokenize(line)
        curr_tweet = set()
        for word in words:
            word = word.lower()
            if is_stop(word):
                continue
            if word in word_indices:
                curr_tweet.add(word_indices[word])
            else:
                word_indices[word] = curr_word_index
                all_words.append(word)
                curr_tweet.add(curr_word_index)
                curr_word_index += 1
        tweets.append(curr_tweet)

#######################################################
#### Constructing the adjacency matrix from tweets ####
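The closing banner announces the adjacency matrix construction, but that code is not shown in the snippet. A minimal sketch of one plausible construction, counting word co-occurrences across the `tweets` sets built above, might be:

# Hedged sketch (not part of the original snippet): build a word co-occurrence
# adjacency matrix where entry (i, j) counts how many tweets contain both
# word index i and word index j.
import numpy as np
from itertools import combinations

vocab_size = curr_word_index
adjacency = np.zeros((vocab_size, vocab_size), dtype=np.int32)
for tweet in tweets:
    for i, j in combinations(sorted(tweet), 2):
        adjacency[i, j] += 1
        adjacency[j, i] += 1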
import nltk
from nltk.tokenize import RegexpTokenizer
# import os
# os.chdir(r"C:\Users\Bertold\Documents\CUNY\Fall 2019\Intro to Computational Linguistics\Final")

with open("DC_transcript.txt") as fin:
    transcript = fin.read()
# Two text files are included. Paste above or just change DC to LB. Filenames:
# LB_transcript.txt (Lewis Black)
# DC_transcript.txt (Dave Chappelle)

regxptokenizer = RegexpTokenizer(r'\w+')
lowercasetext = transcript.lower()
nopuncttxt = regxptokenizer.tokenize(lowercasetext)
arpabet = nltk.corpus.cmudict.dict()

def phoneme_counter(str):
    Kcount = 0
    for word in nopuncttxt:
        try:
            print(arpabet[word][0])
        except KeyError:
            pass
        try:
            for j in range(len(arpabet[word][0])):
                try:
                    if arpabet[word][0][j] == "K":
                        Kcount += 1
# train_text += state_union.raw("1951-Truman.txt")
# train_text += state_union.raw("1950-Truman.txt")
# train_text += state_union.raw("1949-Truman.txt")
# train_text += state_union.raw("1948-Truman.txt")
# train_text += state_union.raw("1946-Truman.txt")
# train_text += state_union.raw("1945-Truman.txt")
# train_text += state_union.raw("1953-Eisenhower.txt")
# train_text += state_union.raw("1954-Eisenhower.txt")
# train_text += state_union.raw("1955-Eisenhower.txt")
# train_text += state_union.raw("1956-Eisenhower.txt")

stop_words = set(stopwords.words("english"))

# Tokenizing the text
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(train_text)

# Stemmer and Lemmatizer instances created
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# The sequence that removes the stop words from the text
filtered_text = []

# Lemmatizing words and adding them to the final list if they are not stop words
for w in words:
    if w not in stop_words:
        w = lemmatizer.lemmatize(w)
        filtered_text.append(w)

# print(filtered_text)
print("Corpus Words: ", len(filtered_text))
def data_preperation(self):
    print("data_preperation Activated")
    delimiter = RegexpTokenizer('\s+', gaps=True)  # match on whitespace delimiters
    self.tokenized_review_list = [
        delimiter.tokenize(i) for i in self.review_list
    ]
    self.tokenized_description_list = [
        delimiter.tokenize(i) for i in self.description_list
    ]

    all_words = []
    max_len_description = 0
    idx = 0
    for recipe in self.tokenized_description_list:
        for word in recipe:
            idx += 1
            if word not in all_words:
                all_words.append(word)
        if idx > max_len_description:
            max_len_description = idx
        idx = 0

    max_len_review = 0
    idx = 0
    for recipe in self.tokenized_review_list:
        for word in recipe:
            idx += 1
            if word not in all_words:
                all_words.append(word)
        if idx > max_len_review:
            max_len_review = idx
        idx = 0

    self.all_words = all_words
    self.max_len_description = max_len_description
    self.max_len_review = max_len_review

    # zipped = zip(description_list, review_list)

    # Encoding 1
    # encoded_description = [one_hot(d, vocab_size) for d in description_list]
    # encoded_review = [one_hot(d, vocab_size) for d in review_list]
    # print(len(encoded_description[0]))

    # Encoding 2
    self.vocab_size = len(all_words)
    max_words = self.vocab_size + 5
    t = Tokenizer(num_words=max_words)  # words --> integers
    t.fit_on_texts(self.description_list + self.review_list)
    encoded_des = list(t.texts_to_sequences(self.description_list))
    encoded_rev = list(t.texts_to_sequences(self.review_list))
    self.tokenizer = t

    # Pad sequences with zero padding
    # self.padded_encoded_description = pad_sequences(encoded_des, maxlen=self.max_len_description, padding='post')
    # self.padded_encoded_review = pad_sequences(encoded_rev, maxlen=self.max_len_review, padding='post')
    self.padded_encoded_description = pad_sequences(
        encoded_des, maxlen=self.max_len_description, padding='pre')
    self.padded_encoded_review = pad_sequences(
        encoded_rev, maxlen=self.max_len_review, padding='pre')
    print(self.padded_encoded_description[0])