def __init__(self,grammar,tags): self.lines = [] repeat_line_array = nltk.word_tokenize(grammar.gen_frame_line(grammar.cfg.start())) x = random.randint(0,8) y = random.randint(0,8) for i in range(8): if (i == x or i == y): spot_array = [] j = 0 noun_set = set(['he','she','it','I']) for wop in repeat_line_array: if wop in set(tags): spot = Spot(wop,i,j,'POS') if (wop in noun_set): spot.add_POS('NN') spot_array.append(spot) else: spot = Spot(wop,i,j,'word') spot_array.append(spot) j += 1 self.lines.append(spot_array) else: line_array = nltk.word_tokenize(grammar.gen_frame_line(grammar.cfg.start())) spot_array = [] j = 0 for wop in line_array: if wop in set(tags): spot = Spot(wop,i,j,'POS') spot_array.append(spot) else: spot = Spot(wop,i,j,'word') spot_array.append(spot) j += 1 self.lines.append(spot_array)
def jaccard(a, b):
    A, B = nltk.word_tokenize(a), nltk.word_tokenize(b)
    A, B = set(A), set(B)
    intersect = len(A.intersection(B))
    union = len(A.union(B))
    coef = float(intersect) / union
    return coef
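# Hedged usage sketch (not part of the original source); assumes the NLTK
# 'punkt' tokenizer data is installed and `nltk` is imported.
# Shared tokens of the two sentences: {'the', 'cat', 'on'} -> 3 of 7 unique tokens.
print(jaccard("the cat sat on the mat", "the cat lay on the rug"))  # ~0.4286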
def build_s(data): ''' Compute the context vector for each lexelt :param data: dic with the following structure: { lexelt: [(instance_id, left_context, head, right_context, sense_id), ...], ... } :return: dic s with the following structure: { lexelt: [w1,w2,w3, ...], ... } ''' s = {} for lexelt, lexelt_info in data.items(): words = set() for (instance_id, left_context, head, right_context, sense_id) in lexelt_info: left_tokens = nltk.word_tokenize(left_context) right_tokens = nltk.word_tokenize(right_context) words.update(k_nearest_words_vector_from_tokens(left_tokens, right_tokens, window_size)) s[lexelt] = list(words) return s
def load_file_without_frequency(self,positif, negatif): tab = [] maxs = self.nbFeatures phrases = [] y = [] with codecs.open(positif,"r",encoding='latin-1') as my_file: for line in my_file: line= line.strip().lower() # remove the \n* phrases.append(line) y.append(1) for mot in word_tokenize(line): tab.append(mot) with codecs.open(negatif,"r",encoding='latin-1') as my_file: for line in my_file: line= line.strip().lower() # remove the \n* phrases.append(line) y.append(0) for mot in word_tokenize(line): tab.append(mot) word_fd = FreqDist(tab) print(word_fd) for i in range(len(phrases)): mots = word_tokenize(phrases[i]) tmp = [] for element in mots: tmp.append(word_fd[element]) if(len(tmp) < maxs): for j in range(maxs - len(tmp)): tmp.append(0) elif(len(tmp)>maxs): tmp = tmp[:maxs] phrases[i] = tmp return (np.array(phrases),np.array(list(set(tab))),np.array(y))
def vectorize(data, s): ''' :param data: list of instances for a given lexelt with the following structure: { [(instance_id, left_context, head, right_context, sense_id), ...] } :param s: list of words (features) for a given lexelt: [w1,w2,w3, ...] :return: vectors: A dictionary with the following structure { instance_id: [w_1 count, w_2 count, ...], ... } labels: A dictionary with the following structure { instance_id : sense_id } ''' vectors = {} labels = {} for (instance_id, left_context, head, right_context, sense_id) in data: labels[instance_id] = sense_id left_tokens = nltk.word_tokenize(left_context) right_tokens = nltk.word_tokenize(right_context) words = k_nearest_words_vector_from_tokens(left_tokens, right_tokens, window_size) vectors[instance_id] = frequency_vector_from_near_words(s, words) return vectors, labels
def tester():
    tweetList = readPickleFile(pickledTweets)
    resultList = readTextFile(cleanTweets)
    count1 = 0
    count2 = 0
    precision = 0
    for tweet, result in zip(tweetList, resultList):
        count1 = count1 + 1
        print(tweet)
        extract = extraction.runner(tweet)
        extractTokens = nltk.word_tokenize(extract)
        resultTokens = nltk.word_tokenize(result)
        precisionList = [word for word in resultTokens if word in extractTokens]
        if len(precisionList) == len(resultTokens):
            precision = precision + 1
        print('Exrt: ' + extract)
        print('Cort: ' + result)
        if extract.strip() == result.strip():
            count2 = count2 + 1
    print('Precision: ' + str(precision) + ' out of ' + str(count1) + ' or ' +
          "{0:.2f}".format(100 * precision / float(count1)) +
          '% of retrieved instances are relevant (positive predictive value)')
    print('Recall: ' + str(count2) + ' out of ' + str(precision) + ' or ' +
          "{0:.2f}".format(100 * count2 / float(precision)) +
          '% of the relevant instances are retrieved (sensitivity)')
def __init__(self, title, full_text, sentence):
    self.title = title
    self.sentence = sentence
    # map of word -> number of times it appears in the full article text
    self.full_text_word_frequencies = nltk.FreqDist(
        word.lower() for word in nltk.word_tokenize(full_text))
    # map of word -> number of times it appears in the given sentence
    self.sentence_word_frequencies = nltk.FreqDist(
        word.lower() for word in nltk.word_tokenize(sentence))
def tfidfifyAwards(): """This returns a dictionary of words that are used in awards along with a score that reflects how useful they are in identifying award names""" WordFreq = Counter() for award in OFFICIAL_AWARDS: award = cast_to_syn(award) aTokens = nltk.word_tokenize(award) for word in aTokens: WordFreq[word] += 1 DocFreq = Counter() for award in OFFICIAL_AWARDS: award = cast_to_syn(award) aTokens = nltk.word_tokenize(award) for word in WordFreq: if word in aTokens: DocFreq[word] += 1 retDict = dict((el, 0.0) for el in DocFreq) for word in DocFreq: retDict[word] = 1.0 / (DocFreq[word] ** (3.0 / 4.0)) """for word in media_words: retDict[word] = retDict[word]*1.5 for word in genre_words: retDict[word] = retDict[word]*1.5 """ return retDict
def __tokenize(self, utter, semantic_tagged=None): result = None if semantic_tagged is None: result = [(word, None) for word in nltk.word_tokenize(utter)] else: parser_raw = SemanticTagParser(False) parser_tagged = SemanticTagParser(False) segmented = ' '.join(nltk.word_tokenize(utter)) tagged = ' '.join(semantic_tagged) parser_raw.feed(segmented) parser_tagged.feed(tagged) raw_chr_seq = parser_raw.get_chr_seq() raw_space_seq = parser_raw.get_chr_space_seq() tagged_chr_seq = parser_tagged.get_chr_seq() tagged_space_seq = parser_tagged.get_chr_space_seq() if raw_chr_seq == tagged_chr_seq: merged_space_seq = [ x or y for x, y in zip(raw_space_seq, tagged_space_seq)] word_seq = parser_tagged.tokenize(merged_space_seq) tag_seq = parser_tagged.get_word_tag_seq() result = [(word, tag) for word, tag in zip(word_seq, tag_seq)] return result
def POS_Ngram(N, example_set, i): N_grams = dict() count = 0 for para in example_set: if i == 0: # get first sentence tokens = word_tokenize(para.first) else: # get ith sentence para.order_sentence() tokens = word_tokenize(para.ordered_sentences[i-1]) #tokens = word_tokenize(para.scrambled_sentences[int(para.correct_order[i-1])-1]) tagset = None #print(tokens) tokens = _pos_tag(tokens, tagset, tagger) tags = [x[1] for x in tokens] # take POS tags only n_tags = list(ngrams(tags, N)) for tag_set in n_tags: count += 1 if tag_set in N_grams: N_grams[tag_set] += 1 else: N_grams[tag_set] = 1 # first occurence of tagset # Normalize N_gram counts by total number of N grams for this set of sentences for ngram, num in N_grams.items(): N_grams[ngram] = num/count return N_grams
def colocation(windowSize, pos, context, dictionary):
    if windowSize <= 0:
        return dictionary
    # words before the target position
    forward = context[:pos]
    f = forward[-(windowSize // 2):]
    # words after the target position
    backward = context[pos + 1:]
    b = backward[:windowSize // 2]
    for i, item in enumerate(f):
        key = "pre" + str(len(f) - i) + "-word"
        dictionary[key] = item
        key = "pre" + str(len(f) - i) + "-pos"
        text = nltk.word_tokenize(item)
        dictionary[key] = nltk.pos_tag(text)[0][1]
    for i, item in enumerate(b):
        key = "fol" + str(i + 1) + "-word"
        dictionary[key] = item
        key = "fol" + str(i + 1) + "-pos"
        text = nltk.word_tokenize(item)
        dictionary[key] = nltk.pos_tag(text)[0][1]
    return dictionary
def extract_pos_pair(event_mention_1, event_mention_2):
    trigger1 = ""
    extent1 = ""
    trigger2 = ""
    extent2 = ""
    # default POS tags, in case no token below matches the trigger
    pos1 = ""
    pos2 = ""
    for one_anchor in event_mention_1.findall("anchor"):
        trigger1 = one_anchor[0].text
    for one_anchor in event_mention_2.findall("anchor"):
        trigger2 = one_anchor[0].text
    for one_extent in event_mention_1.findall("extent"):
        extent1 = one_extent[0].text
    for one_extent in event_mention_2.findall("extent"):
        extent2 = one_extent[0].text
    text1 = nltk.word_tokenize(extent1)
    dict1 = nltk.pos_tag(text1)
    for one_pair in dict1:
        if one_pair[0] in trigger1 or trigger1 in one_pair[0]:
            pos1 = one_pair[1]
            break
    text2 = nltk.word_tokenize(extent2)
    dict2 = nltk.pos_tag(text2)
    for one_pair in dict2:
        if one_pair[0] in trigger2 or trigger2 in one_pair[0]:
            pos2 = one_pair[1]
            break
    return (pos1, pos2)
def update(self, other):
    """Adds counts for elements in other"""
    if isinstance(other, self.__class__):
        self.n_sents += other.n_sents
        for x, n in other.items():
            self[x] += n
    else:
        for sent in other:
            self.n_sents += 1
            if self.poscache is not None:
                if sent in self.poscache:
                    tags = self.poscache[sent]
                else:
                    self.poscache[sent] = tags = nltk.pos_tag(
                        nltk.word_tokenize(sent))
            else:
                tags = nltk.pos_tag(nltk.word_tokenize(sent))
            for x in tags:
                tok, tag = x
                self[tag] += 1
    if self.normalize:
        for x, n in self.items():
            self[x] /= float(self.n_sents)
def read_liveqa(prefix = '../data/qalab-liveqa/dataset/qrels/', train = 'LiveQA2015-ver2.qrels', tokenize = True): import nltk f = open_file(prefix + train) np.random.seed(0) data_split = {0: [], 1 : [], 2 : []} ref_split = {0: [], 1 : [], 2 : []} for i,line in enumerate(f): l = line.strip().split('\t') if l[2] == '': first = " ? ".join(l[3].strip().split("?")) second = " . ".join(first.strip().split(".")) q = " ".join(nltk.word_tokenize(second.strip())).lower().split(' ') split_id = np.random.choice([0,0,0,1,2]) continue label = int(l[2]) >= 3 first = " ? ".join(l[3].strip().split("?")) second = " . ".join(first.strip().split(".")) a = " ".join(nltk.word_tokenize(second.strip())).lower().split(' ') data_split[split_id] += [(q,a,label,'','')] ref_split[split_id] += [(l[0],'0',l[0]+'_'+l[1]+'_'+str(i),str(int(label)))] return data_split[0],data_split[1],data_split[2],(ref_split[0],ref_split[1],ref_split[2])
def reading_level(full_text):
    # Clean the full_text: make sure every period is followed by a space
    full_text_clean = ""
    for char in full_text:
        if char == ".":
            full_text_clean += ". "
        else:
            full_text_clean += char
    # Language features
    import nltk
    words = nltk.word_tokenize(full_text_clean)
    n_sents = len(nltk.sent_tokenize(full_text_clean))
    n_words = len(words)
    # Count the syllables
    n_syll = 0
    for word in words:
        n_syll += syllable_count(word)
    # Calculate the reading level
    # https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    grade_level = -15.59 + 0.39 * (float(n_words) / n_sents) + 11.8 * (float(n_syll) / n_words)
    return round(grade_level, 1)
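# Hedged worked example (not from the original source): for a text with
# 100 words, 5 sentences and 130 syllables, the Flesch-Kincaid grade is
#   -15.59 + 0.39 * (100 / 5) + 11.8 * (130 / 100)
# = -15.59 + 7.8 + 15.34
# = 7.55, i.e. between a seventh- and eighth-grade reading level.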
def tag_chapter(chapter): text_raw = open("ofk_ch" + str(chapter) + ".txt").read() bad = ["ca", 'wo', 'thelmselves'] tokens = nltk.word_tokenize(text_raw) # tokens = [w.lower() for w in tokens] #change to lower case tokens = [re.sub('\.','',w) for w in tokens] #remove periods tokens = [w for w in tokens if w.isalpha()] #just keep words tokens = [w for w in tokens if not w in stopwords.words('english')] tokens = [w for w in tokens if len(w) > 1] tokens_freq = FreqDist(tokens) tokens_10 = [w for w in tokens if tokens_freq[w] > 20] tokens_10 = [w for w in tokens_10 if w not in bad] tokens_freq = FreqDist(tokens_10) tokens_table = [(w, importance(w, tokens_freq, tokens_10, text1, text2, text3, text4, text5, text6, text7, text8, text9)) for w in tokens_freq] a = lambda e1, e2: int(1000000*(e1[1] - e2[1])) sorted_table = sorted(tokens_table, cmp = a, reverse=True) # The number of elements you want to dump nums = 20 final_table = sorted_table[:nums]; pos_tuples = [] for x in final_table: token = nltk.word_tokenize(x[0]) pos_tuples = pos_tuples + nltk.pos_tag(token) list_pos = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'] pos_names = ['Conjunction', 'Numerical', 'Determiner', 'Existential There', 'Foreign Word', 'Preposition or Conjunction, Subordinating', 'Adjective or Numeral', 'adjective, comparative', 'adjective, superlative', 'List Item Marker', 'Modal Auxiliary', ' Noun, Common', 'Noun, Proper, Singular', 'Noun, Proper, Plural', 'Noun, Common, Plural', 'Pre-Determiner', 'Genitive Marker', 'Pronoun, Personal', 'Pronoun, Possessive', 'Adverb', 'Adverb, Comparitive', 'Adverb, Superlative', 'Particle', 'Symbol', '"to" as preposition', 'Interjection', 'Verb, Base Form', 'Verb, Past Tense', 'Verb, Present Participle', 'Verb, Past Participle', 'Verb, Present Tense, Not 3rd Person Singular', 'Verb, Present Tense, 3rd Person Singular', 'WH-Determiner', 'WH-Pronoun', 'WH-Pronoun, Possessive', 'WH-Adverb'] result = [None] * len(list_pos) for p in range(len(list_pos)): result[p] = [dict(name=w[0]) for w in pos_tuples if w[1] == list_pos[p]] result_dict = [dict(name=pos_names[n], children=result[n]) for n in range(len(list_pos)) if result[n]] return result_dict
def parseFile(file): """ Parse the header and source files for the class, and return the bindings dictionary, which contains tag data (and other pertinent information about the file) """ #print file bindings = [] # Load header file tokens = [] if (file['header'] != ''): with open(file['header'], 'r') as f: # Tokenize for line in f.readlines(): tokens += nltk.word_tokenize(line) # Parse tokens bindings += parseTokens( tokens, file, 'header' ) # Load source file tokens = [] if (file['source'] != ''): with open(file['source'], 'r') as f: # Tokenize for line in f.readlines(): tokens += nltk.word_tokenize(line) # Parse tokens bindings += parseTokens( tokens, file, 'source' ) return bindings
def tokenize(instances, lowercase=False):
    if lowercase:
        tokens = [nltk.word_tokenize(i.lower()) for i in instances]
    else:
        tokens = [nltk.word_tokenize(i) for i in instances]
    return tokens
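# Hypothetical usage (assumption, not from the original code): tokenize a
# small batch of raw strings, lowercasing them first.
sample = ["The Quick Brown Fox.", "Jumps over the lazy dog!"]
print(tokenize(sample, lowercase=True))
# [['the', 'quick', 'brown', 'fox', '.'], ['jumps', 'over', 'the', 'lazy', 'dog', '!']]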
def get_tasty(local_id): local = fs.local_info(local_id) reviews = local[3] menu = local[4] r_scores = score_reviews(reviews) m_items = fs.process_menu(menu) toreturn = [] r_reviews = [] for review in r_scores: (text, s) = review text_w = eliminate_stop_words(nltk.word_tokenize(text)) item_scores = [] for item in m_items: try: (name,desc,price) = item except ValueError: return ([(" $ "," ")],[(" "," $ ")]) text_m = eliminate_stop_words(nltk.word_tokenize(name)+nltk.word_tokenize(desc)) score = similarity(text_w, text_m) item_scores.extend( [( name+' $'+str(price) ,score)] ) item_refered = max(item_scores, key = operator.itemgetter(1)) if item_refered[1]>1 and s>0: toreturn.append( item_refered[0] ) r_reviews.append( (text, item_refered) ) counter = dict([(item, 0) for item in toreturn]) for item in toreturn: counter[item] += 1 counted_items = [ (item, counter[item]) for item in toreturn] print counted_items return (counted_items ,r_reviews )
def checkTypeWordCount(answer, question):
    count = 0
    status = ''
    sum = 0
    status1 = 'false'
    punctuation = set(".,'\":;?/\\|][}{()*&^%$#@!`~-_=+")
    for word1 in word_tokenize(answer):
        if word1 in punctuation:
            print 'error'
        else:
            sum = sum + 1
    print sum
    words_ans = word_tokenize(answer)
    words_qus = word_tokenize(question)
    if words_ans[0] == "NOTICE" or words_ans[0] == "Notice":
        print "Correct"
        count = count + 0.25
    else:
        status = "Wrong"
    for word in words_qus:
        if en.is_number(word) and words_qus[words_qus.index(word) + 1] == 'words':
            if sum >= int(word):
                print word
                count = count + 0.25
                status1 = 'true'
    if status1 == 'false':
        count = count + 0.25
    return count, status
def test(self): test_vector = [] test_emb = [] cList = [] reviews = ET.parse(self.test_path).getroot().findall('Review') for review in reviews: sentences = review[0] #get the sentences for sentence in sentences: if (len(sentence) > 1): opinions = sentence[1] if ( len(opinions) > 0): #check if there are aspects t = sentence[0].text text = word_tokenize(t.lower()) textC = word_tokenize(t) #tokenize, check for caps for opinion in opinions: test_emb.append(text) #store the tokenized words for the embedding's calculation centroid = self.calcCentroid(self.model,test_emb) #caclulate the centroid for each sentence for i in range(len(centroid)): #join the matrices tmp = centroid[i].tolist() test_vector.append(tmp) print print '---- End of Test ----' return test_vector
def nltk_filter(sent): b1, b2 = sent.split(blockSeparator) b2 = b2.rstrip() b1 = b1.lower() tokens = word_tokenize(b1) pos_tags = pos_tag(tokens) filtered_sent = ' ' for token in tokens: filtered_sent += '1'+token + ' ' # for pos_t in pos_tags: # if pos_t[1] in filterList: # #filtered_sent += stemmer.stem(pos_t[0]) + ' ' # filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' ' #note: 1 concat stemmer(word) == stemmer(1 concat word) b2 = b2.lower() tokens = word_tokenize(b2) pos_tags = pos_tag(tokens) # filtered_sent = ' ' # for pos_t in pos_tags: # if pos_t[1] in filterList: # #filtered_sent += stemmer.stem(pos_t[0]) + ' ' # filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' ' for token in tokens: filtered_sent += '2' + token + ' ' return filtered_sent
def main(question, article):
    ddict = {}
    counts = get_counts()
    for tok in nltk.word_tokenize(article):
        ddict[tok] = ddict.get(tok, 0) + 1
    vec = []
    for tok in nltk.word_tokenize(question):
        # term frequency: count of the token in the article
        tf = ddict.get(tok, 0)
        # total articles is 108 / number that have current token
        idf = math.log(float(108) / len(filter(lambda x: tok in x.keys(), counts)) + 1)
        vec.append(tf * idf)
    largest = max(vec)
    normalized = map(lambda y: y / largest, vec)
    finDic = {}
    for i, word in enumerate(nltk.word_tokenize(question)):
        finDic[word] = normalized[i]
    print finDic
    return finDic
def PushDataPair(data, database):
    last = len(database['Q'].keys())
    for pair in data:
        database['Q'][last] = nltk.word_tokenize(pair['question'])
        database['A'][last] = nltk.word_tokenize(pair['answer'])
        last += 1
    return database
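# Hedged usage sketch: `database` is assumed to be a dict of the form
# {'Q': {}, 'A': {}} mapping integer indices to token lists.
db = {'Q': {}, 'A': {}}
pairs = [{'question': 'What is NLTK?', 'answer': 'A Python NLP toolkit.'}]
db = PushDataPair(pairs, db)
print(db['Q'][0], db['A'][0])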
def build_s(data): ''' Compute the context vector for each lexelt :param data: dict with the following structure: { lexelt: [(instance_id, left_context, head, right_context, sense_id), ...], ... } :return: dict s with the following structure: { lexelt: [w1,w2,w3, ...], ... } ''' s = {} # implement your code here for key,value in data.items(): for i in value: tokens_left = nltk.word_tokenize(i[1]) tokens_right = nltk.word_tokenize(i[3]) left = [w for w in tokens_left if w not in string.punctuation][-window_size:] right = [w for w in tokens_right if w not in string.punctuation][:window_size] context = left + right if key not in s: s[key]=[] for word in context: if word not in s[key]: s[key].append(word) return s
def synsym(s1, s2):
    ts0 = nltk.pos_tag(nltk.word_tokenize(s1))
    ts1 = nltk.pos_tag(nltk.word_tokenize(s2))
    # adjectives
    jj0 = [x for x, y in ts0 if y in ('JJ', 'JJR', 'JJS')]
    jj1 = [x for x, y in ts1 if y in ('JJ', 'JJR', 'JJS')]
    if len(jj0) == 0 or len(jj1) == 0:
        jjps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        jjps = np.inner(v1, v2) / (LA.norm(v1) * LA.norm(v2))
    # nouns (plus determiners)
    jj0 = [x for x, y in ts0 if y in ('NN', 'NNS', 'NNP', 'NNPS', 'DT')]
    jj1 = [x for x, y in ts1 if y in ('NN', 'NNS', 'NNP', 'NNPS', 'DT')]
    if len(jj0) == 0 or len(jj1) == 0:
        nps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        nps = np.inner(v1, v2) / (LA.norm(v1) * LA.norm(v2))
    # verbs
    jj0 = [x for x, y in ts0 if y in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]
    jj1 = [x for x, y in ts1 if y in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]
    if len(jj0) == 0 or len(jj1) == 0:
        vps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        vps = np.inner(v1, v2) / (LA.norm(v1) * LA.norm(v2))
    return [jjps, nps, vps]
def paragraph_features(paragraph_sents): global count count += 1 print '\r', count, if FEATURE == FEAT_CONTAINS: paragraph_words = set( sents_to_words(paragraph_sents) ) elif FEATURE == FEAT_LINKED_TITLES: paragraph_words = ' '.join(paragraph_sents) elif FEATURE == FEAT_FIRST_SENT: paragraph_words = nltk.word_tokenize( paragraph_sents[0] ) elif FEATURE == FEAT_BEGIN_SENT: paragraph_words = { nltk.word_tokenize(sent)[0] for sent in paragraph_sents } else: paragraph_words = None print 'FEATURE NOT SUPPORTED' exit() features = dict() for word in word_features: features[word_features[word]] = ( word in paragraph_words ) return features
def next_note(tokenizer): print 'SemEval data' for semeval_file in semeval_files: print 'File', semeval_file with open(semeval_file, 'r') as f: st = [] for line in f: st += [line.strip()] text = read_visit_sem(st) text = tokenizer.tokenize(text) for sent in text: yield nltk.word_tokenize(sent.lower()) print 'MIMIC data' for notes_file in subset(notes_files, 15): # 15 random MIMIC files print 'File', notes_file try: with open(notes_file, 'r') as f: ct = 0 st = [] for line in f: ct += 1 if ct % 50000 == 0: print ct if line.strip() == '</VISIT>': text = read_visit(st) text = tokenizer.tokenize(text) for sent in text: yield nltk.word_tokenize(sent.lower()) st = [] elif line.strip() != '<VISIT>': st += [line.strip()] except IOError: pass
def build_s(data): """ Compute the context vector for each lexelt :param data: dic with the following structure: { lexelt: [(instance_id, left_context, head, right_context, sense_id), ...], ... } :return: dic s with the following structure: { lexelt: [w1,w2,w3, ...], ... } """ s = {} # implement your code here for lexelt in data: words = set() for instance in data[lexelt]: left_context = word_tokenize(instance[1].strip()) for token in left_context[-window_size:]: if token not in puncts: words.add(token) right_context = word_tokenize(instance[3].strip()) for token in right_context[:window_size]: if token not in puncts: words.add(token) s[lexelt] = list(words) return s
def stanford_corenlp_filter(sent):
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger('/Users/gt/Downloads/'
                          'stanford-postagger-2013-06-20/models/'
                          'wsj-0-18-bidirectional-nodistsim.tagger',
                          '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                          '/stanford-postagger-3.2.0.jar', encoding=encoding)

    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = posTagger.tag(tokens)
    filtered_sent = ' '
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # note: 1 concat stemmer(word) == stemmer(1 concat word)
            filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = posTagger.tag(tokens)
    # do not reset filtered_sent here; resetting would discard the block-1
    # features (the nltk_filter variant above keeps both blocks)
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

    return filtered_sent
def getCategories():
    return categories

# A list of tuples with the words of each sentence and the category name
docs = []

for each_category in data.keys():
    for each_sentence in data[each_category]:
        # Remove punctuation
        each_sentence = remove_punctuation(each_sentence)
        print(each_sentence)
        # Extract the words from each sentence and store them in the list
        w = nltk.word_tokenize(each_sentence)
        print("\nTokenized words: ", w)
        words.extend(w)
        docs.append((w, each_category))

# Stem each word, convert to lowercase and remove duplicates
words = [stemmer.stem(w.lower()) for w in words]
words = sorted(list(set(words)))

# Create the lists for the training data
training = []
output = []

# Create an array for the output
output_empty = [0] * len(categories)
testMusicians = list() for genre in testLists: musicians = get_musicians(genre) for musician in musicians: testMusicians.append(musician) all_text = '' for entry in trainingMusicians: text = get_page_text(entry[1]) entry[1] = text all_text += text striptext = all_text.replace('\n\n', ' ') striptext = striptext.replace('\n', ' ') documents = [entry[1] for entry in trainingMusicians] sentences = sent_tokenize(striptext) words = word_tokenize(striptext) texts = [[word for word in document.lower().split() if word not in STOPWORDS and word.isalnum()] for document in documents] dictionary = corpora.Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] lsi = models.LsiModel(corpus, id2word=dictionary) for entry in testMusicians: text = get_page_text(entry[1]) vec_bow = dictionary.doc2bow(text.lower().split()) vec_lsi = lsi[vec_bow] index = similarities.MatrixSimilarity(lsi[corpus]) sims = index[vec_lsi] sims = sorted(enumerate(sims), key=lambda item: -item[1]) entry[1] = trainingMusicians[sims[0][0]][0]
def SentenceAnalysis(self, fulltext, textfdist): #debug_print("Answer.SentenceAnalysis(_,_)", level=5) ans_sentencelist = [] # Perform text normalization, while preserving offsets text = fulltext.replace('\n', ' ') self.ans_text = text # Separate text into sentences # TODO: See if NLTK sentence tokenizer works better ## OLD: p = re.compile(r'.+\.') p = re.compile(r'([\w\"\'\<\(][\S ]+?[\.!?])[ \n\"]') ## OLD: keysen = p.findall(text) # # Commented bellow logic as regex is not working properly.. # # offset = 0 # # keysen = [] # # starts = [] # # ends = [] # # while (len(text) > 0): # # match = p.search(text) # # if not match: # # break # # keysen.append(match.group(0)) # # starts.append(offset + match.start(0)) # # ends.append(offset + match.end(0)) # # text = text[match.end(0) : ] # # offset += match.end(0) keysen = sent_tokenize(text) remove_point_patt = re.compile(r"^\d.") keysen = [x for x in keysen if not remove_point_patt.search(x.strip())] # Create hash entries for each sentence sen_no = 0 for sen in keysen: #debug_print("sen: " + str(sen), level=6) sen_no += 1 # Tokenize text, part-of-speech tag, derive WordNet base word (lemma), and then add information for words found. # Note: An optional part-of-speech tag prefix can be included. # TODO: Isolate text preprocessing code in a separate function sen_text = remove_latex(sen) text = nltk.word_tokenize(sen_text) part_of_speech_tagged_words = nltk.pos_tag(text) text_words = list(nltk.corpus.wordnet.morphy(word.lower()) for (word, tag) in part_of_speech_tagged_words) text_words_proper = list(word for word in text_words if word) if self.use_part_of_speech: # Prefix each word with wordnet part-of-speech indicator (e.g., ['fast', 'car'] => ['a:fast', 'n:car']) text_words_proper = [wordnet.get_part_of_speech(tag) + ":" + word for (word, (token, tag)) in zip(text_words, part_of_speech_tagged_words) if word] ans_sentencelist.append({'StuS': sen, 'StuWords': text_words_proper, 'No': sen_no}) # Compute TF/IDF-style weighting scheme for sentence in ans_sentencelist: #debug_print("sentence: " + str(sentence), level=6) fdist = nltk.FreqDist(sentence['StuWords']) try: max_freq = max([f for f in fdist.values()]) except ValueError: #print_stderr("Exception in Answer.SentenceAnalysis: " + str(sys.exc_info())) max_freq = 1 log_max_freq = math.log(max_freq) if (max_freq > 1) else 1 senvec = {} for word in sorted(textfdist): if fdist[word]: wordfreq = sum(1 for senten in ans_sentencelist if word in senten['StuWords']) if (self.use_true_tf_idf): tf = 1 + math.log(fdist[word]) / log_max_freq idf = 1 + math.log(len(keysen) / wordfreq) senvec[word] = tf * idf else: senvec[word] = (1 + math.log(2.0 * fdist[word])) * math.log(2.0 * len(keysen) / wordfreq) else: senvec[word] = 0 sentence['StuSVec'] = senvec ##debug_print("Answer.SentenceAnalysis(%s,_) => %s" % (str(fulltext), str(ans_sentencelist)), level=6) ##debug_print("\t_ [textfdist]: %s" % str(textfdist), level=7) return ans_sentencelist
try: with open('model/data.pickle', 'rb') as f: words, labels, training, output = pickle.load(f) # Creating an empty list to store some values. except: words = [] labels = [] docs_x = [] docs_y = [] # Creating a loop that would stem the words in the json dataset, # and append them into the list created above. for intent in data['intents']: for pattern in intent['patterns']: wrds = nltk.word_tokenize(pattern) words.extend(wrds) docs_x.append(wrds) docs_y.append(intent['tag']) # Creating an if statement to append the word that are not present in, # the labels list into the label list. if intent['tag'] not in labels: labels.append(intent['tag']) # Stemming the words and converting them into lowercase alphabets, # then setting an if statement to remove the ? character. words = [stemmer.stem(w.lower()) for w in words if w != "?"] words = sorted(list(set(words))) # Sorting the value for the words in labels and saving them into a
def RingNormalize(arrow):
    return RingPrecious(
        nltk.word_tokenize(arrow.lower().translate(remove_dot_ring)))
nltk.download() # In[ ]: import nltk import numpy as np import random import string #To process standard python strings # In[ ]: f = open('/Users/Henry/lordofthering.txt', 'r', errors='ignore') raw = f.read() raw = raw.lower() # converts to lowercase sent_precious = nltk.sent_tokenize(raw) # converts to list of sentences word_precious = nltk.word_tokenize(raw) # converts to list of words # In[ ]: ring = nltk.stem.WordNetLemmatizer() #WordNet is a semantically-oriented dictionary of English included in NLTK. def RingPrecious(swords): return [ring.lemmatize(sword) for sword in swords] remove_dot_ring = dict((ord(dot), None) for dot in string.punctuation) def RingNormalize(arrow):
nltk.download() paragraph = """I have three visions for India. In 3000 years of our history, people from all over the world have come and invaded us, captured our lands, conquered our minds. From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British, the French, the Dutch, all of them came and looted us, took over what was ours. Yet we have not done this to any other nation. We have not conquered anyone. We have not grabbed their land, their culture, their history and tried to enforce our way of life on them. Why? Because we respect the freedom of others.That is why my first vision is that of freedom. I believe that India got its first vision of this in 1857, when we started the War of Independence. It is this freedom that we must protect and nurture and build on. If we are not free, no one will respect us. My second vision for India’s development. For fifty years we have been a developing nation. It is time we see ourselves as a developed nation. We are among the top 5 nations of the world in terms of GDP. We have a 10 percent growth rate in most areas. Our poverty levels are falling. Our achievements are being globally recognised today. Yet we lack the self-confidence to see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect? I have a third vision. India must stand up to the world. Because I believe that unless India stands up to the world, no one will respect us. Only strength respects strength. We must be strong not only as a military power but also as an economic power. Both must go hand-in-hand. My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material. I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. I see four milestones in my career""" # Tokenizing sentences sentences = nltk.sent_tokenize(paragraph) # Tokenizing words words = nltk.word_tokenize(paragraph)
#print ("%s sentences of training data" % len(training_data)) # capture unique stemmed words in the training corpus corpus_words = {} class_words = {} # turn a list into a set (of unique items) and then a list again (this removes duplicates) classes = list(set([a['class'] for a in training_data])) for c in classes: # prepare a list of words within each class class_words[c] = [] # loop through each sentence in our training data for data in training_data: # tokenize each sentence into words for word in nltk.word_tokenize(data['sentence']): # ignore a some things if word not in ["?", "'s"]: # stem and lowercase each word stemmed_word = stemmer.stem(word.lower()) # have we not seen this word already? if stemmed_word not in corpus_words: corpus_words[stemmed_word] = 1 else: corpus_words[stemmed_word] += 1 # add the word to our words in class list class_words[data['class']].extend([stemmed_word]) # we now have each stemmed word and the number of occurances of the word in our training corpus (the word's commonality) #print ("Corpus words and counts: %s \n" % corpus_words)
# Read the data and append SENTENCE_START and SENTENCE_END tokens import os os.chdir(r"C:\Users\s6324900\Desktop\Deep learning\RNN") print "Reading CSV file..." with open('reddit-comments-2015-08.csv', 'rb') as f: reader = csv.reader(f, skipinitialspace=True) reader.next() # Split full comments into sentences sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader]) # Append SENTENCE_START and SENTENCE_END sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences] print "Parsed %d sentences." % (len(sentences)) # Tokenize the sentences into words tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences] # Count the word frequencies word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences)) print "Found %d unique words tokens." % len(word_freq.items()) # Get the most common words and build index_to_word and word_to_index vectors vocab = word_freq.most_common(vocabulary_size-1) index_to_word = [x[0] for x in vocab] index_to_word.append(unknown_token) word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)]) print "Using vocabulary size %d." % vocabulary_size print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]) # Replace all words not in our vocabulary with the unknown token
# TODO - print video title
video_title = doc.find("div", attrs={"class": "cnnVidFooter"}).get_text()
print("Video title: ", video_title)

# TODO - print article content
container = doc.find("div", id="storytext")
content_list = [p.string for p in container.findAll("p") if p.string]
content = "\n".join(content_list)
print("Content: ", content)

# 1-2. Tokenize news article content by words
# TODO - tokenize article content
tokenized_words = nltk.word_tokenize(content)
print(tokenized_words)

# 1-3. POS-Tag tokenized words and sort POS by frequency
# TODO - POS_Tag tokenized words
tagged_list = nltk.pos_tag(tokenized_words)
print(tagged_list)

# TODO - sort POS by frequency
from collections import Counter
counter = Counter([el[1] for el in tagged_list])
print(counter)

# for tag in tagged_list:
#     pass
# Train the unigram tagger uni_tag = ut(cess_sents) X,y = ext_ft(cess_sents) text = "" f = open('PLIEGO_EJEMPLO.docx', 'rb') document = Document(f) for i in document.paragraphs: text +=i.text f.close() print("=========== PRIMEROS 45 ========") print(text[:45]) # obtengo los tokens tokens = word_tokenize(text) words = tokens[:250] print("=========== TOKENS 50 ========") print(words) """archivo_salida = open("my_tokens.txt","w") for word in tokens: print(word,file=archivo_salida) """ text = nltk.Text(tokens) print(type(tokens)) print(len(tokens)) print(tokens[:10]) text.collocations()
def predict_sentence(self, sentence):
    sample = [[tag for _, tag in self.tagger.tag(word_tokenize(sentence))]]
    probs = [model.test(sample) for model in self.hmm_models]
    return probs.index(max(probs))
def normalize(text):
    '''Return normalized and lemmatized tokens in text.'''
    text = removePunct(text)
    tokens = nltk.word_tokenize(text)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
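# Hedged illustration (not from the original source): with the WordNet data
# installed, the default (noun) lemmatizer maps plural nouns to their lemmas.
lemmatizer = nltk.stem.WordNetLemmatizer()
print([lemmatizer.lemmatize(tok) for tok in nltk.word_tokenize("The dogs are barking")])
# ['The', 'dog', 'are', 'barking']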
import urllib2
import os, sys
import nltk
import bs4
from bs4 import BeautifulSoup
from nltk import word_tokenize, pos_tag

# Storing the link in a variable
hackernews = "https://news.ycombinator.com/"
page = urllib2.urlopen(hackernews)
soup = BeautifulSoup(page, "html.parser")
page_content = soup.prettify()

# Storing the file in a variable
sent = open("filename.txt").read()

# Tokenizing the words stored in the file
words = nltk.word_tokenize(sent)

# Tagging the words of the file based on their parts of speech
tags = nltk.pos_tag(words)

# Taking input from the user
word_key = raw_input("Enter any keyword to be searched: ")

# Extracting the nouns (keywords) from the file
nouns = [word for word, pos in tags
         if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]

# Checking whether the keyword entered by the user is present in the file
with open("filename.txt") as openfile:
    for line in openfile:
        for part in line.split():
            if word_key in part:
                all_links = soup.find_all("a")
                for link in all_links:
                    print link.get("href")
def clean_up_sentence(sentence):
    # tokenize the pattern
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words
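# Hedged usage sketch (assumption): `stemmer` has to be created elsewhere,
# e.g. a LancasterStemmer as in the common chatbot tutorials.
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
print(clean_up_sentence("How are you doing today?"))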
line = ast.literal_eval(line) # print line # exit(1) json_data = json.loads(json.dumps(line)) dict[i] = json_data i += 1 l = [] wd = [] for i in dict: for j in dict[i]: if j == 'reviewText': l.append(str(dict[i][j])) for i in l: text = nltk.word_tokenize(i) tagged = nltk.pos_tag(text) nouns = [word for word, pos in tagged if (pos == 'NN') ] # or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')] wd.append(review_word(' '.join(nouns))) # PROGRESS : List of list of nouns in a review dictionary = corpora.Dictionary(wd) corpus = [dictionary.doc2bow(text) for text in wd] features = [] if __name__ == "__main__": ldamodel = models.LdaModel(corpus, id2word=dictionary,
def tag_sent(raw_file,new_file,labelfile,max_token=50): train_json_file = open(new_file, 'w') file = open(raw_file, 'r') sentences_0 = file.readlines() c = 0 Tkk = [] ii = 0 Tkk = {} vV = [] Mlabel = {} Alabel = {} count_r = {} label = [] f = open(labelfile, 'r') fr = f.readlines() label = [line.strip('\n') for line in fr] f.close() for line in sentences_0: c += 1 kk = 1 count_r[c - 1] = 0 Tkk[c - 1] = 0 if not c % 10000: print(c) sent = json.loads(line.strip('\r\n')) flag = 0 sentText = str(unicodedata.normalize('NFKD', sent['sentText']).encode('ascii', 'ignore')).rstrip('\n').rstrip('\r') #sentText=sent['sentText'] python3 tokens = nltk.word_tokenize(sentText) tags=["O"]*len(tokens) emIndexByText = {} for em in sent['entityMentions']: emText = unicodedata.normalize('NFKD', em['text']).encode('ascii', 'ignore') # emText=em['text'] tokens1 = tokens em1 = emText.split() flagE = True if emIndexByText.__contains__(emText): flagE = False while flagE: start, end = find_index(tokens1, em1) if start != -1 and end != -1: tokens1 = tokens1[end:] if emText not in emIndexByText: emIndexByText[emText] = [(start, end)] elif not emIndexByText[emText].__contains__((start, end)): offset = emIndexByText[emText][-1][1] emIndexByText[emText].append((start + offset, end + offset)) else: break for rm in sent['relationMentions']: if not rm['label'].__eq__('None') and label.__contains__(rm['label']): rmlabel = rm["label"] if not Alabel.__contains__(rmlabel): Alabel[rmlabel] = [c - 1] else: Alabel[rmlabel].append(c - 1) em1 = unicodedata.normalize('NFKD', rm['em1Text']).encode('ascii', 'ignore') em2 = unicodedata.normalize('NFKD', rm['em2Text']).encode('ascii', 'ignore') # em1 = rm["em1Text"] #python3 # em2=rm['em2Text'] #python3 if emIndexByText.__contains__(em1) and emIndexByText.__contains__(em2): ind1 = emIndexByText[em1] ind2 = emIndexByText[em2] minind = len(tokens) labelindex = [] for i1ind, i1 in enumerate(ind1): for i2ind, i2 in enumerate(ind2): if (i2[0] - i1[1]) * (i2[1] - i1[0]) > 0: if minind > abs(i2[1] - i1[1]): minind = abs(i2[1] - i1[1]) labelindex = [i1ind, i2ind] if labelindex: i1ind = labelindex[0] i2ind = labelindex[1] start1 = ind1[i1ind][0] end1 = ind1[i1ind][1] start2 = ind2[i2ind][0] end2 = ind2[i2ind][1] tag1Previous = [] tag2Previous = [] if end1 - start1 == 1: tag1Previous.append(rmlabel + "__E1S") elif end1 - start1 == 2: tag1Previous.append(rmlabel + "__E1B") tag1Previous.append(rmlabel + "__E1L") else: tag1Previous.append(rmlabel + "__E1B") for ei in range(start1 + 1, end1 - 1): tag1Previous.append(rmlabel + "__E1I") tag1Previous.append(rmlabel + "__E1L") if end2 - start2 == 1: tag2Previous.append(rmlabel + "__E2S") elif end2 - start2 == 2: tag2Previous.append(rmlabel + "__E2B") tag2Previous.append(rmlabel + "__E2L") else: tag2Previous.append(rmlabel + "__E2B") for ei in range(start2 + 1, end2 - 1): tag2Previous.append(rmlabel + "__E2I") tag2Previous.append(rmlabel + "__E2L") while True: valid1 = True vT1 = 0 for ei in range(start1, end1): if not tags[ei].__eq__('O'): valid1 = False break if not valid1: valid1 = True vT1 = 1 for ei in range(start1, end1): if not tags[ei].__eq__(tag1Previous[ei - start1]): valid1 = False vT1 = 0 break valid2 = True vT2 = 0 for ei in range(start2, end2): if not tags[ei].__eq__('O'): valid2 = False break if not valid2: valid2 = True vT2 = 1 for ei in range(start2, end2): if not tags[ei].__eq__(tag2Previous[ei - start2]): valid2 = False vT2 = 0 break if valid1 and valid2: for ei in range(start2, end2): tags[ei] = tag2Previous[ei - start2] for ei in 
range(start1, end1): tags[ei] = tag1Previous[ei - start1] Tkk[c - 1] = kk if not (vT1 and vT2): ii += 1 count_r[c - 1] += 1 if not Mlabel.__contains__(rmlabel): Mlabel[rmlabel] = [c - 1] else: Mlabel[rmlabel].append(c - 1) flag = 1 if (vT1 or vT2) and not (vT1 and vT2): vV.append(c - 1) break else: start1 += len(tokens) end1 += len(tokens) start2 += len(tokens) end2 += len(tokens) if end2 > kk * len(tokens): kk += 1 for ki in range(len(tokens)): tags.append('O') newsent = dict() newsent['tokens'] = tokens newsent['tags'] = tags newsent['lentags/lentokens'] = kk * flag train_json_file.write(json.dumps(newsent) + '\n') train_json_file.close() return Tkk, vV, ii, Alabel, count_r, Mlabel
def parseInput(userInput):
    '''Uses nltk's pos_tag to tag the parts of speech of the user's input'''
    tokens = nltk.word_tokenize(userInput)
    pos = nltk.pos_tag(tokens)
    return pos
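# Minimal usage sketch (not from the original source); requires the NLTK
# 'punkt' and 'averaged_perceptron_tagger' data packages. The output will
# resemble: [('Show', 'NNP'), ('me', 'PRP'), ('the', 'DT'), ('nearest', 'JJ'), ('restaurant', 'NN')]
print(parseInput("Show me the nearest restaurant"))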
import nltk from nltk.corpus import stopwords import string import re import pdb from nltk.stem import PorterStemmer from nltk.stem import WordNetLemmatizer # nltk.download('wordnet') stemmer = PorterStemmer() lemmatizer = WordNetLemmatizer() with open("comments") as f: t = f.read() t = re.sub(r'http\S+', '', t) tokens = [nltk.word_tokenize(item) for item in t.split('\n\n\n\n')] # pdb.set_trace() # print(tokens) stop_words = stopwords.words('english') stop_words.append('I') cleantext = [] with open("comments2",'w') as file: for comment in tokens: for word in comment: word = re.sub('[^A-Za-z0-9]+', '', word) if (word not in stop_words and word!='' and word!='\n'): # print(word)
def pos_tag_words(self):
    pos_text = nltk.pos_tag(nltk.word_tokenize(self.output['preprocessed']))
    self.output['pos_tagged'] = " ".join(
        [pos + "-" + word for word, pos in pos_text])
def read_jsonfile(self, json_file, create_vocab, is_test, test_state): # print 'json file ', json_file try: with open(json_file, "rb") as file: dialogue = orjson.loads(file.read()) #dialogue = json.load(open(json_file)) except json.decoder.JSONDecodeError: return None filter(None, dialogue) # dialogue_multimodal is a list of training instances, each of len max_utter, and each ending with a system response. Whenever a dialogue has context less than max_utter, it is accordingly padded dialogue_vocab = {} dialogue_multimodal = [] if self.task_type == "text": dialogue_context_text_task_text = [] dialogue_context_image_task_text = [] dialogue_target_text_task_text = [] if self.task_type == "image": dialogue_context_text_task_image = [] dialogue_context_image_task_image = [] dialogue_target_image_task_image = [] dialogue_instance_multimodal = [] for utterances in dialogue: if utterances is None or len(utterances) == 0: continue if not isinstance(utterances, list): utterances = [utterances] for utterance in utterances: if utterance is None: continue if not isinstance(utterance, dict): print('impossible ', utterance, json_file) raise Exception('error in reading dialogue json') continue speaker = utterance['speaker'] if 'images' not in utterance[ 'utterance'] or 'nlg' not in utterance['utterance']: continue images = utterance['utterance']['images'] nlg = utterance['utterance']['nlg'] if nlg is not None: nlg = nlg.strip() # .encode('utf-8') if nlg is None: nlg = "" try: nlg = nlg.lower() except AttributeError: pass try: nlg = nlg.replace("|", "") except TypeError: pass nlg_words = nltk.word_tokenize(nlg) # nlg_words = [x.encode('utf-8') for x in nlg_words] if create_vocab: self.word_counter.update(nlg_words) dialogue_instance_multimodal.append({ 'images': images, 'nlg': nlg }) if speaker == "system" and ( test_state is None or is_test is False or (last_question_type is not None and test_state is not None and is_test is True and test_state in last_question_type)): last_utterance = dialogue_instance_multimodal[-1] # print 'dialogue instance ',[x['nlg'] for x in dialogue_instance_multimodal[:-1]] # print 'last utterance ', last_utterance['nlg'] # print '' if self.task_type == "text" and ( last_utterance['nlg'] is None or last_utterance['nlg'] == ""): continue if self.task_type == "image" and ( last_utterance['images'] is None or len(last_utterance['images']) == 0): continue padded_clipped_dialogue = self.pad_or_clip_dialogue( dialogue_instance_multimodal) if len(padded_clipped_dialogue) != (self.max_utter + 1): raise Exception( 'some problem with dialogue instance, len != max_utter+1' ) # dialogue_instance_task_test is a max_utter length list of utterances where the last utterance in the list is the target utterance dialogue_instance_text_context = [ x['nlg'] if x['nlg'] is not None else '' for x in padded_clipped_dialogue[:-1] ] # dialogue_instance_task_image is a max_utter length list of image-lists where the last entry in the list is a single image instead of a list and it is the target image dialogue_instance_image_context = [ x['images'] if x['images'] is not None else [] for x in padded_clipped_dialogue[:-1] ] # print 'dialogue_instance_text_context ', dialogue_instance_text_context # print '' # print 'dialogue_instance_image_context ', dialogue_instance_image_context if len(dialogue_instance_text_context) != self.max_utter: raise Exception( 'len(dialogue_instance_text_context)!=self.max_utter' ) if len(dialogue_instance_image_context) != self.max_utter: raise Exception( 
'len(dialogue_instance_image_context)!=self.max_utter' ) if self.task_type == "text": dialogue_target_text = dialogue_instance_multimodal[ -1]['nlg'] dialogue_instance_context_text_task_text = copy.deepcopy( dialogue_instance_text_context) dialogue_instance_context_image_task_text = copy.deepcopy( dialogue_instance_image_context) dialogue_context_text_task_text.append( dialogue_instance_context_text_task_text) dialogue_context_image_task_text.append( dialogue_instance_context_image_task_text) dialogue_target_text_task_text.append( dialogue_target_text) if self.task_type == "image": dialogue_target_images = dialogue_instance_multimodal[ -1]['images'] for image in images: dialogue_instance_context_text_task_image = copy.deepcopy( dialogue_instance_text_context) dialogue_instance_context_image_task_image = copy.deepcopy( dialogue_instance_image_context) dialogue_context_text_task_image.append( dialogue_instance_context_text_task_image) dialogue_context_image_task_image.append( dialogue_instance_context_image_task_image) dialogue_target_image_task_image.append(image) if 'question-type' in utterance and test_state is not None: last_question_type = utterance['question-type'] elif speaker != "system": last_question_type = None if self.task_type == "text": with open(self.dialogue_context_text_task_text_file, 'a') as fp: for dialogue_instance in dialogue_context_text_task_text: dialogue_instance = '|'.join(dialogue_instance) fp.write(dialogue_instance + '\n') with open(self.dialogue_context_image_task_text_file, 'a') as fp: for dialogue_instance in dialogue_context_image_task_text: image_context = None if len(dialogue_instance) != self.max_utter: raise Exception( 'len(dialogue_instance_image_context)!=self.max_utter' ) for images in dialogue_instance: if image_context is None: try: image_context = ",".join(images) except TypeError: # If images = [None] image_context = ",".join([]) else: try: image_context = image_context + "|" + ",".join( images) except TypeError: # If images = [None] image_context = image_context + "|" + ",".join( []) if len(image_context.split("|")) != self.max_utter: raise Exception( 'len(dialogue_instance_image_context)!=self.max_utter' ) fp.write(image_context + '\n') with open(self.dialogue_target_text_task_text_file, 'a') as fp: for dialogue_instance in dialogue_target_text_task_text: fp.write(dialogue_instance + '\n') if self.task_type == "image": with open(self.dialogue_context_text_task_image_file, 'a') as fp: for dialogue_instance in dialogue_context_text_task_image: dialogue_instance = '|'.join(dialogue_instance) fp.write(dialogue_instance + '\n') with open(self.dialogue_context_image_task_image_file, 'a') as fp: for dialogue_instance in dialogue_context_image_task_image: image_context = None if len(dialogue_instance) != self.max_utter: raise Exception( 'len(dialogue_instance_image_context)!=self.max_utter' ) for images in dialogue_instance: if image_context is None: image_context = ",".join(images) else: image_context = image_context + "|" + ",".join( images) if len(image_context.split("|")) != self.max_utter: raise Exception( 'len(dialogue_instance_image_context)!=self.max_utter' ) fp.write(image_context + '\n') with open(self.dialogue_target_image_task_image_file, 'a') as fp: for dialogue_instance in dialogue_target_image_task_image: fp.write(dialogue_instance + '\n')
def trainModel(data, stemmer): #print(data["intents"]) try: with open("data.pickle", "rb") as f: words,labels, training, output = pickle.load(f) except: words = [] labels = [] docs_x = [] docs_y = [] for intent in data["intents"]: if intent["tag"] not in labels: labels.append(intent["tag"]) for pattern in intent["patterns"]: wrds = nltk.word_tokenize(pattern) words.extend(wrds) docs_x.append(wrds) docs_y.append(intent["tag"]) words = [stemmer.stem(w.lower()) for w in words if w != "?"] words = sorted(list(set(words))) labels = sorted(labels) training = [] output = [] out_empty = [0 for _ in range(len(labels))] for x, doc in enumerate(docs_x): bag =[] wrds = [stemmer.stem(w) for w in doc] for w in words: if w in wrds: bag.append(1) else: bag.append(0) output_row = out_empty[:] output_row[labels.index(docs_y[x])] = 1 training.append(bag) output.append(output_row) training = numpy.array(training) output = numpy.array(output) with open("data.pickle", "wb") as f: pickle.dump((words,labels, training, output),f) tensorflow.reset_default_graph() net = tflearn.input_data(shape=[None, len(training[0])]) net = tflearn.fully_connected(net, 8) net = tflearn.fully_connected(net, 8) net = tflearn.fully_connected(net, len(output[0]), activation="softmax") net = tflearn.regression(net) model = tflearn.DNN(net) try: model.load("./model.tflearn") except RuntimeError: model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True) model.save("./model.tflearn") return (words,labels, model)
def process_questions(args): ''' Encode question tokens''' print('Loading data') with open(args.annotation_file, 'r') as dataset_file: instances = json.load(dataset_file) # Either create the vocab or load it from disk if args.mode in ['train']: print('Building vocab') answer_cnt = {} for instance in instances: answer = instance['answer'] answer_cnt[answer] = answer_cnt.get(answer, 0) + 1 answer_token_to_idx = {'<UNK0>': 0, '<UNK1>': 1} answer_counter = Counter(answer_cnt) frequent_answers = answer_counter.most_common(args.answer_top) total_ans = sum(item[1] for item in answer_counter.items()) total_freq_ans = sum(item[1] for item in frequent_answers) print("Number of unique answers:", len(answer_counter)) print("Total number of answers:", total_ans) print("Top %i answers account for %f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans)) for token, cnt in Counter(answer_cnt).most_common(args.answer_top): answer_token_to_idx[token] = len(answer_token_to_idx) print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) question_token_to_idx = {'<NULL>': 0, '<UNK>': 1} for i, instance in enumerate(instances): question = instance['question'].lower()[:-1] for token in nltk.word_tokenize(question): if token not in question_token_to_idx: question_token_to_idx[token] = len(question_token_to_idx) print('Get question_token_to_idx') print(len(question_token_to_idx)) vocab = { 'question_token_to_idx': question_token_to_idx, 'answer_token_to_idx': answer_token_to_idx, 'question_answer_token_to_idx': {'<NULL>': 0, '<UNK>': 1} } print('Write into %s' % args.vocab_json.format(args.dataset, args.dataset)) with open(args.vocab_json.format(args.dataset, args.dataset), 'w') as f: json.dump(vocab, f, indent=4) else: print('Loading vocab') with open(args.vocab_json.format(args.dataset, args.dataset), 'r') as f: vocab = json.load(f) # Encode all questions print('Encoding data') questions_encoded = [] questions_len = [] question_ids = [] video_ids_tbw = [] video_names_tbw = [] all_answers = [] for idx, instance in enumerate(instances): question = instance['question'].lower()[:-1] question_tokens = nltk.word_tokenize(question) question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True) questions_encoded.append(question_encoded) questions_len.append(len(question_encoded)) question_ids.append(idx) im_name = instance['video_id'] video_ids_tbw.append(im_name) video_names_tbw.append(im_name) if instance['answer'] in vocab['answer_token_to_idx']: answer = vocab['answer_token_to_idx'][instance['answer']] elif args.mode in ['train']: answer = 0 elif args.mode in ['val', 'test']: answer = 1 all_answers.append(answer) max_question_length = max(len(x) for x in questions_encoded) for qe in questions_encoded: while len(qe) < max_question_length: qe.append(vocab['question_token_to_idx']['<NULL>']) questions_encoded = np.asarray(questions_encoded, dtype=np.int32) questions_len = np.asarray(questions_len, dtype=np.int32) print(questions_encoded.shape) glove_matrix = None if args.mode == 'train': token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()} print("Load glove from %s" % args.glove_pt) glove = pickle.load(open(args.glove_pt, 'rb')) dim_word = glove['the'].shape[0] glove_matrix = [] for i in range(len(token_itow)): vector = glove.get(token_itow[i], np.zeros((dim_word,))) glove_matrix.append(vector) glove_matrix = np.asarray(glove_matrix, dtype=np.float32) print(glove_matrix.shape) print('Writing', args.output_pt.format(args.dataset, args.dataset, 
args.mode)) obj = { 'questions': questions_encoded, 'questions_len': questions_len, 'question_id': question_ids, 'video_ids': np.asarray(video_ids_tbw), 'video_names': np.array(video_names_tbw), 'answers': all_answers, 'glove': glove_matrix, } with open(args.output_pt.format(args.dataset, args.dataset, args.mode), 'wb') as f: pickle.dump(obj, f)
def binarize_corpora(self, dialogue_file_text, dialogue_file_image, dialogue_target_file, task_type, dialogue_pkl_file):
    binarized_corpus = []
    binarized_corpus_text_context = []
    binarized_corpus_image_context = []
    binarized_corpus_target = []
    unknowns = 0.
    num_terms = 0.
    freqs = collections.defaultdict(lambda: 0)
    df = collections.defaultdict(lambda: 0)
    num_instances = 0
    with open(dialogue_file_text) as textlines, open(dialogue_file_image) as imagelines, open(dialogue_target_file) as targetlines:
        for text_context, image_context, target in zip(textlines, imagelines, targetlines):
            text_context = text_context.lower().strip()
            image_context = image_context.strip()
            target = target
            # print 'text_content ', text_context
            # print 'image_content ', image_context
            # print 'target ', target
            # print ''
            num_instances += 1
            if num_instances % 10000 == 0:
                print('finished ', num_instances, ' instances')
            utterances = text_context.split('|')
            binarized_text_context = []
            for utterance in utterances:
                try:
                    utterance_words = nltk.word_tokenize(utterance)
                except:
                    utterance_words = utterance.split(' ')
                utterance_words = self.pad_or_clip_utterance(utterance_words)
                if self.end_word_symbol not in utterance_words:
                    print('utterance ', utterance)
                    print('utterance words ', utterance_words)
                    raise Exception('utterance does not have end symbol')
                utterance_word_ids = []
                for word in utterance_words:
                    word_id = self.vocab_dict.get(word, self.unknown_word_id)
                    utterance_word_ids.append(word_id)
                    unknowns += 1 * (word_id == self.unknown_word_id)
                    freqs[word_id] += 1
                if self.end_word_id not in utterance_word_ids:
                    print('utterance word ids ', utterance_word_ids)
                    raise Exception('utterance word ids does not have end word id')
                num_terms += len(utterance_words)
                unique_word_indices = set(utterance_word_ids)
                for word_id in unique_word_indices:
                    df[word_id] += 1
                binarized_text_context.append(utterance_word_ids)
            if len(binarized_text_context) != self.max_utter:
                raise Exception('binarized_text_context should be a list of length max_utter, found length ', len(binarized_text_context))
            binarized_image_context = [self.pad_or_clip_images(x.split(",")) for x in image_context.split('|')]
            if len(binarized_image_context) != self.max_utter:
                raise Exception('binarized_image_context should be a list of length max_utter, found length ', len(binarized_image_context))
            binarized_target = None
            if task_type == "text":
                utterance = target
                try:
                    utterance_words = nltk.word_tokenize(utterance)
                except:
                    utterance_words = utterance.split(' ')
                utterance_words = self.pad_or_clip_utterance(utterance_words)
                if self.end_word_symbol not in utterance_words:
                    print('utterance ', utterance)
                    print('utterance words ', utterance_words)
                    raise Exception('utterance does not have end symbol')
                utterance_word_ids = []
                for word in utterance_words:
                    word_id = self.vocab_dict.get(word, self.unknown_word_id)
                    utterance_word_ids.append(word_id)
                    unknowns += 1 * (word_id == self.unknown_word_id)
                    freqs[word_id] += 1
                if self.end_word_id not in utterance_word_ids:
                    print('utterance word ids ', utterance_word_ids)
                    raise Exception('utterance word ids does not have end word id')
                num_terms += len(utterance_words)
                unique_word_indices = set(utterance_word_ids)
                for word_id in unique_word_indices:
                    df[word_id] += 1
                binarized_target = utterance_word_ids
            if task_type == "image":
                binarized_target = target
            # binarized_corpus_text_context.append(binarized_text_context)
            # binarized_corpus_image_context.append(binarized_image_context)
            # binarized_corpus_target.append(binarized_target)
            binarized_corpus.append([binarized_text_context, binarized_image_context, binarized_target])
    # binarized_corpus = [binarized_corpus_text_context, binarized_corpus_image_context, binarized_corpus_target]
    self.safe_pickle(binarized_corpus, dialogue_pkl_file)
    if not os.path.isfile(self.vocab_file):
        self.safe_pickle([(word, word_id, freqs[word_id], df[word_id]) for word, word_id in self.vocab_dict.items()], self.vocab_stats_file)
        inverted_vocab_dict = {word_id: word for word, word_id in self.vocab_dict.items()}
        self.safe_pickle(inverted_vocab_dict, self.vocab_file)
        print('dumped vocab in ', self.vocab_file)
    self.logger.info("Number of unknowns %d" % unknowns)
    self.logger.info("Number of terms %d" % num_terms)
    self.logger.info("Mean document length %f" % float(sum(map(len, binarized_corpus)) / len(binarized_corpus)))
    self.logger.info("Writing training %d dialogues (%d left out)" % (len(binarized_corpus), num_instances + 1 - len(binarized_corpus)))
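# Note: pad_or_clip_utterance and pad_or_clip_images above are class helpers that
# are not shown in this snippet. The method below is a hypothetical sketch of the
# utterance helper, assuming attributes self.max_len, self.end_word_symbol and
# self.pad_symbol exist on the class; it only illustrates the clip-then-pad
# contract that binarize_corpora relies on.
def pad_or_clip_utterance(self, utterance_words):
    # Clip long utterances, reserving one slot for the end symbol
    if len(utterance_words) > self.max_len - 1:
        utterance_words = utterance_words[:self.max_len - 1]
    utterance_words.append(self.end_word_symbol)
    # Pad short utterances with the padding symbol up to a fixed length
    utterance_words.extend([self.pad_symbol] * (self.max_len - len(utterance_words)))
    return utterance_words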
vector_space.loc[sentence, 'cue-words-score'] = (matches / len(sentence_mapper[sentence]))
# This metric is very biased: if only one cue-word is given, its mere presence
# gives a score of 1, which strongly favours those sentences.
# A possible refinement: use the sentence length in the denominator and count the
# words whose similarity to the cue words exceeds 0.50 (currently only exact
# cue-word matches are counted).

# ---------- Rating a sentence according to position and length ------------
# Use Barrera and Verma's first model to score a sentence based on its position
total_sentences = len(original_sentences)
alpha = 2
for index, sentence in enumerate(original_sentences):
    vector_space.loc[sentence, 'position-score'] = (np.cos((2 * np.pi * index) / (total_sentences - 1)) + alpha - 1) / alpha
    vector_space.loc[sentence, 'length-score'] = len(word_tokenize(sentence))

mean = np.mean(vector_space['length-score'])
std_dev = np.sqrt(np.var(vector_space['length-score']))
max_val = max(
    np.abs(min(vector_space['length-score']) - mean) / std_dev,
    np.abs(max(vector_space['length-score']) - mean) / std_dev)
# Rate mid-sized sentences higher
vector_space['length-score'] = vector_space['length-score'].apply(
    lambda val: max_val - np.abs(mean - val) / std_dev)

# ------- Summarization Finalized Results --------
# Calculate the final score for each sentence using the
# tf-score, length-score, position-score, cue-words-score and paragraph-score
vector_space['sentence-score'] = vector_space.apply(
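# Standalone sketch of the position score used above (Barrera and Verma's first
# model). Sentences at the start and end of the document score 1, while sentences
# in the middle approach (alpha - 2) / alpha; with alpha = 2 the middle sentence
# of a document scores 0.
import numpy as np

def position_score(index, total_sentences, alpha=2):
    return (np.cos((2 * np.pi * index) / (total_sentences - 1)) + alpha - 1) / alpha

# e.g. for a 5-sentence document:
# [round(position_score(i, 5), 2) for i in range(5)] -> [1.0, 0.5, 0.0, 0.5, 1.0]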
def make_tokenized_matrix_eng(self, texts: List[str]):
    self.tokenized_matrix = []
    print('making tokenized matrix...')
    for text in tqdm(texts):
        self.tokenized_matrix.append(
            [self.lemmatizer(word) for word in word_tokenize(text)])
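# Minimal standalone sketch of the same idea, assuming nltk's WordNetLemmatizer
# stands in for the injected self.lemmatizer callable (the class above may use a
# different lemmatizer).
from typing import List
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

def make_tokenized_matrix(texts: List[str]) -> List[List[str]]:
    lemmatize = WordNetLemmatizer().lemmatize
    # Each row is the list of lemmatized tokens for one input text
    return [[lemmatize(word) for word in word_tokenize(text)] for text in texts]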
def _word_tokenize(self, text): return nltk.word_tokenize(text)
def prepText(sentence):
    tokenized = word_tokenize(sentence)
    text = nltk.Text(tokenized)
    return text
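# Usage sketch: prepText wraps the token list in an nltk.Text object, which
# supports corpus-style queries such as concordance lookups.
sample = prepText("The quick brown fox jumps over the lazy dog near the quick river.")
print(sample.tokens[:5])      # first few tokens
sample.concordance("quick")   # print each occurrence of "quick" in context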
# We import the extractor function from the extract_name.py file. The extractor
# function helps us extract important features from the report, as mentioned in
# the first para.
import os
import re, nltk, psycopg2
import dates, a_name
import extract_name
from extract_name import extractor

d = 0

# In the next statement we open the r_fil_date.txt file. It contains the location
# of the dates file for each company id; e.g. /home/finance/data/600045616/dates
# happens to be the dates file of the company with company id 600045616.
# The file is opened in text mode so that word_tokenize receives a str.
fo = open("/home/finance/reports2sql/r_fil_date.txt", "r")
raw = fo.read()
fo.close()

# We use nltk.word_tokenize to break the raw data into tokens, where each token
# is the location of a dates file for a company id.
locs = nltk.word_tokenize(raw)

# We loop here to go through every dates file. Then, from the corresponding dates
# file, we extract the dates on which reports were published. From the reports we
# extract the relevant features and put them into our database ratings1.
for t in locs:
    d = d + extractor(t)
def text_to_tokens(src_file,
                   body_start=0,
                   body_end=-1,
                   chap_pat=r'^\s*Chapter.*$',
                   para_pat=r'\n\n+',
                   sent_pat=r'([.;?!"“”]+)',
                   token_pat=r'([\W_]+)'):

    # Text to lines
    lines = open(src_file, 'r', encoding='utf-8').readlines()
    lines = lines[body_start - 1:body_end + 1]
    df = pd.DataFrame({'line_str': lines})
    df.index.name = 'line_id'
    del lines

    # Fix characters to improve tokenization
    df.line_str = df.line_str.str.replace('—', ' — ')
    df.line_str = df.line_str.str.replace('-', ' - ')

    # Lines to chapters
    mask = df.line_str.str.match(chap_pat)
    df.loc[mask, 'chap_id'] = df.apply(lambda x: x.name, 1)
    df.chap_id = df.chap_id.ffill().bfill().astype('int')
    chap_ids = df.chap_id.unique().tolist()
    df['chap_num'] = df.chap_id.apply(lambda x: chap_ids.index(x))
    chaps = df.groupby('chap_num')\
        .apply(lambda x: ''.join(x.line_str))\
        .to_frame()\
        .rename(columns={0: 'chap_str'})
    del df

    # Chapters to paragraphs
    paras = chaps.chap_str.str.split(para_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0: 'para_str'})
    paras.index.names = PARAS
    paras.para_str = paras.para_str.str.strip()
    paras.para_str = paras.para_str.str.replace(r'\n', ' ')
    paras.para_str = paras.para_str.str.replace(r'\s+', ' ')
    paras = paras[~paras.para_str.str.match(r'^\s*$')]
    del chaps

    # Paragraphs to sentences
    # sents = paras.para_str.str.split(sent_pat, expand=True)\
    sents = paras.para_str\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame()\
        .rename(columns={0: 'sent_str'})
    sents.index.names = SENTS
    del paras

    # Sentences to tokens
    # tokens = sents.sent_str.str.split(token_pat, expand=True)\
    tokens = sents.sent_str\
        .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
        .stack()\
        .to_frame()\
        .rename(columns={0: 'pos_tuple'})
    tokens.index.names = OHCO
    del sents
    tokens['pos'] = tokens.pos_tuple.apply(lambda x: x[1])
    tokens['token_str'] = tokens.pos_tuple.apply(lambda x: x[0])
    tokens = tokens.drop(columns='pos_tuple')

    # Tag punctuation and numbers
    tokens['punc'] = tokens.token_str.str.match(r'^[\W_]*$').astype('int')
    tokens['num'] = tokens.token_str.str.match(r'^.*\d.*$').astype('int')

    # Extract vocab with minimal normalization
    WORDS = (tokens.punc == 0) & (tokens.num == 0)
    tokens.loc[WORDS, 'term_str'] = tokens.token_str.str.lower()\
        .str.replace(r'["_*.]', '')
    vocab = tokens[tokens.punc == 0].term_str.value_counts().to_frame()\
        .reset_index()\
        .rename(columns={'index': 'term_str', 'term_str': 'n'})
    vocab = vocab.sort_values('term_str').reset_index(drop=True)
    vocab.index.name = 'term_id'

    # Get priors for V
    vocab['p'] = vocab.n / vocab.n.sum()

    # Add stems
    stemmer = nltk.stem.porter.PorterStemmer()
    vocab['port_stem'] = vocab.term_str.apply(lambda x: stemmer.stem(x))

    # Define stopwords
    sw = pd.DataFrame({'x': 1}, index=nltk.corpus.stopwords.words('english'))
    vocab['stop'] = vocab.term_str.map(sw.x).fillna(0).astype('int')
    del sw

    # Add term_ids to tokens
    tokens['term_id'] = tokens['term_str'].map(vocab.reset_index()\
        .set_index('term_str').term_id).fillna(-1).astype('int')

    return tokens, vocab
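# Hypothetical usage of text_to_tokens. PARAS, SENTS and OHCO are assumed to be
# module-level lists of hierarchical index names, e.g. as sketched below; the
# source file and body offsets are placeholders.
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']
PARAS = OHCO[:2]
SENTS = OHCO[:3]

tokens, vocab = text_to_tokens('novel.txt', body_start=10, body_end=12680)
print(tokens.head())                                   # tokens indexed by (chapter, paragraph, sentence, token)
print(vocab.sort_values('n', ascending=False).head())  # most frequent terms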