def getFormattingFeatures(obj):
    question = obj["question_text"].strip()
    topics = [t["name"] for t in obj["topics"]]
    tokens = [w for w in wordpunct_tokenize(question)
              if not re.match(r"[\'\"\.\?\!\,\/\\\(\)\`]", w)]
    punct = [p for p in wordpunct_tokenize(question)
             if re.match(r"[\'\"\.\?\!\,\/\\\(\)\`]", p)]
    top_toks = set([w.lower() for t in obj["topics"] for w in wordpunct_tokenize(t["name"])])
    qn_toks = set(tokens)
    qn_topic_words = len(top_toks & qn_toks)
    start_cap = 1 if re.match(r"^[A-Z]", question) else 0
    if len(tokens) > 0:
        qn_type = [1 if sum(1.0 for w in tokens if w in qws) else 0 for qws in qn_type_words]
    else:
        # penalize having no token words
        qn_type = [-1.0] * len(qn_type_words)
    total_words = len(tokens)
    correct_form_count = sum(1.0 for w in tokens
                             if (not re.match(r"^[A-Z]+$", w)) or re.match(r"^[A-Z]", w))
    topic_word_ratio1 = max(0, qn_topic_words - 2) / float(total_words + 1)
    topic_word_ratio2 = max(0, 2 - qn_topic_words) / float(total_words + 1)
    topic_word_ratio = qn_topic_words / float(total_words + 1)
    punctuation_ratio = len(punct) / float(total_words + 1)
    word_overshoot = max(0, total_words - 10.1)
    word_undershoot = max(0, 10.1 - total_words)
    result = [
        start_cap,
        punctuation_ratio,
        math.log(len(topics) + 1),
        topic_word_ratio1,
        topic_word_ratio2,
        topic_word_ratio,
        word_overshoot,
        word_undershoot,
    ] + qn_type
    return result
def getResult(textFile, ind1, ind2, outFile, outFile2):
    fout = open(outFile, "w")
    fout2 = open(outFile2, "w")
    # probs = []
    for line in open(textFile):
        hyp1 = wordpunct_tokenize(line.strip().split("|||")[ind1].strip().decode("utf-8"))
        hyp2 = wordpunct_tokenize(line.strip().split("|||")[ind2].strip().decode("utf-8"))
        f = open("temp.txt", "w")
        f.write("%s\n" % " ".join([x.encode("utf-8") for x in hyp1]))
        f.close()
        os.system("~/Course/AMMML/project/FeatureAugmentedRNNToolkit/rnnlm -rnnlm ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/model -test temp.txt -features-matrix ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/feature.txt -independent > temp_out.txt")
        prob1 = getProb("temp_out.txt")
        f = open("temp.txt", "w")
        f.write("%s\n" % " ".join([x.encode("utf-8") for x in hyp2]))
        f.close()
        os.system("~/Course/AMMML/project/FeatureAugmentedRNNToolkit/rnnlm -rnnlm ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/model -test temp.txt -features-matrix ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/feature.txt -independent > temp_out.txt")
        prob2 = getProb("temp_out.txt")
        # probs.append([prob1, prob2])
        fout.write("%f\t%f\n" % (prob1, prob2))
        fout2.write("%f\t%f\n" % (prob1 / float(len(hyp1)), prob2 / float(len(hyp2))))
    fout.close()
    fout2.close()
def formatting_features(obj):
    question = obj['question_text'].strip()
    topics = [t['name'] for t in obj['topics']]
    tokens = [w for w in wordpunct_tokenize(question)
              if not re.match(r'[\'\"\.\?\!\,\/\\\(\)\`]', w)]
    punct = [p for p in wordpunct_tokenize(question)
             if re.match(r'[\'\"\.\?\!\,\/\\\(\)\`]', p)]
    top_toks = set([w.lower() for t in obj['topics'] for w in wordpunct_tokenize(t['name'])])
    qn_toks = set(tokens)
    # qn_topic_words = len(top_toks & qn_toks)
    qn_mark = 1 if "?" in question else -1
    start_cap = 1 if re.match(r'^[A-Z]', question) else -1
    if tokens:
        qn_type = [sum(1.0 for w in tokens if w in qws) for qws in qn_type_words]
        nm_pres = sum(1.0 for w in tokens if w.lower() in names and re.match(r'^[A-Z]', w))
        pl_pres = sum(1.0 for w in tokens if w.lower() in places and re.match(r'^[A-Z]', w))
    else:
        qn_type = [0.0] * len(qn_type_words)
        nm_pres = -1.0
        pl_pres = -1.0
    # qn_somewhere = 1 if sum(qn_type) and (re.match(r'\?$', question)
    #                                       or re.match(r'\?\s*[A-Z]', question)) else -1
    total_words = len(tokens)
    dict_words = sum(1 for w in tokens if w.lower() in eng_words)
    correct_form_count = sum(1.0 for w in tokens
                             if (w.lower() in eng_words and not re.match(r'^[A-Z]+$', w))
                             or re.match(r'^[A-Z]', w))
    question_form = 1 if '?' in punct and sum(1 for w in tokens if w in qn_words) else -1
    correct_form_ratio = correct_form_count / float(total_words + 1)
    # topic_word_ratio = qn_topic_words / float(total_words + 1)
    name_ratio = (nm_pres + pl_pres) / float(total_words + 1)
    punctuation_ratio = len(punct) / float(total_words + 1)
    result = [
        # 1 if nm_pres else 0,
        nm_pres,
        # 1 if pl_pres else 0,
        pl_pres,
        qn_mark,
        start_cap,
        # qn_somewhere,
        correct_form_ratio,
        # len(punct),
        punctuation_ratio,
        math.log(len(topics) + 1),
        # len(topics),
        name_ratio,
        # topic_word_ratio,
        dict_words,
        # qn_topic_words,
        # correct_form_count,
        # math.log(total_words + 1),
        total_words,
    ] + qn_type
    return result
def text_to_sentences(self, text, tokenizer, remove_stopwords=False):
    print "text_to_sentence"
    # from nltk.tokenize import wordpunct_tokenize
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words
    # text = text.decode("utf8")
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    # raw_sentences = tokenizer.tokenize(text.strip())
    raw_sentences = sent_tokenize(text.strip())
    print "finish tokenize sentence", len(raw_sentences)
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # print "sentence:", raw_sentence
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            # sentences.append(text_to_wordlist(raw_sentence, remove_stopwords))
            # print removePunctuation(raw_sentence).lower().split()
            print raw_sentence
            sentences.append(wordpunct_tokenize(raw_sentence))  # raw_sentence.split()
            print wordpunct_tokenize(raw_sentence)
            # print text_to_wordlist(raw_sentence, remove_stopwords)
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
def check_len_stats(std_dev):
    fraction = 0
    for i in range(1, 5):
        fraction += 0.25
        count1 = 0
        count2 = 0
        mcount = 0
        ncount = 0
        threshold = fraction * std_dev
        print threshold
        with open(infile, 'r') as f:
            for line in f:
                mem_len = 0
                nonmem_len = 0
                if (line.strip().split('\t')[1] == 'M'):
                    mem_len += len(wordpunct_tokenize(line.strip().split('\t')[0]))
                    mcount += 1
                    if (float(mem_len) < threshold):
                        count1 += 1
                else:
                    nonmem_len += len(wordpunct_tokenize(line.strip().split('\t')[0]))
                    ncount += 1
                    if (float(nonmem_len) < threshold):
                        count2 += 1
        f.close()
        print "iteration-", i
        print "memorable quotes below threshold-", count1
        print "total memorable quotes-", mcount
        print "non-memorable quotes below threshold-", count2
        print "non memorable quotes-", ncount
def dice_sentence(sentence1, sentence2):
    """
    Determines the Dice value of two sentences
    :param sentence1:
    :param sentence2:
    :return: dice value
    """
    return dice(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2))
def common_words(sent1, sent2):
    # remove stop words, lemmatise and return count of common words
    porter = PorterStemmer()
    # stop = stopwords.words('english')
    s1_words = [porter.stem(i.lower()) for i in wordpunct_tokenize(sent1)]
    s2_words = [porter.stem(i.lower()) for i in wordpunct_tokenize(sent2)]
    s1 = set(s1_words)
    s2 = set(s2_words)
    return len(s1.intersection(s2)) / ((len(s1) + 0.1 + len(s2)) / 2.0)  # normalised
def jaccard_sentence(sentence1, sentence2):
    """
    Determines jaccard value of two sentences
    :param sentence1:
    :param sentence2:
    :return: jaccard value
    """
    return jaccard(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2))
def load_memes(self, filenames):
    for filename in filenames:
        f = open(filename, 'r')
        contents = f.readlines()
        for entry in contents:
            fields = [s.strip() for s in entry.split("|")]
            meme_type = fields[0]
            top_text = wordpunct_tokenize(fields[1].lower())
            bottom_text = wordpunct_tokenize(fields[2].lower())
            self.memes[meme_type].append((top_text, bottom_text))
def generate_vocabulary(self, review_summary_file):
    self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values
    for review, summary in self.rev_sum_pair:
        rev_lst = wordpunct_tokenize(review)
        sum_lst = wordpunct_tokenize(summary)
        self.__add_list_to_dict(rev_lst)
        self.__add_list_to_dict(sum_lst)
    # Now store the "" empty string as the last word of the vocabulary
    self.map[""] = len(self.map)
    self.revmap[len(self.map)] = ""
def med_sentence(sentence1, sentence2, c1=1, c2=1, c3=1):
    """
    Determines minimum edit distance of two sentences.
    :param sentence1: first sentence
    :param sentence2: second sentence
    :param c1: optional weight
    :param c2: optional weight
    :param c3: optional weight
    :return: integer, minimum edit distance
    """
    return med(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2), c1, c2, c3)
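# A minimal usage sketch for the three sentence-similarity wrappers above
# (dice_sentence, jaccard_sentence, med_sentence). It assumes the underlying
# dice/jaccard/med helpers are importable from the same module; the sentences
# are illustrative only.
if __name__ == "__main__":
    a = "The cat sat on the mat."
    b = "A cat is sitting on the mat."
    print(dice_sentence(a, b))     # token-overlap Dice coefficient
    print(jaccard_sentence(a, b))  # token-overlap Jaccard coefficient
    print(med_sentence(a, b))      # token-level minimum edit distance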
def features_from_dump(infile, variant, embeddings, bowfilter):
    frame = read_dump(infile)
    refstatements = [wordpunct_tokenize(st) for st in list(frame.Ref)]
    targetstatements = [wordpunct_tokenize(st) for st in list(frame.Target)]
    featuredicts = []
    for i in range(len(refstatements)):
        sp = StatementPair(i, refstatements[i], targetstatements[i], 0)
        commonwords, onlyref, onlytarget = sp._word_venn_diagram()
        trainingbow.update(onlyref)
        featuredicts.append(sp.featurize(variant, embeddings, bowfilter))
    return featuredicts
def main():
    # related_words = {
    #     'art': ['art', 'arts', 'op art', 'pop art', 'art deco', 'art form', 'art house', 'art-house', 'clip art', 'fine art', 'art gallery', 'art nouveau', 'art therapy', 'kinetic art', 'martial art', 'art director', 'conceptual art', "objet d'art", 'performance art', 'work of art', 'state-of-the-art', 'the black art', 'thou art', 'noble art', 'craft', 'craftsmanship', 'ingenuity', 'mastery', 'artistry', 'imagination', 'Biedermeier', 'Parian', 'Queen Anne', 'annulate', 'anomphalous', 'banded', 'chryselephantine', 'aperture', 'collared', 'artificial', 'condensed', 'camera', 'copied'],
    #     'sport': ['athletics', 'recreation', 'candidacy', 'championship', 'clash', 'contention', 'event', 'fight', 'game', 'match', 'race', 'rivalry', 'run', 'sport', 'sports', 'struggle', 'tournament', 'trial', 'basketball', 'football', 'soccer', 'badminton', 'archery', 'tennis', 'swim']
    # }
    result = dict()
    clubs = list(Club.objects.all())
    print len(clubs)
    for club in clubs:
        score = 0
        # try:
        if club.introduction:
            intro = club.introduction
        else:
            intro = ""
        name = club.name
        max_score = 0
        max_cat = None
        for category in CATEGORIES:
            all_words = wordpunct_tokenize(intro.lower())
            all_name_words = wordpunct_tokenize(name.lower())
            score = 0
            for word in determinstic_words[category]:
                score += all_words.count(word) * 2
                score += all_name_words.count(word) * 10
            if score > max_score:
                max_cat = category
                max_score = score
        if max_cat and max_score > 2:
            category = Category.objects.get(name=max_cat)
            club.categories.add(category)
            club.save()
            try:
                # print name, max_cat, max_score
                result[max_cat].append(name)
            except KeyError:
                result[max_cat] = [name]
    for category in CATEGORIES:
        print category
        try:
            for club in result[category]:
                print club
        except:
            pass
        print "\n"
def hypernym_count(sent1, sent2):
    s1_words = [i.lower() for i in wordpunct_tokenize(sent1)]
    s2_words = [i.lower() for i in wordpunct_tokenize(sent2)]
    s1_all = []
    s2_all = []
    for w in s1_words:
        s1_all.extend(get_hypernyms(w))
    for w in s2_words:
        s2_all.extend(get_hypernyms(w))
    w1_hypernym = len(set(s1_words).intersection(set(s2_all)))
    w2_hypernym = len(set(s2_words).intersection(set(s1_all)))
    return w1_hypernym - w2_hypernym
def frequencies(sentence_texts, stopword=False):
    # lower case
    out = sentence_texts.lower()
    # remove punctuation
    out = out.translate(string.maketrans("", ""), string.punctuation)
    # tokenize
    out = wordpunct_tokenize(out)
    # build dictionary of key=word, value=number of occurrences
    frequencies = {}
    for word in out:
        if word not in frequencies:
            # if word is a stopword and stopword is on, do not add
            if not (stopword == True and word in stopwords.words('english')):
                frequencies[word] = 1
        else:
            frequencies[word] += 1
    # sort frequencies
    sorted_frequencies = sorted(frequencies.iteritems(), key=operator.itemgetter(1), reverse=True)
    # output largest frequency first
    return sorted_frequencies
def perplexity(f_cost, lines, worddict, options, verbose=False, wv_embs=None):
    n_lines = len(lines)
    cost = 0.
    n_words = 0.
    for i, line in enumerate(lines):
        # get array from line
        wordin = wordpunct_tokenize(line.strip())
        seq = [worddict[w] if w in worddict else 1 for w in wordin]
        seq = [s if s < options['n_words'] else 1 for s in seq]
        n_words += len(seq) + 1
        x = numpy.array(seq + [0]).astype('int64').reshape([len(seq) + 1, 1])
        x_mask = numpy.ones((len(seq) + 1, 1)).astype('float32')
        if options['use_preemb']:
            shp = x.shape
            xi = wv_embs[x.flatten()].reshape([shp[0], shp[1], wv_embs.shape[1]])
        else:
            xi = x
        cost_one = f_cost(x, x_mask, xi, x_mask) * (len(seq) + 1)
        cost += cost_one
        if verbose:
            print 'Sentence ', i, '/', n_lines, ' (', seq.mean(), '):', 2 ** (cost_one / len(seq) / numpy.log(2)), ', ', cost_one / len(seq)
    cost = cost / n_words
    return cost
def calcFreq(self, cb, i):
    wordFreq = dict()
    path = "/home/mis/file_" + str(i) + ".txt"
    conteudo = self.reader.readerFile(path)
    print(":: Removing punctuation -> %s" % time.strftime("%d/%m/%Y %H:%M:%S"))
    # maketrans needs from/to strings of equal length, so map every punctuation char to a space
    make = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    conteudoLimpo = conteudo.translate(make)
    del conteudo
    print(":: Removing duplicated spaces -> %s" % time.strftime("%d/%m/%Y %H:%M:%S"))
    conteudo = conteudoLimpo.strip()
    conteudoLimpo = re.sub(' +', ' ', conteudoLimpo)
    print(":: Tokenizing -> %s" % time.strftime("%d/%m/%Y %H:%M:%S"))
    palavras = tokenize.wordpunct_tokenize(conteudoLimpo)
    print(":: Frequencies -> %s" % time.strftime("%d/%m/%Y %H:%M:%S"))
    frequencias = nltk.FreqDist(palavras)
    print(":: Building wordFreq -> %s" % time.strftime("%d/%m/%Y %H:%M:%S"))
    for key in frequencias.keys():
        wordFreq[key.strip().lower()] = frequencias[key]
    self.palavras = self.palavras + palavras
    del conteudo
    del palavras
    del frequencias
    return wordFreq
def split_sentence_from_document(document):
    max_counts = 0
    for sent in tokenize.sent_tokenize(document):
        max_counts = max(max_counts, len(tokenize.wordpunct_tokenize(sent)))
    # if max_counts > 4000:
    #     print(document)
    return max_counts
def tokenize(directory, exclude_files):
    full_content = ''
    for _file in os.listdir(directory):
        # disp_count = 5
        if exclude_files and (_file in exclude_files):
            continue
        with open(directory + _file, 'r') as f:
            contents = f.readlines()
            for item in contents:
                try:
                    sentence = item.split('\t')[1].strip()
                    full_content += sentence
                except IndexError:
                    continue
                # if np.random.binomial(1, 0.1):
                #     print sentence
                #     time.sleep(2)
                #     disp_count -= 1
                #     if not disp_count:
                #         print '*' * 100
                #         break
                # else:
                #     print '#'
    return wordpunct_tokenize(full_content.lower())
def preprocess(line, is_lmz=False):
    line = wordpunct_tokenize(line.strip())
    if is_lmz:
        lemmatizer = WordNetLemmatizer()
        line = [lemmatizer.lemmatize(word) for word in line]
    return line
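# Quick illustration of what preprocess() returns: wordpunct_tokenize splits on
# the regex \w+|[^\w\s]+, so contractions and punctuation come out as separate
# tokens. Expected output shown as a comment; with is_lmz=True the WordNet
# lemmatizer would additionally map plural nouns such as "cats" to "cat".
# preprocess("Don't panic, cats!")
# -> ['Don', "'", 't', 'panic', ',', 'cats', '!']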
def __init__(self, project_dict):
    self.pid = project_dict['id']
    self.blurb = project_dict['blurb'].lower()
    self.deadline = project_dict['deadline']
    self.category_id = project_dict['category']['id']
    self.category_desc = re.sub('/.*', '', project_dict['category']['slug'])
    self.reward_backer_tup = project_dict['reward_backer_tup']
    self.text = project_dict['full_description'].lower() + " " + project_dict['risk'].lower()
    self.tokens = np.array(wordpunct_tokenize(self.text))
    self.name = project_dict['name']
    self.url = project_dict['url']
    self.launched_at = project_dict['launched_at']
    self.pledged = project_dict['pledged']
    self.title = project_dict['title']
    self.no_dollars_raised = project_dict['no_dollars_raised']
    self.currency = project_dict['currency']
    self.no_backers = project_dict['no_backers']
    self.state = project_dict['state']
    self.deadline = project_dict['deadline']
    self.location = project_dict['location']
    self.backers_count = project_dict['backers_count']
    self.creator_url = project_dict['creator_url']
    self.backers_count = project_dict['backers_count']
    self.spotlight = project_dict['spotlight']
    self.goal = project_dict['goal']
    self.author = project_dict['author']
def json_converter_ifn_body(file_path):
    """
    Raw into json converter for IFN
    Read and construct tokens list from test file.
    ARGS:
        file_path(file which you want to classify)
    """
    # read the query file
    line_flag = False
    motif_flag = False
    motif_stack = []
    line_stack = []
    with codecs.open(file_path, 'r', 'utf-8') as lines:
        for line in lines:
            if line == u'\n':
                continue
            if line == u'#motif\n':
                motif_flag = True
                continue
            elif line == u'#text\n':
                motif_flag = False
                line_flag = True
                continue
            if motif_flag == True and line_flag == False:
                motif_stack.append(line.strip())
            if line_flag == True and motif_flag == False:
                line_stack.append(line.strip())
    tokens_stack = [tokenize.wordpunct_tokenize(line) for line in line_stack]
    tokens_stack = [[t.lower() for t in l] for l in tokens_stack]
    # stopwords are not removed here
    # if eliminate_stop == True:
    #     tokens_stack = [[t for t in l if t not in stopwords and t not in symbols] for l in tokens_stack]
    # flatten the list from 2-D to 1-D, lemmatizing along the way
    # tokens_stack = [lemmatizer.lemmatize(t) for line in tokens_stack for t in line]
    return tokens_stack, motif_stack
def pick_top(number, sortedLst, ratio):
    unigrams = []
    bigramsplus = []
    for element in sortedLst:
        tokens = wordpunct_tokenize(element[0])
        if len(tokens) == 1:
            unigrams.append(element)
        else:
            bigramsplus.append(element)
    # will be a list of the top *number* strings
    topList = []
    unigramIndex = 0
    bigramIndex = 0
    while len(topList) < number:
        if unigramIndex == len(unigrams):
            if bigramIndex == len(bigramsplus):
                break
            else:
                topList.append(bigramsplus[bigramIndex][0])
                bigramIndex += 1
        elif bigramIndex == len(bigramsplus):
            topList.append(unigrams[unigramIndex][0])
            unigramIndex += 1
        else:
            if unigrams[unigramIndex][1] * ratio < bigramsplus[bigramIndex][1]:
                topList.append(bigramsplus[bigramIndex][0])
                bigramIndex += 1
            else:
                topList.append(unigrams[unigramIndex][0])
                unigramIndex += 1
    return topList
def tokenStem(words):
    words = words.strip('[').strip(']').lower()  # remove brackets and lowercase
    words = re.sub('[(){}<>:,.!?\'"]', '', words)
    stemmer = PorterStemmer()
    stops = stopwords.words('english')
    output = [stemmer.stem(token) for token in wordpunct_tokenize(words)
              if token not in stops]  # stem words
    return " ".join(output)  # merge into strings
def tokenizeNoPunctuation(tweets):
    tokens = []
    stoplist = [',', '(', ')', '.', '?', '/', '+', ':', ';']
    for tweet in tweets:
        tokenized = wordpunct_tokenize(tweet)
        tokens.append([token for token in tokenized if token not in stoplist])
    return tokens
def best_dressed(year):
    if year not in yearMap.keys():
        prep_year(year)
    strings = yearMap[year]['strings']
    dressPattern = re.compile(r'(dress)|(red carpet)|(redcarpet)', re.IGNORECASE)
    posPattern = re.compile(r'(best)|(beautiful)|(stun)|(love)', re.IGNORECASE)
    negPattern = re.compile(r'(worst)|(bad)|(ugly)|(hate)', re.IGNORECASE)
    namePattern = re.compile(r'[A-Z]\w* [A-Z]\w*')
    stoplist = ['new', 'red', 'carpet', 'redcarpet', 'globes', 'golden', 'best', 'worst', 'movie',
                'motion', 'picture', 'film', 'drama', 'comedy', 'musical', 'cecil', 'demille',
                'award', 'tv', 'performance', 'actress', 'actor', 'television', 'feature',
                'foreign', 'language', 'supporting', 'role', 'director', 'original', 'series']
    dress_mentions = Counter()
    dress_mentions_neg = Counter()
    dress_mentions_pos = Counter()
    for tweet in strings:
        if re.search(dressPattern, tweet):
            matches = re.findall(namePattern, tweet)
            matches = (w.lower() for w in matches)
            for match in matches:
                match_words = wordpunct_tokenize(match)
                if match_words[0] not in stoplist and match_words[1] not in stoplist:
                    dress_mentions[match] += 1
                    if re.search(posPattern, tweet):
                        dress_mentions_pos[match] += 1
                    if re.search(negPattern, tweet):
                        dress_mentions_neg[match] += 1
    discussed_dress = dress_mentions.most_common(1)
    best_dress = dress_mentions_pos.most_common(1)
    worst_dress = dress_mentions_neg.most_common(1)
    return best_dress[0][0], worst_dress[0][0], discussed_dress[0][0]
def test_small_talk_filter(_bot_brain):
    bot, pos, lex = _bot_brain
    tester = wordpunct_tokenize("raining snowing sunny weather")
    weather_opts = ["Talking about the weather is such a bore.",
                    "I'm not the weatherman!"]
    sentence = input_filters.filter_small_talk(tester)
    assert sentence in weather_opts
def test_filter_length_words():
    tester = wordpunct_tokenize("I am not happy but I am not hungry either.")
    seeds = input_filters.filter_length_words(tester)
    assert "I" not in seeds
    assert "hungry" in seeds
    assert "happy" in seeds
    assert "am" not in seeds
def PredictReviewScore(self, sentences, label=0):
    """
    This method gives a score to a review.
    """
    AdjR = 0.0
    # if text.startswith("For more photos and reviews do check out fourleggedfoodies"):
    #     x = 1
    adjAll = []
    for sentence in sentences:
        adjectives, dependencies = self.ExtractSentDetails(sentence)
        adjAll.extend(adjectives)
        allAdjectives = adjectives | Angel.GlobalAdjList
        AdjS = 0.0
        words = wordpunct_tokenize(sentence["Text"])
        if len(words) <= 3:
            allAdjectives |= set([x.lower() for x in words])
        for i in range(len(words)):
            word = words[i].lower()
            if word in {"but", "if"}:
                AdjS = 0.0
            elif word in allAdjectives and word in self.lexicon:
                AdjS += float(self.lexicon[word]) * self.PredictMultiplier(word, dependencies[word], words, i)
        AdjR += AdjS
    AdjR *= self.PredictBase(adjAll)
    finalScore = AdjR
    if self.DumpRequested(finalScore, label):
        self.DumpDetails(sentences, label)
    return finalScore
def Match(self, text):
    # tokenize and normalize our text
    textArr = tokenize.wordpunct_tokenize(text.lower().strip())
    hits = 0
    results = []
    secondary = []
    # -tlength as we need to iterate over a window-size of words
    for ti in xrange(0, len(textArr) - self.tlength):
        for termT in self.toMatch:
            # so what's the distance between our first token?
            dist1 = editdist.distance(textArr[ti], termT[hits])
            if dist1 <= self.thresh:
                if len(termT) <= 1:
                    print "got hit with %s" % termT
                    results.append(termT[hits])
                else:
                    dist2 = editdist.distance(textArr[ti + 1], termT[hits + 1])
                    print "distance between %s and %s is %s" % (textArr[ti + 1], termT[hits + 1], dist2)
                    # WARNING: this will only work for 2-grams where the tlength is an n-gram.
                    if dist2 <= self.thresh:
                        # we have a close hit; the second term in the tuple matches as well
                        # hits = hits + 1
                        results.append("%s %s" % (termT[hits], termT[hits + 1]))
                        # print termT
                        # print "got hit on term %s" % results
                        # looks like we've found a match
    # print secondary
    # we're done
    return results
def summarize(self):
    self.sentences = sent_tokenize(self.text)
    self.tokenizedSentences = []
    for sentence in self.sentences:
        self.tokenizedSentences.append(
            Counter([word for word in wordpunct_tokenize(sentence) if word not in self.Puncts]))
    self.b_matrix = self.CV.fit_transform(self.sentences)
    self.n_matrix = TfidfTransformer().fit_transform(self.b_matrix)
    self.sim_graph = self.n_matrix * self.n_matrix.T
    self.sen_graph = nx.from_scipy_sparse_matrix(self.sim_graph)
    self.sen_scores = nx.pagerank(self.sen_graph)
    self.sorted_sentences_1 = sorted(self.sentences,
                                     key=lambda s: self.sen_scores[self.sentences.index(s)],
                                     reverse=True)
    self.sorted_sentences_2 = sorted(self.sorted_sentences_1[:5],
                                     key=lambda s: self.sentences.index(s))
def featurize(self, input_str):
    input_str = gensim.utils.to_utf8(input_str, errors='replace').decode("utf8")
    doc = wordpunct_tokenize(input_str)
    doc = [w.lower() for w in doc]
    # Convert from tokens to word ids from the model dictionary.
    doc_bow = self.dict.doc2bow(doc)
    # Simply add up all the vectors and return.
    vec = self.model[doc_bow]
    col = []
    data = []
    for topicNum, value in vec:
        data.append(value)
        col.append(topicNum)
    row = [0 for _ in range(len(data))]
    vec = coo_matrix((data, (row, col)), shape=(1, self.model.num_topics)).toarray()
    return vec
def get_msg_words(msg, stopwords=[], strip_html=False):
    """get msg words"""
    msg = re.sub('3D', '', msg)
    if strip_html:
        msg = re.sub('<(.|\n)*?>', ' ', msg)
        msg = re.sub('&\w+;', ' ', msg)
    msg = re.sub('_+', '_', msg)
    msg_words = set(wordpunct_tokenize(msg.replace('=\n', '').lower()))
    # Get rid of stopwords
    msg_words = msg_words.difference(stopwords)
    # Get rid of punctuation tokens, numbers, and single letters.
    msg_words = [w for w in msg_words if re.search('[a-zA-Z]', w) and len(w) > 1]
    return msg_words
def main():
    input_text = ('We will discuss briefly about the basic syntax, '
                  'structure and design philosophies. '
                  'There is a defined hierarchical syntax for Python code which you should remember '
                  'when writing code! Python is a really powerful programming language!')
    # synsets = wn.synsets('phone')
    # print [str(syns.definition()) for syns in synsets]
    synsets = wn.synsets('philosophies')
    for syns in synsets:
        print 'philosophies', '==>', syns.definition()
    synsets = wn.synsets('Python')
    for syns in synsets:
        print 'Python', '==>', syns.definition()
    op = word_tokenize(input_text)
    print '\nTokenize output', op
    print '\nStemming output'
    for e in op:
        if len(e) > 1:
            porter_stemmer = PorterStemmer()
            print porter_stemmer.stem(e)
    print '\nPOS', pos_tag(op)
    print('\nLemmatize output')
    lm = WordNetLemmatizer()
    for e in op:
        if len(e) > 1:
            print lm.lemmatize(e)
    print '\nTrigram output'
    trigrams = ngrams(op, 3)
    for grams in trigrams:
        print grams
    print '\nNamed Entity Recognition'
    print ne_chunk(pos_tag(wordpunct_tokenize(input_text)))
def simple_neg_pos_wc(df, column_name, sum=False):
    neg_words = pd.read_csv('negative-words.txt', skiprows=36, header=None, encoding='ISO-8859-1')
    pos_words = pd.read_csv('positive-words.txt', skiprows=36, header=None, encoding='ISO-8859-1')
    # these are nonsense words that don't belong in the wordcloud ('a', 'the' etc.)
    stop_words = set(stopwords.words('english'))
    # this stemmer will clip the end of words so that begins and begin etc. look the same
    stemmer = SnowballStemmer("english")
    # add some characters to the stopwords
    stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
    # for every entry in the column, get the list of words and stem them. remove stop words
    word_list = [stemmer.stem(i.lower())
                 for i in wordpunct_tokenize(" ".join(df[column_name].dropna()))
                 if i.lower() not in stop_words]
    # correct any spelling mistakes introduced by the stemmer
    word_list = [spell(i) for i in word_list]
    pos_count = 0
    neg_count = 0
    for word in word_list:
        if (neg_words.loc[:, 0] == word).sum() > 0:
            neg_count -= 1
        if (pos_words.loc[:, 0] == word).sum() > 0:
            pos_count += 1
    if sum:
        return neg_count + pos_count
    else:
        return neg_count, pos_count, len(word_list)
def clean_text_simple(text, my_stopwords, punct, remove_stopwords=True, pos_filtering=True, stemming=True):
    text = text.lower()
    text = ''.join(l for l in text if l not in punct)  # remove punctuation (preserving intra-word dashes)
    text = re.sub(' +', ' ', text)  # strip extra white space
    text = text.strip()  # strip leading and trailing white space
    # tokenize (split based on whitespace)
    ### fill the gap (store results as 'tokens') ###
    tokens = wordpunct_tokenize(text)
    if pos_filtering == True:
        # POS tag and retain only nouns and adjectives
        tagged_tokens = pos_tag(tokens)
        tokens_keep = []
        for item in tagged_tokens:
            if (item[1] == 'NN' or item[1] == 'NNS' or item[1] == 'NNP' or item[1] == 'NNPS'
                    or item[1] == 'JJ' or item[1] == 'JJS' or item[1] == 'JJR'):
                tokens_keep.append(item[0])
        tokens = tokens_keep
    if remove_stopwords:
        # remove stopwords from 'tokens'
        ### fill the gap ###
        filtered_list = [w for w in tokens if not w in my_stopwords]
        tokens = filtered_list
    if stemming:
        # apply Porter's stemmer
        stemmer = PorterStemmer()
        tokens_stemmed = list()
        for token in tokens:
            tokens_stemmed.append(stemmer.stem(token))
        tokens = tokens_stemmed
    return (tokens)
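# A small usage sketch for clean_text_simple, assuming the caller passes NLTK's
# English stopword list and string.punctuation minus the dash (per the
# "preserving intra-word dashes" comment). The exact output depends on the POS
# tagger and stemmer versions, so none is asserted here.
import string
from nltk.corpus import stopwords

toks = clean_text_simple("Graph-based keyword extraction works well!",
                         my_stopwords=stopwords.words('english'),
                         punct=string.punctuation.replace('-', ''))
print(toks)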
def get_msg_words(msg, stopwords=[], strip_html=False):
    '''
    Returns the set of unique words contained in an e-mail message. Excludes
    any that are in an optionally-provided list.

    NLTK's 'wordpunct' tokenizer is used, and this will break contractions.
    For example, don't -> (don, ', t). Therefore, it's advisable to supply
    a stopwords list that includes contraction parts, like 'don' and 't'.
    '''
    # Strip out weird '3D' artefacts.
    msg = re.sub('3D', '', msg)

    # Strip out html tags and attributes and html character codes,
    # like &nbsp; and &lt;.
    if strip_html:
        msg = re.sub('<(.|\n)*?>', ' ', msg)
        msg = re.sub('&\w+;', ' ', msg)

    # wordpunct_tokenize doesn't split on underscores. We don't
    # want to strip them, since the token first_name may be informative
    # moreso than 'first' and 'name' apart. But there are tokens with long
    # underscore strings (e.g. 'name_________'). We'll just replace the
    # multiple underscores with a single one, since 'name_____' is probably
    # not distinct from 'name___' or 'name_' in identifying spam.
    msg = re.sub('_+', '_', msg)

    # Note, remove '=' symbols before tokenizing, since these sometimes
    # occur within words to indicate, e.g., line-wrapping.
    msg_words = set(wordpunct_tokenize(msg.replace('=\n', '').lower()))

    # Get rid of stopwords
    msg_words = msg_words.difference(stopwords)

    # Get rid of punctuation tokens, numbers, and single letters.
    msg_words = [w for w in msg_words if re.search('[a-zA-Z]', w) and len(w) > 1]

    return msg_words
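# Hypothetical call of get_msg_words on a short message; the stopwords list is
# illustrative only (note the contraction parts 'don' and 't', as the docstring advises).
words = get_msg_words("Don't miss this =\nFREE offer <b>now</b>!",
                      stopwords=['don', 't', 'this'],
                      strip_html=True)
# -> a list such as ['miss', 'free', 'offer', 'now'] (order not guaranteed, since a set is used internally)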
def detect_num(sentence):
    new_sentence = ''
    tokens = wordpunct_tokenize(sentence)
    ordinal_num = ['st', 'nd', 'rd', 'th']
    for i in tokens:
        if i.isalpha() == False:
            if i[-2:] in ordinal_num:
                new_sentence = new_sentence + num2words(int(i[:-2]), to='ordinal').replace('-', ' ') + ' '
                # print('ordinal num\n', new_sentence)
            elif len(i) == 4:
                if '0' == i[1] and i[2] != '0':
                    new_sentence = new_sentence + num2words(int(i)).replace(' and ', ' ') + ' '
                else:
                    new_sentence = new_sentence + num2words(int(i), to='year').replace('-', ' ') + ' '
            elif 's' in i:
                new_sentence = new_sentence + num2words(int(i[:-1]), to='year').replace('-', ' ') + ' '
                # print('year\n', new_sentence)
            elif i.isdigit():
                new_sentence = new_sentence + num2words(int(i)).replace('-', ' ') + ' '
            else:
                word = ''
                for char in range(0, len(i)):
                    if i[char].isalpha() or i[char] == ':' or i[char] == '[' or i[char] == ']':
                        word = word + i[char]
                    else:
                        word = word + num2words(int(i[char]))
                new_sentence = new_sentence + word + ' '
                # print('default\n', new_sentence)
        else:
            new_sentence = new_sentence + i + ' '
    new_sentence = new_sentence.strip(' ')
    return new_sentence
def create_naive_bayes_model_pickle(self):
    feature_set = []
    print "Feature set creating. !"
    for i in range(1, 4, 1):
        file_name = "data_set" + str(i) + ".txt"
        file = open(self.data_set_dir + file_name, 'r')
        j = 0
        for line in file:
            print "txt%s%s" % (i, j)
            j += 1
            words = wordpunct_tokenize(line)
            if words[len(words) - 1] == "1":
                label = "pos"
                reverse_label = "neg"
            elif words[len(words) - 1] == "0":
                label = "neg"
                reverse_label = "pos"
            else:
                label = "neu"
                reverse_label = "neu"
            del words[len(words) - 1]
            words = self.clean_words(words)
            feature_set.append((self.create_feature_set(words)[0], label))
            feature_set.append((self.create_feature_set(words)[1], reverse_label))
    print "Feature set created. !"
    random.shuffle(feature_set)
    feature_set += self.twitter_data_training()
    # feature_set += self.moviereview_data_training()
    training_set = feature_set[:14500]
    testing_set = feature_set[14500:]
    # NOTE: the held-out split is overwritten here, so accuracy is measured on the training set
    testing_set = training_set
    print "Training..."
    classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
    file = open(self.naive_base_model, 'wb')
    pickle.dump(classifier, file)
    file.close()
    print "Accuracy:" + str(nltk.classify.accuracy(classifier, testing_set))
def analyze_composite(self, sentence, weightedLexicon):
    if weightedLexicon:
        lexicon = self.weightedLexicon
    else:
        lexicon = self.unweightedLexicon
    tweetScore = 0
    words = wordpunct_tokenize(sentence)
    for index, word in enumerate(words):
        term = word
        if len(words[index:]) < 6:
            maxim = index + len(words[index:]) - 1
        else:
            maxim = index + 5
        for i in range(index + 1, maxim + 1):
            term = term + " " + words[i]
            # print term + ":" + str(self.lookUpWordScore(term, lexicon))
            tweetScore = tweetScore + self.lookUpWordScore(term, lexicon, False)
    return tweetScore
def neg_pos_inv(df, column_name):
    neg_words = pd.read_csv('negative-words.txt', skiprows=36, header=None, encoding='ISO-8859-1')
    pos_words = pd.read_csv('positive-words.txt', skiprows=36, header=None, encoding='ISO-8859-1')
    inv_words = ['not', 'lack of', 'only', 'can\'t', 'no', 'more']
    # stop_words = set(stopwords.words('english'))  # these are nonsense words
    stop_words = (['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', 'enhanced'])
    new_df = df[column_name].dropna().reset_index()
    new_df['sentiment'] = 0
    negation = False
    prev = None
    pprev = None
    for i in range(len(new_df)):
        word_list = [w.lower()
                     for w in wordpunct_tokenize("".join(new_df['prison_service_facilities_other_thoughts'][i]))
                     if w.lower() not in stop_words]
        for word in word_list:
            if ((pos_words.loc[:, 0] == word).sum() > 0) and (prev not in inv_words):
                new_df['sentiment'][i] += 1
            if ((pos_words.loc[:, 0] == word).sum() > 0) and (prev in inv_words or pprev in inv_words):
                new_df['sentiment'][i] -= 1
            if (neg_words.loc[:, 0] == word).sum() > 0:
                new_df['sentiment'][i] -= 1
            # track the two preceding words so inversion terms can flip the next word's polarity
            pprev = prev
            prev = word
    return new_df['sentiment'].sum()
def featurize(self, input_str, num_best=None, use_reverse_index=True):
    """
    Returns similar documents by cosine similarity based on TF-IDF score.

    If num_best is left as None, returns a numpy.array with a score for every
    document in the corpus. Otherwise, it returns the top-K scored items as a
    list of (doc_idx, score) tuples.

    If use_reverse_index is set to False, the forward index is used (and the
    full corpus is queried). This is only a good idea when the number of terms
    in the input string is big, such as the text of a long article. For short
    documents, using the reverse index is usually much faster.
    """
    logger.debug("input string: %s", input_str)

    # Tokenize the input string.
    input_str = utils.to_utf8(input_str, errors='replace').decode("utf8")
    doc = wordpunct_tokenize(input_str)
    doc = [w.lower() for w in doc]

    # Convert from tokens to word ids from the model dictionary.
    doc_bow = self.dictionary.doc2bow(doc)

    # Get TF-IDF score for the document words (this does not update the TF-IDF model itself).
    doc_tfidf = self.tfidf[doc_bow]

    # Calculate similarity scores.
    self.similarity_index.use_reverse_index = use_reverse_index
    similar_docs = self.similarity_index[doc_tfidf]

    # Fall back to self.num_best if it wasn't specified here.
    if num_best is None:
        num_best = self.num_best
    if num_best is None:
        return similar_docs

    # Return top-k if requested.
    return heapq.nlargest(num_best, enumerate(similar_docs), key=lambda item: item[1])
def extract_tag_n_grams_and_add_to_dict(data_dir):
    listing = os.listdir(data_dir)
    list = []
    for filename in listing:
        with open(data_dir + filename, 'r') as f:
            tag_list_for_line = []
            for line in f:
                if (':' in line):
                    line_list = wordpunct_tokenize(line)
                    tag = line_list[-1]
                    tag_list_for_line.append(tag)
                else:
                    if (len(tag_list_for_line) > 0):
                        add_to_dict_n_gram_tags(dict_unigram, tag_list_for_line)
                        bigrams = ngrams(tag_list_for_line, 2)
                        add_to_dict_n_gram_tags(dict_bigram, bigrams)
                        tag_list_for_line = []
    list.append(dict_unigram)
    list.append(dict_bigram)
    return list
def json_to_conll(corpus_json_location, output_location, by_sent=False):
    with codecs.open(corpus_json_location, encoding='utf-8') as in_file:
        reviews = map(json.loads, in_file.readlines())
    with codecs.open(output_location, 'w', encoding='utf-8') as out_file:
        for review in reviews:
            documents = sent_tokenize(review['text']) if by_sent else [review['text']]
            w_start = 0
            w_end = 0
            for document in documents:
                tokens = wordpunct_tokenize(document)
                corrected_tokens = map(correct, tokens)
                pos_tags = tagger.tag(corrected_tokens)
                for token, temp in zip(tokens, pos_tags):
                    token_corr = temp[0]
                    pos_tag = temp[1]
                    w_start, w_end, delimitter = get_token_position_in_text(token, w_start, review['text'])
                    bio_tag = get_bio_tag(w_start, w_end, review['entities'])
                    lemm = lemmatizer.lemmatize(token_corr, get_wordnet_pos(pos_tag))
                    out_file.write(u'{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        token, lemm, pos_tag, bio_tag, w_start, w_end, delimitter, review['id']))
                    w_start = w_end - 1
            out_file.write('\n')
def get_keywords(self, document):
    '''
    groups keywords which are separated by stop words and punctuation
    '''
    sentences = sent_tokenize(document)
    candidate_keywords = []
    for i, sentence in enumerate(sentences):
        curr_keyword = []
        tokens = wordpunct_tokenize(sentence)
        # normalize case and remove punctuation
        words = [w.lower() for w in tokens if w.isalnum()]
        for word in words:
            if word not in self.stopwords:
                curr_keyword.append(word)
            else:
                if curr_keyword != []:
                    candidate_keywords.append({
                        'keyword_list': curr_keyword,
                        'sentence_num': i,
                    })
                    curr_keyword = []
    return candidate_keywords
def normalize_doc(data):
    for i in range(1, len(data)):
        temp = []
        temp2 = wordpunct_tokenize(data[i])
        l = 0.0
        for word in temp2:
            if word not in temp:
                temp.append(word)
        for word in temp:
            if word in tf_data:
                l = l + pow(tf_data[word], 2)
        l = sqrt(l)
        for word in temp:
            if word in tf_data:
                tf_data[word] /= l
    # tf-idf score
    for key in tf_data.keys():
        tf_idf_data[key] = tf_data[key] * idf_data[key]
def get_instagram_caption_terms(hashtag):
    search_url = 'https://www.instagram.com/explore/tags/' + hashtag + '/?__a=1'
    contents = urllib2.urlopen(search_url).read()
    results = json.loads(contents)
    edges = results['graphql']['hashtag']['edge_hashtag_to_media']['edges']
    for edge in edges:
        captions = edge['node']['edge_media_to_caption']['edges']
        for caption in captions:
            text = caption['node']['text']
            words = [i.lower()
                     for i in wordpunct_tokenize(text.encode('ascii', 'ignore'))
                     if i.lower() not in stop_words]
            all_tokens = ' '.join(words)
            texts = words
            # remove words that appear only once
            tokens_once = set(words for words in set(all_tokens) if all_tokens.count(words) == 1)
            texts = [[words for words in texts if words not in tokens_once] for words in all_tokens]
    return texts
def pre_process_data(self):
    word_dict = dict()
    data_matrix = dict()
    word_index = 0
    pos_files = gl.glob('pos\*.txt')
    neg_files = gl.glob('neg\*.txt')
    self.pos_files = pos_files
    # pos_files = pos_files[0:100]  # 2000
    # neg_files = neg_files[0:100]  # 2000
    pos_files.extend(neg_files)
    self.all_files = pos_files
    for fl in pos_files:
        review_text = self.remove_stop_words(fl)
        review_tokenized = wordpunct_tokenize(review_text)
        for word in review_tokenized:
            if word not in word_dict:
                word_dict[word] = word_index
                word_index = word_index + 1
            if (word, fl) not in data_matrix.keys():
                data_matrix[(word, fl)] = 1
            else:
                data_matrix[(word, fl)] = data_matrix[(word, fl)] + 1
    self.data_matrix = dok_matrix((len(word_dict.keys()), len(pos_files)))
    for word, fl in data_matrix.keys():
        word_index = word_dict[word]
        doc_index = pos_files.index(fl)
        self.data_matrix[word_index, doc_index] = data_matrix[(word, fl)]
    savemat('dm.mat', mdict={'arr': self.data_matrix})
    self.vocabulary = list(word_dict.keys())
    self.data_matrix = self.data_matrix.transpose()
def unigramPerplexity():
    global filename, totalLines, tokens, index
    with open(filename) as file:
        perplexities = []
        for line in file:
            listOfWords = wordpunct_tokenize(line)
            l = len(listOfWords)
            prob = []
            for i in range(l):
                word = listOfWords[i]
                prob.append(wordDict[word][1] / float(tokens))
            per = 1
            for p in prob:
                per = per * p
            if per != 0:
                per = 1 / float(per)
            perplexities.append(pow(per, 1 / float(l)))
    PP = 0
    for i in perplexities:
        PP = PP + i
    PP = PP / float(len(perplexities))
    return PP
def tokenize_short_text(self, raw_tweet_text):
    tweet_text = raw_tweet_text
    # tweet_text = tweet_text.strip()
    # tweet_text = unidecode.unidecode(tweet_text)
    if self.args.use_lowercase:
        tweet_text = tweet_text.lower()
    if self.tokenizer > 0:
        if self.tokenizer == 1:
            uttterance_tokens = word_tokenize(tweet_text)
        if self.tokenizer == 2:
            uttterance_tokens = wordpunct_tokenize(tweet_text)
        if self.tokenizer == 3:
            uttterance_tokens = self.tweet_tokenizer.tokenize(tweet_text)
        if self.tokenizer == 4:
            tweet_text = clean(tweet_text)
            tweet_text = self.remove_accented_chars(tweet_text)
            uttterance_tokens = self.tweetokenizer.tokenize(tweet_text)
            uttterance_tokens = self.remove_duplicated_sequential_words(uttterance_tokens)
            uttterance_tokens = self.remove_stopwords(uttterance_tokens)
        if self.tokenizer == 5:
            tweet_text = tokenize(' '.join(self.tweet_tokenizer.tokenize(tweet_text)))
            return tweet_text
        if self.tokenizer == 6:
            tweet_text = clean(' '.join(self.tweet_tokenizer.tokenize(tweet_text)))
            return tweet_text
        if self.stem:
            uttterance_tokens = [list(map(self.stemmer.stem, sub)) for sub in uttterance_tokens]
        if self.lemmatize:
            uttterance_tokens = [[self.lemmatizer.lemmatize(tok, pos='v') for tok in sub] for sub in uttterance_tokens]
        tweet_text = " ".join(uttterance_tokens)
    return tweet_text
def normalize_hospital_name(name):
    """Normalizes a given hospital name.

    1. Converts all words to lower case.
    2. Removes all stopwords.

    Args:
        name -- Name to be normalized.

    Returns:
        slug -- The normalized hospital name.
    """
    normalized_name = name.lower()
    stopword_list = stopwords.words('english')
    filtered_words = [word for word in wordpunct_tokenize(normalized_name)
                      if word not in stopword_list]
    slug = slugify(' '.join(filtered_words))
    return slug
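# Illustrative call, assuming slugify comes from a package such as python-slugify
# and the NLTK stopword corpus is downloaded; the exact slug depends on the
# slugify implementation in use.
# normalize_hospital_name("The Hospital of St. John")
# -> something like "hospital-st-john" ('the' and 'of' are stopwords, '.' is slugified away)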
def createBigram():
    global filename, totalLines, tokens, index
    with open(filename) as file:
        for line in file:
            listOfWords = wordpunct_tokenize(line)
            l = len(listOfWords)
            if l != 0:
                word = listOfWords[0]
                key = str(["", word])
                if key not in matrix:
                    matrix[key] = 1
                else:
                    matrix[key] += 1
                # matrix[V][wordDict[word][0]] += 1
            for i in range(l - 1):
                word = listOfWords[i]
                next_word = listOfWords[i + 1]
                key = str([word, next_word])
                if key not in matrix:
                    matrix[key] = 1
                else:
                    matrix[key] += 1
def add_subword(sentence, subword_dict=None, additional_dict=None, dropout=1):
    """
    Builds a subword (character n-gram) count matrix for the words in the sentence.
    You can easily add entries via additional_dict: if a word contains the key,
    the count at that value (which must be an int index) is incremented by 1.
    dropout is 1 by default; if you're training you can change this to accommodate
    regularization. The value of dropout is the percentage to keep.
    """
    if subword_dict is None:
        subword_dict = {abc: i for i, abc in enumerate('abcdefghijklmnopqrstuvwxyz0123456789?')}
    if additional_dict is None:
        additional_dict = {}
        # examples
        additional_dict["en"] = 38
        additional_dict["sch"] = 39
    for key in additional_dict:
        value = additional_dict[key]
        assert type(value) == int
        subword_dict[key] = value
    max_value = find_max_sub_dict_value(subword_dict)
    sentence = [word for word in wordpunct_tokenize(sentence)]
    subword = np.zeros((len(sentence), max_value))
    for word_num, word in enumerate(sentence):
        for key in subword_dict:
            if word.__contains__(key):
                # keep the count with probability `dropout` (the percentage to keep)
                if np.random.rand() <= dropout:
                    subword[word_num, subword_dict[key]] += 1
    return subword
def tokenize_and_stem(text, stemmer=SnowballStemmer("english")):
    """Word and sentence tokenization function that utilizes the Snowball Stemmer

    Args:
        text (string): All the reviews for a single user concatenated into a single string
        stemmer (Stemmer): The stemmer to be used

    Returns:
        stems (list): The filtered and stemmed tokens
    """
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in wordpunct_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
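# Example call, assuming NLTK's punkt data is available. The Snowball stemmer
# lower-cases and clips suffixes, and purely numeric/punctuation tokens are dropped,
# so the output is roughly as shown.
# tokenize_and_stem("Great pizza! Friendly staff, 10/10.")
# -> ['great', 'pizza', 'friend', 'staff']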
def create_dtm(self):
    dtm = []
    for tweet in self.data:
        # Make empty row
        newrow = dict()
        for term in self.top_words.keys():
            newrow[term] = 0
        tweetwords = [self.porter.stem(i.lower())
                      for i in wordpunct_tokenize(tweet)
                      if i.lower() not in self.stop_words and not i.lower().startswith('http')]
        for word in tweetwords:
            if word in self.top_words.keys():
                newrow[word] += 1
        dtm.append(newrow)
    self.dtm = dtm
def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document[2:]):
        # Break the sentence into part of speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token
            # If stopword, ignore token and continue
            if token in self.stopwords:
                continue
            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue
            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
def _build_vocab(self):
    word_index = {}
    for doc in self.corpus:
        for sentence in sent_tokenize(doc):
            tokens = wordpunct_tokenize(sentence)
            tokens = [token.lower().strip() for token in tokens]
            tokens = [token for token in tokens if re.match('^[a-z]+$', token) is not None]
            for token in tokens:
                word_index[token] = word_index.get(token, 0) + 1
    filtered_word_index = {}
    # i = 0 for empty, 1 for OOV
    i = 2
    for word, count in word_index.items():
        if count >= Preprocess.MIN_WD_COUNT:
            filtered_word_index[word] = i
            i += 1
    print('Found %s unique tokens.' % len(filtered_word_index))
    return filtered_word_index
def parse_description(vid_text, nlp, parser):
    vid_text = sanitize_text(vid_text)
    raw_sentences = sentence_splitter.tokenize(vid_text)
    try:
        sentences = [' '.join([w for w in wordpunct_tokenize(s) if set(w) - punct_set]).replace(' .', '.')
                     for s in raw_sentences]
        # sentences = raw_sentences
        # print('here', sentences)
        # docs = [nlp(sent) for sent in sentences]
        # noun_phrase_chunks = {
        #     'chunks': [[(np.start, np.end) for np in doc.noun_chunks] for doc in docs],
        #     'named_chunks': [[np.text for np in doc.noun_chunks] for doc in docs]
        # }
        # constituent_parse = const_parse(vid_text, parser)
        constituent_parse = [list(i)[0] for i in parser.raw_parse_sents(sentences)]
        # return constituent_parse
        # print([s.leaves() for s in constituent_parse])
        noun_phrase_chunks = np_chunker(vid_text, constituent_parse)
    except IndexError:
        # sentences = [' '.join([w for w in word_tokenize(s) if set(w) - punct_set]).replace(' .', '.') for s in raw_sentences]
        constituent_parse = [list(i)[0] for i in parser.raw_parse_sents(raw_sentences)]
        noun_phrase_chunks = np_chunker(vid_text, constituent_parse)
    pos_tags = [sent.pos() for sent in constituent_parse]
    # pos_tags = [(token.text, token.pos_, token.string) for token in doc]
    pos_tags = [item for sublist in pos_tags for item in sublist]
    parses = {
        'noun_phrase_chunks': noun_phrase_chunks,
        'pos_tags': pos_tags,
    }
    return parses
def __init__(self, fileName):
    '''
    Parse the html document content to sentences
    :param htmlFileName: path of html file to be parsed
    '''
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # if fileName.endswith('.htm'):
    #     with open(fileName) as file:
    #         self.filePath = fileName
    #         html_doc = file.read()
    #         soup = BeautifulSoup(html_doc, "lxml")
    #         self.title = soup.title.string
    #
    #         # get contents from the html without section titles
    #         # ignoring tags like <h1> <h2>
    #         paragraphs = []
    #         for paragraph in soup.find_all('p'):
    #             paragraphs.append(paragraph.get_text())
    #         data = "\n".join(paragraphs)
    #         self.rawLines = tokenizer.tokenize(data)
    if fileName.endswith('.txt'):
        with open(fileName) as file:
            self.filePath = fileName
            self.title = file.readline()
            lines = file.readlines()
            lines = [line.strip().replace('\xe2\x80\x83', ' ') for line in lines if line]
            # doc = ' '.join(lines)
            self.rawLines = []
            for line in lines:
                self.rawLines += tokenizer.tokenize(line.decode('utf-8'))
            self.rawLines_stem = [' '.join([ps.stem(word) for word in wordpunct_tokenize(sentence)])
                                  for sentence in self.rawLines]
    else:
        print 'Error, unable to read file', fileName
def get_individual_rhymes(sonnets):
    all_rhymes = []
    for sonnet in sonnets:
        tokens = [wordpunct_tokenize(s) for s in sonnet]
        punct = set(['.', ',', '!', ':', ';', '?', '(', ')'])
        filtered = [[w for w in sentence if w not in punct] for sentence in tokens]
        last = [sentence[len(sentence) - 1] for sentence in filtered]
        # now that we have a list of the last words, check the sonnets
        # specifically if it is the ababcdcdefefgg or the other scheme
        if (len(last) == 14):
            pairs = [[last[0], last[2]], [last[1], last[3]],
                     [last[4], last[6]], [last[5], last[7]],
                     [last[8], last[10]], [last[9], last[11]],
                     [last[12], last[13]]]
            all_rhymes += pairs
        else:
            print(f"Some weird sonnet appeared with length {len(last)}!")
    return all_rhymes