def parse_book(book_file):
    # chapter titles are all caps and only one word
    title_pattern = re.compile("^[A-Z]+$")
    book = []
    chapter = []
    i = 0
    with open(book_file, 'r') as f:
        for line in f:
            line = line.rstrip()
            if line:
                if title_pattern.match(line):
                    # if there's something in the chapter, put it in the book
                    if chapter:
                        i += 1
                        chapter = ' '.join(chapter)
                        book.append(tb(chapter))
                        chapter = []
                else:
                    # preprocess line and put into chapter
                    line = preprocess(line)
                    chapter.append(line)
    # put the last chapter in the book
    i += 1
    chapter = ' '.join(chapter)
    book.append(tb(chapter))
    return book
def rankDocs(keywordList, doclistTuples):
    scores = {}
    docList = [tb(doc[1].decode('utf-8')) for doc in doclistTuples]
    for doc in doclistTuples:
        scores[doc[0]] = scoreDoc(keywordList, tb(doc[1].decode('utf-8')), docList)
    sortedDocs = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return sortedDocs[:10]
def setBlob(self, blob_):
    paragraph = filter(lambda x: x in printable, blob_)
    blob = tb(paragraph)
    newBlob = ""
    if self.stemming:
        for word in blob.words:
            newBlob += " " + stem(word.lower())
        self.blob = tb(newBlob)
    else:
        # keep the unstemmed blob when stemming is disabled
        self.blob = blob
def main():
    # Takes in command-line args, and sorts variables if necessary.
    parser = argparse.ArgumentParser(description='Analyze Blogs.', formatter_class=RawTextHelpFormatter)
    parser.add_argument('-b', '--blog',
                        help='Manually enter the blog text here as a string. Formatted like:\n\nauthor: "author\'s name"\ntitle: "title"\nblog: "blog text"',
                        default=None)
    parser.add_argument('-a', '--author', help="Enter the author's name as a string", default=None)
    parser.add_argument('-t', '--title', help="Enter the blog's title as a string", default=None)
    parser.add_argument('-i', '--inFile', help='Enter the path to a plain text file with the blog entry in it', default=None)
    args = parser.parse_args()

    # Save variables from command-line args
    newBlogFile = args.inFile
    newBlogText = args.blog
    newBlogAuthor = args.author
    newBlogTitle = args.title

    go = True
    while go:
        # The object below is a dictionary of 2 dictionaries, good and bad features, and their relevant metadata.
        # count is the number of times blogs have been passed through. This is necessary for updates.
        features = {"good": {"count": 0, "words": [], "names": 0.0, "religion": 0.0, "weaponry": 0.0, "government": 0.0},
                    "bad": {"count": 0, "words": [], "names": 0.0, "religion": 0.0, "weaponry": 0.0, "government": 0.0}}
        json_data = importJSON("Writings/writings.json")  # get JSON data, creating a dictionary-like object

        # Declare lists of writings
        badBlogList = []
        goodBlogList = []

        # Analyze the current data in the JSON file.
        for blog in json_data["writings"]["bad"]:
            badBlogList.append(tb(blog["post"]))
        for blog in json_data["writings"]["good"]:
            goodBlogList.append(tb(blog["post"]))

        analysisResults = analyzeBlogs(badBlogList)
        features["bad"]["count"] = len(badBlogList)
        features["bad"]["words"] = analysisResults.outputsWordsArray
        features["bad"]["names"] = analysisResults.namesScore
        features["bad"]["religion"] = analysisResults.religionScore
        features["bad"]["weaponry"] = analysisResults.weaponryScore
        features["bad"]["government"] = analysisResults.governmentScore

        analysisResults = analyzeBlogs(goodBlogList)
        features["good"]["count"] = len(goodBlogList)
        features["good"]["words"] = analysisResults.outputsWordsArray
        features["good"]["names"] = analysisResults.namesScore
        features["good"]["religion"] = analysisResults.religionScore
        features["good"]["weaponry"] = analysisResults.weaponryScore
        features["good"]["government"] = analysisResults.governmentScore

        print("Current writings in database have been analyzed...\nRunning comparisons against provided writing...\n----------------------------")

        newBlog = None
        # Analyze the new blog, from a file or from the command-line text
        if newBlogFile is not None:
            newBlog = buildNewBlog(newBlogFile)
        elif newBlogText is not None:
            newBlog = buildNewBlog(None, newBlogAuthor, newBlogTitle, newBlogText)
        if newBlog is not None:
            tempFeatures = {"words": [], "names": 0.0, "religion": 0.0, "weaponry": 0.0, "government": 0.0}
            analyzeNewBlog(newBlog.post, goodBlogList, badBlogList, features)

        print("Please enter another file for analysis, or 'quit' to quit.\n")
        newBlogFile = input('File path: ')
        if newBlogFile in ("quit", "Quit", "q"):
            go = False
            print("Closing program...")
def readcontent(self):
    with open('Cs.txt', 'r') as cs_file, open('Is.txt', 'r') as is_file, open('It.txt', 'r') as it_file:
        self.CS_Fild = cs_file.read().lower()
        self.Is_filed = is_file.read().lower()
        self.IT_field = it_file.read().lower()
    self.Cs = tb(self.CS_Fild)
    self.Is = tb(self.Is_filed)
    self.It = tb(self.IT_field)
    self.bloblist = [self.Cs, self.Is, self.It]
def extract(text):
    bloblist = []
    with open("clean_text.csv") as f:
        reader = csv.DictReader(f)
        for row in reader:
            bloblist.append(tb(row['post_text']))
    blob = tb(text)
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    words = ''
    for word, score in sorted_words[:15]:
        words += word + ' '
    return words
def analyzeNewBlog(blog, goodBlogList, badBlogList, features):
    # Get word densities of the new blog
    namesCount, religionCount, weaponryCount, governmentCount, wordCount = 0, 0, 0, 0, 0
    for word in tb(blog).words:  # iterate over words rather than characters
        wordCount += 1
        if word in terms.governmentTerms():  # increment counts based on content to find word densities
            governmentCount += 1
        if word in terms.weaponsTerms():
            weaponryCount += 1
        if word in terms.femaleNames() or word in terms.maleNames():
            namesCount += 1
        if word in terms.religiousTerms():
            religionCount += 1
    analysisOutputs = AnalysisObject(namesCount / wordCount,
                                     religionCount / wordCount,
                                     weaponryCount / wordCount,
                                     governmentCount / wordCount,
                                     None)

    # Compare to the analyzed ones.
    scores = {"good": 0.0, "bad": 0.0}
    for upperKey in features:
        print("\nComparing this blog to " + upperKey.upper() + " blogs:\n")
        for lowerKey in features[upperKey]:
            if lowerKey == "words":
                for word in features[upperKey][lowerKey]:
                    if word[0] not in terms.stopWords():
                        if word[0] in blog:
                            print("Word found in " + upperKey + " blog: " + word[0])
                            scores[upperKey] += word[1] * 100  # If a word is found, update the score relative to its TF-IDF score.
            elif lowerKey == "religion":
                # Compare the density of a term in the new blog to the density of that term in the analyzed blogs.
                scores[upperKey] -= abs(features[upperKey][lowerKey] - analysisOutputs.religionScore)
                print("Religion variance: " + str(abs(features[upperKey][lowerKey] - analysisOutputs.religionScore)))
            elif lowerKey == "government":
                scores[upperKey] -= abs(features[upperKey][lowerKey] - analysisOutputs.governmentScore)
                print("Government variance: " + str(abs(features[upperKey][lowerKey] - analysisOutputs.governmentScore)))
            elif lowerKey == "weaponry":
                scores[upperKey] -= abs(features[upperKey][lowerKey] - analysisOutputs.weaponryScore)
                print("Weaponry variance: " + str(abs(features[upperKey][lowerKey] - analysisOutputs.weaponryScore)))
            elif lowerKey == "names":
                scores[upperKey] -= abs(features[upperKey][lowerKey] - analysisOutputs.namesScore)
                print("Names variance: " + str(abs(features[upperKey][lowerKey] - analysisOutputs.namesScore)))

    print("\nFinal Scores:\n" + "Bad: " + str(scores["bad"]) + "\nGood: " + str(scores["good"]) + "\n")
    if abs(scores["good"] - scores["bad"]) < .5:
        print("This post does not trend towards 'good' or 'bad'.")
    else:
        if scores["good"] > scores["bad"]:
            print("This post has been marked as 'good'.")
            goodBlogList.append(tb(blog))  # Add the post to the blog list. If this program ran continuously, it would be included in the next base analysis.
        else:
            print("This post has been flagged as 'bad'.")
            badBlogList.append(tb(blog))
    print("\n---------------------------------------")
def get_tfidf_values(self, sentence):
    blob = tb(sentence)
    self.bloblist.append(blob)
    blob_list = self.bloblist[:]
    # blobList.append(blob)
    single_words = blob.words
    pairs = [Word(single_words[i] + ' ' + single_words[i + 1]) for i in range(len(single_words) - 1)]
    scores_pairs = {word: self.__tfidf__(word, blob, blob_list, 2) for word in pairs}
    sorted_words_pairs = sorted(scores_pairs.items(), key=lambda x: x[1], reverse=True)
    scores_single = {word: self.__tfidf__(word, blob, blob_list, 1) for word in blob.words}
    sorted_words_single = sorted(scores_single.items(), key=lambda x: x[1], reverse=True)
    # sorted_words = sorted(sorted_words_pairs + sorted_words_single, key=lambda x: x[1], reverse=True)
    ds = 0
    nmd = 0
    tec = 0
    for word, score in sorted_words_single:
        ds += self.__ds_check__(word) * score
        nmd += self.__nmd_check__(word) * score
        tec += self.__tec_check__(word) * score
    for word, score in sorted_words_pairs:
        ds += self.__ds_check__(word) * score
        nmd += self.__nmd_check__(word) * score
        tec += self.__tec_check__(word) * score
    return [ds, nmd, tec]
def get_tweet_info(tweet):
    processed_tweet = {
        'tweet_id': tweet.id_str,
        'created_by_id': tweet.user.id,
        'created_at': tweet.created_at,
        'text': tweet.text,
        'coordinates': tweet.coordinates,
        # Note: only returns a non-zero favorite_count for an original
        # tweet. We'd need to look up the original tweet itself to get
        # the favorite_count, which is possible.
        'favorite_count': tweet.favorite_count,
        'retweet_count': tweet.retweet_count
        # This favorited field only tells us if we, the authenticated user, have
        # favorited this tweet, which isn't that helpful.
        # 'favorited': tweet.favorited,
    }
    if 'hashtags' in tweet.entities:
        processed_tweet['hashtags'] = tweet.entities['hashtags']
    else:
        processed_tweet['hashtags'] = None
    if 'media' in tweet.entities:
        processed_tweet['media'] = tweet.entities['media']
    else:
        processed_tweet['media'] = None

    # Get sentiment
    blob = tb(tweet.text)
    sentiment = {'polarity': blob.sentiment.polarity,
                 'subjectivity': blob.sentiment.subjectivity}
    processed_tweet['sentiment'] = sentiment
    return processed_tweet
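# A minimal sketch of the TextBlob sentiment call used above, run on a plain
# string. It assumes only that tb is textblob.TextBlob; the tweet object and
# its fields come from the original code and are not needed here.
from textblob import TextBlob as tb

blob = tb("I love this! Absolutely fantastic.")
# .sentiment is a namedtuple: polarity in [-1.0, 1.0], subjectivity in [0.0, 1.0]
print(blob.sentiment.polarity, blob.sentiment.subjectivity)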
def extract(storyString):
    storyText = tb(storyString)
    results = []
    for sentence in storyText.sentences:  # split text into sentences
        results.append(analyze_sent_semantics(sentence))
    return results
def features_pos_tag(self):
    blob = tb('.'.join([self.title, self.short, self.need, self.essay]))
    counts = Counter(tag for word, tag in blob.tags)
    total = sum(counts.values())
    ratio_dict = tag_dict.copy()
    ratio_dict.update(dict((word, float(count) / total) for word, count in counts.items()))
    return tuple(map(lambda k: ratio_dict[k], tag_list))
def __init__(self, graph):
    self.bloblist = []
    for node in graph.nodes():
        try:
            self.bloblist.append(tb(graph.node[node]['abstract']))
        except:
            print "No abstract for node ", node
def stemming(doc):
    d = toker.tokenize(doc)
    d = [k for k in d if k not in cachedStopWords]
    for i in range(0, len(d)):
        d[i] = lemma.lemmatize(d[i])
    return tb(" ".join(d))
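# The snippet above relies on module-level toker, cachedStopWords, and lemma
# objects that are not shown here. A plausible setup sketch, assuming the usual
# NLTK classes (an assumption, not the original configuration):
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

toker = RegexpTokenizer(r'\w+')                     # word-level regex tokenizer
cachedStopWords = set(stopwords.words('english'))   # cached so the list isn't rebuilt per call
lemma = WordNetLemmatizer()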
def buildTestData(self):
    self.testBloblist = {}
    for key, value in self.dev.iteritems():
        content = '. '.join(self.dev[key]['content'])
        content = content.replace('..', '.')  # str.replace returns a new string, so reassign
        self.testBloblist[key] = tb(content)
    self.testBloblistLength = len(self.testBloblist)
def main():
    # Command line args are in sys.argv[1], sys.argv[2] ...
    # sys.argv[0] is the script name itself and can be ignored
    dataList = []
    for f in os.listdir('documents'):
        filePath = os.path.join('documents', f)
        fileName, fileExtension = os.path.splitext(filePath)
        if fileExtension.lower() == '.docx':
            # it's a word document
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)
        elif fileExtension.lower() == '.pdf':
            # it's a pdf document
            # with open(filePath) as f:
            #     doc = slate.PDF(f)
            #     print doc[1]
            # TODO
            pass
        elif fileExtension.lower() in ('.html', '.htm'):
            # it's an html file
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' +
                                      soup.text.replace('\n', ' ').replace('\r', '')))
            dataList.append(final)
            # print '*** TITLE *** \n\"' + title + '\"\n'
            # print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            # undetected document type
            pass

    cachedStopWords = stopwords.words("english")
    combined = ' '.join(dataList)
    bloblist = [tb(combined)]
    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, blob, bloblist)
                  for word in blob.words if word not in cachedStopWords}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
def main():
    summary_text = eval(open("summaryList.txt").read())
    input_text = eval(open("input.txt").read())
    summarycommentlist = []
    inputcommentlist = []
    for comment in summary_text:
        summarycommentlist.append(tb(' '.join(get_nonstop_words(comment))))
    for comment in input_text:
        inputcommentlist.append(tb(' '.join(get_nonstop_words(comment))))
    tf_idf_summary = calculate_tfidf_average(summarycommentlist)
    tf_idf_input = calculate_tfidf_average(inputcommentlist)
    print "Retention Rate = ", tf_idf_summary / tf_idf_input
def rankSentences(keywordList, doclist):
    scores = {}
    docList = [tb(doc[1].decode('utf-8')) for doc in doclist]
    keywordIDFs = {}
    for w in keywordList:
        keywordIDFs[w] = idf(w, docList)
    for doc in doclist:
        text = tb(doc[1].decode('utf-8'))
        sums = 0
        for w in keywordList:
            sums = sums + (tf(w, text) * keywordIDFs[w])
        scores[doc[0]] = {'score': sums, 'object': Sentence}
    bestMatches = sorted(scores.items(), key=lambda x: x[1]['score'], reverse=True)
    # return the top-ranked sentences (mirrors the top-10 cutoff used in rankDocs)
    return bestMatches[:10]
def make_bloblist(bloblist):
    f = open('/home/ashar/nltk_data/corpora/abc/rural.txt', 'r')
    var = f.read()
    var = var.lower()
    splat = var.split("\n\n")
    for i in splat:
        temp = tb(i.decode('utf-8'))
        bloblist.append(temp)
def preprocess(doc, stopwordList):
    blob = tb(doc.lower())
    blobWords = blob.words
    blobNoStop = [w for w in blobWords if w not in stopwordList]
    stems = blobNoStop
    for i in range(len(stems)):
        stems[i] = PorterStemmer().stem_word(stems[i])
    clean = " ".join(stems)
    return clean
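# Note: PorterStemmer().stem_word() comes from older NLTK releases; newer NLTK
# exposes stem() instead. A sketch of the same stemming step under that
# assumption, reusing one stemmer instance rather than constructing one per word:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stems = [stemmer.stem(w) for w in blobNoStop]  # blobNoStop as built above
clean = " ".join(stems)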
def input_tags(infile):
    with codecs.open(infile, 'r', encoding='utf-8', errors='ignore') as f:
        x = []
        for line in f:
            pid = line.split(',')[0]
            tag = line.split(',')[1]
            try:
                x = tb(str(pid)) + '\t' + tb(str(tag))
            except:
                x = tb(str(pid))
            try:
                both = {pid: x.split('\t')[1]}
            except:
                continue
            tag_dict.update(both)
            tag_dict_values.append(tag)
            words_tuple = pid, tag
            ww.append(words_tuple)
def parseQuery(q):
    blob = tb(q.lower())
    posTags = blob.tags
    keepTags = ['NNP', 'NNPS', 'NN', 'NNS', 'JJ', 'JJR', 'JJS', 'CD']
    keywords = [w[0] for w in posTags if w[1] in keepTags]
    stems = keywords
    for i in range(len(stems)):
        stems[i] = PorterStemmer().stem_word(stems[i]).encode('utf-8')
    return stems
def analyze_sent_semantics(sentenceBlob):
    tagged_s = tb(" ".join(prepare_text(sentenceBlob))).tags
    sent_tree = bigram_chunker.parse(tagged_s)
    sent_tree = treeToJSON(sent_tree)  # convert to a format that we can work with

    # verify the verb phrases
    for p in range(0, len(sent_tree)):
        phrase = sent_tree[p]
        if phrase["label"] == "VP":
            verbCount = 0
            for w in phrase["text"]:
                if w[1].find("VB") > -1:
                    verbCount += 1
            if verbCount == 0:
                phrase["label"] = "PSEUDO-VP"
    # print(sent_tree)

    predicted_subject = []
    predicted_verb = str()
    predicted_actionable_noun = str()
    for ph in range(0, len(sent_tree)):
        p = sent_tree[ph]
        if p["label"] == "NP" or (p["label"] == "PP" and (ph - 1 > -1 and sent_tree[ph - 1]["label"] == "NP")):
            for t in p["text"]:
                predicted_subject.append(t)
        if p["label"] == "VP":
            predicted_verb = stringifyTree(p["text"])
            # iterate over everything after the predicate
            for o_i in range(ph, len(sent_tree)):
                o = sent_tree[o_i]
                if o["label"] == "NP" or (o["label"] == "PP" and (o_i - 1 > -1 and sent_tree[o_i - 1]["label"] == "NP")):
                    predicted_actionable_noun = o["text"]
                    break
                if o["label"] == "PP" and stringifyTree(sent_tree[o_i - 1]["text"]) == predicted_verb:
                    predicted_verb += " " + stringifyTree(o["text"])
                    break

    # print("Subject: " + stringifyTree(predicted_subject))  # what we think the subject might be
    # print("Predicate: " + predicted_verb)
    # print("Object: " + stringifyTree(predicted_actionable_noun))
    semantics_analysis = {
        "raw_subject": stringifyTree(predicted_subject),
        "simple_subject": simplifyTree(predicted_subject),
        "predicate": predicted_verb,
        "raw_object": stringifyTree(predicted_actionable_noun),
        "simple_object": simplifyTree(predicted_actionable_noun)
    }
    return semantics_analysis
def normalize(data):
    bloblist = []
    for it in data:
        lines = list(map(lambda x: str(x.string), it[1]))
        inp_string = tokenizer.array_to_string(lines)
        tok = tokenizer.tokenize_text(inp_string)
        tok = tokenizer.remove_digits(tok)
        tok = tokenizer.get_normal_forms(tok)
        bloblist.append(tb(tokenizer.array_to_string(tok)))
        log(" normalize " + str(it[0]))
    return bloblist
def __init__(self, block_of_text, publication, key):
    self.publication = publication
    self.summary = block_of_text[0]
    self.keywords = block_of_text[1]
    self.logic = block_of_text[2]
    if publication.is_booklike:
        self.pages = block_of_text[3]
        self.text = block_of_text[4]
    else:
        self.text = block_of_text[3]
    publication.Biblio.textblobcorpus.append(tb(self.text))
    for word in self.keywords.split(", "):
        key.add_word(word, self)
def extract(self, graph, abs1, abs2, num_concepts):
    textblob = ''
    textblob += abs1.lower()
    textblob += abs2.lower()
    textblob = tb(textblob)
    scores = {word: tfidf(word, textblob, self.bloblist) for word in textblob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1])
    for word, score in sorted_words[:-1 * num_concepts]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
    a = [x[0] for x in sorted_words[:-1 * num_concepts]]
    return a[-num_concepts:]
def calculate_tfidf_all_docs(list_of_docs):
    if len(list_of_docs) == 0:
        return
    bloblist = [tb(doc.full_text_no_stop) for doc in list_of_docs]
    for index in range(len(bloblist)):
        words_checked = []
        for word in bloblist[index].words:
            if word not in words_checked:
                SearchTerm.SearchTerm(tfidf=tfidf(word, bloblist[index], bloblist),
                                      word=word,
                                      document=list_of_docs[index])
                words_checked.append(word)
    return
def extractSummary(self, devPath, outFileName):
    self.dev = json.load(open(devPath, 'r'))
    self.buildTestData()
    out = {}
    c = {0: 0, 1: 0, 2: 0}
    for i, blob in self.testBloblist.iteritems():
        cn = self.getCategoryNumber(blob)
        c[cn] += 1
        sentenceList = self.reg.split(unicode(blob))
        sentenceRankDict = {}
        tfw = self.tf(blob)
        for j in range(0, len(sentenceList)):
            sentence = tb(sentenceList[j])
            sentenceRank = 0
            for word in sentence.words:
                word_stem = stemmer_test.get_stem(word)
                if word_stem in self.wordDfDict:
                    tf = tfw[word_stem]
                    df = self.wordDfDict[word_stem]
                    tfIdf = tf * self.computeIdf(df + 1)
                    gss = 0
                    if word in self.gss:
                        gss = tf * self.gss[word][cn]
                    sentenceRank += (tfIdf + gss)
            if sentenceRank != 0:
                sentenceRankDict[sentence] = [sentenceRank, j]
        topSentences = sorted(sentenceRankDict.items(), key=lambda x: x[1][0], reverse=True)

        # deciding: select 20% of the article, with min = 4 and max = 6 sentences
        topSentencesToFile = ""
        numberOfSentence = int(math.floor(0.2 * len(sentenceList)))
        if numberOfSentence > 6:
            numberOfSentence = 6
        elif numberOfSentence < 4:
            numberOfSentence = 4
        topSentences = sorted(topSentences[:numberOfSentence], key=lambda x: x[1][1])
        for sentence, sentenceNumber in topSentences:
            topSentencesToFile += format(sentence) + ". \n"
        out[i] = {"text": topSentencesToFile}

        articleNumber = i
        sentencesToFile = ""
        for sentence in sentenceList:
            sentencesToFile += format(sentence) + ". \n"
        t = outFileName.split(".")[0]
        self.writeToFile(str(articleNumber) + t, sentencesToFile, topSentencesToFile)

    print c
    outfileName = "systemStemmer_" + outFileName
    with open(outfileName, 'w') as outfile:
        json.dump(out, outfile)
def createDict(jsonData):
    Dict = {}
    Blob = []
    for i in range(0, len(jsonData)):
        content = ''.join(jsonData[i]['content'])
        Blob.append(tb(content))
    for i, blob in enumerate(Blob):
        for word in set(blob.words):
            if word not in Dict:
                Dict[word] = 0
            Dict[word] += 1.0
    return Dict
def analyze_sent(sent, debug=True):
    ngtags = tb(sent).tags
    sScore = 0
    if debug:
        print ngtags
    for w, pos in ngtags:
        if pos in ['DT', 'VBZ', 'PRP', 'NNS']:
            continue
        if w in dat:
            s = (5000 - int(dat[w]['happiness_rank'])) * 1.0 / 5000
            print w, ': ', s
            sScore += s
    return sScore
def tfidfWrapper(tweets):
    bloblist = [tb(tweet) for tweet in tweets]
    scores = []
    for i, blob in enumerate(bloblist):
        # print("document -->{}".format(blob))
        score = {word: tfidf(word, blob, bloblist) for word in blob.words}
        # sorted_words = sorted(score.items(), key=lambda x: x[1], reverse=True)
        # for word, s in sorted_words[:3]:
        #     print("\tWord: {}, TF-IDF: {}".format(word, round(s, 5)))
        scores.append(score)
    return scores
def main(input_data):
    file = input_data
    document1 = tb("""
    More and more people use computers, but not everyone agrees that this benefits society. Those who support advances in technology believe that computers have a positive effect on people. They teach hand-eye coordination, give people the ability to learn about faraway places and people, and even allow people to talk online with other people. Others have different ideas. Some experts are concerned that people are spending too much time on their computers and less time exercising, enjoying nature, and interacting with family and friends. Write a letter to your local newspaper in which you state your opinion on the effects computers have on people. Persuade the readers to agree with you
    """)
    top_words_question = tfidf_main(document1)
    essays = pd.read_csv(file)

    download_dir = "/home/compute/work/aee/essay_evaluation_codes/domain1/0.1/all_0.1.csv"  # where you want the file to be downloaded to
    _csv_file = open(download_dir, "w")

    # Headers for CSV
    headers = [
        'Essay',
        'pos_unique',
        'misspelled words',
        'coordinating_conjuctions',
        'words',
        'characters',
        'min_st_sum',
        'max_st_sum',
        'c_centrality',
        'density_diff',
        'top_words_comp',
        'common_length'
    ]
    writer = csv.DictWriter(_csv_file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_ALL, fieldnames=headers,
                            extrasaction='ignore')
    writer.writeheader()

    # Counter variable to know essay index
    essay_counter = 1
    for index, row in essays.iterrows():
        s1 = row['essay']
        s1 = s1.decode("utf-8", "ignore")
        s1 = s1.encode('ascii', 'ignore')
        # results = get_results(s1)
        # negate_matrix(results, factor=1)

        s1_tfidf = tb(s1)
        top_words_essay = tfidf_main(s1_tfidf)
        comp_results = compare_topwords(top_words_question, top_words_essay)
        print 'top words diff'
        print comp_results

        pos_unique, count, ex_there_length, s_adj_length, pdt_length, c_conj_length, c_adj_length, s_adv_length, words_len, characters, common, average = syntax_results(s1)

        results, results_all = get_results(s1)
        # generate_graph(results_all, view=0)
        # generate_graph(results, view=0)
        negate_results = negate_matrix(results, factor=1)
        # generate_graph(negate_results, view=0)

        mst = minimum_spanning_tree(csr_matrix(results_all)).toarray().astype(float)
        # generate_graph(mst, view=3)
        mst_sum1 = str(mst_sum(list(mst)))
        max_st = minimum_spanning_tree(csr_matrix(negate_matrix(results_all, 100))).toarray().astype(float)
        # generate_graph(max_st, view=3)
        max_st_sum1 = str(max_st_sum(list(max_st)))

        A_neg_all = numpy.matrix(negate_matrix(results_all, factor=1))
        G_neg_all = nx.from_numpy_matrix(A_neg_all)
        A_neg = numpy.matrix(negate_matrix(results, factor=1))
        G_neg = nx.from_numpy_matrix(A_neg)
        c_centrality = str(closeness_centrality(G_neg, u=None, distance=None, normalized=True))
        # diameter1 = str(diameter(G_neg, e=None))
        final_density = (density(G_neg_all)) - (density(G_neg))
        # radius1 = str(radius(G_neg, e=None))
        # center1 = str(center(G_neg_all, e=None))
        # dispersion1 = str(dispersion(G_neg, u=None, v=None, normalized=True, alpha=1.0, b=0.0, c=0.0))
        # eigen = str(eigen1(G_neg_all))

        output = {
            'Essay': s1,
            'pos_unique': str(pos_unique),
            'misspelled words': str(count),
            'coordinating_conjuctions': str(c_conj_length),
            'words': str(words_len),
            'characters': str(characters),
            'common_length': str(common),
            'min_st_sum': mst_sum1,
            'max_st_sum': max_st_sum1,
            'c_centrality': c_centrality,
            'density_diff': final_density,
            'top_words_comp': comp_results
        }
        writer.writerow(output)

        essay_counter += 1
        print "essay number"
        print essay_counter
def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob)


def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))


def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)


document1 = tb("""Queen to receive £6m pay increase from public funds
The Sovereign Grant, which pays for the salaries of her household, official travel and upkeep of palaces, is to increase by more than £6m in 2018/19.
It comes as accounts revealed the Queen's official net expenditure last year increased by £2m, to almost £42m.
Sir Alan Reid, Keeper of the Privy Purse, said the Queen represented "excellent value for money".""")

document2 = tb("""Python, from the Greek word (πύθων/πύθωνας), is a genus of nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are recognised.[2] A member of this genus, P. reticulatus, is among the longest snakes known.""")

document3 = tb("""The Colt Python is a .357 Magnum caliber revolver formerly manufactured by Colt's Manufacturing Company of Hartford, Connecticut. It is sometimes referred to as a "Combat Magnum".[1] It was first introduced in 1955, the same year as Smith & Wesson's M29 .44 Magnum. The now discontinued Colt Python targeted the premium revolver market segment. Some firearm collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy""")
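# A minimal usage sketch for the helpers above, assuming a tf(word, blob)
# counterpart like the one in the next snippet and tb bound to textblob.TextBlob
# (a sketch, not the original driver code):
bloblist = [document1, document2, document3]
for i, blob in enumerate(bloblist):
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    top_three = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:3]
    print("Top words in document {}: {}".format(i + 1, top_three))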
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)


def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)


def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))


def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)


doc1 = tb("""배우 남궁민(39)이 SBS TV 드라마 '조작'(극본 김현정, 연출 이정흠)에 출연 확정했다고 소속사 935엔터테인먼트가 20일 밝혔다. '조작'은 사회 부조리를 파헤치는 기자들의 이야기를 그린다. 남궁민은 사고뭉치 기자 '한무영'을 맡는다. 기자였던 형이 비리를 고발하다 억울하게 죽는 모습을 본 후 복수를 위해 직접 기자가 된 인물이다. 소속사는 "전작 '김과장'이 많은 사랑을 받아 차기작을 결정하는 데 많은 고민이 있었다. '조작'은 '김과장' 때와 달리 남궁민의 진지하고 카리스마 넘치는 매력을 보여줄 드라마"라고 말했다. 한편 '조작'은 2015년 방송된 SBS 2부작 드라마 '너를 노린다'에서 호흡을 맞춘 이정흠 PD와 김현정 작가가 다시 한번 의기투합한 작품이다. 드라마는 '엽기적인 그녀' 후속으로 7월 방송 예정이다.
""")

doc2 = tb("""[스포츠조선닷컴 이지현 기자] tvN 금토드라마 '시카고 타자기' 제작진이 5, 6회 방송에 대한 힌트를 전해 관심을 모으고 있다. 스토리가 전개될수록 흥미를 더하고 있는 '시카고 타자기'는 앞서 지난 3, 4회에서 유령작가 유진오(고경표 분)의 존재를 발견하고, 이로 인해 혼란을 거듭해 가는 스타작가 한세주(유아인 분)의 모습이 그려졌다. 특히 시종일관 여유로움을 잃지 않는 유진오의 캐릭터가 흥미를 더했다. 유진오는 진짜 그의 이름이 아닌, 한세주의 집필실에 걸려 있던 극작가 '유진 오닐'의 초상화를 보고 급조한 이름이었고, 한세주가 그를 밧줄로 꽁꽁 묶어 두었지만 그는 손쉽게 탈출했다. 또한 그는 전설(임수정 분)을 따라다니며 먼 발치에서 아련하게 바라보는 모습으로 캐릭터에 대한 호기심을 더욱 끌어올렸다. 제작진은 "예민하고 까칠한 한세주가 여유로운 성격의 유진오와 부딪치며 만들어지는 호흡이 굉장히 재미있다. 유진오라는 매력적인 캐릭터의 등장은 전설에 대한 한세주의 감정을 본격적으로 시작하게 하는 기폭제 역할도 하게 될 것"이라고 밝혔다. 이어 "이번 주 '시카고 타자기' 5, 6회에서는 그간 제시됐던 복선들의 물꼬가 터질 예정이다. 시청자가 가장 궁금해 하는 두 가지가 해소되면서 스토리의 퍼즐 조각이 조금씩 맞춰질 것"이라고 전해 기대감을 높였다. 한편 tvN '시카고 타자기'는 슬럼프에 빠진 베스트셀러 작가 '한세주'와 그의 이름 뒤에 숨은 유령작가 '유진오', 한세주의 열혈 팬에서 안티 팬으로 돌변한 작가 덕후 '전설', 그리고 의문의 오래된 타자기와 얽힌 세 남녀의 미스터리한 앤티크 로맨스를 그린다. '킬미 힐미', '해를 품은 달'의 진수완 작가, '공항 가는 길' 김철규 감독을 비롯해 유아인, 임수정, 고경표 등 최고의 배우들이 모인 드라마로 뜨거운 관심을 얻고 있다. 21일(금) 저녁 8시 5회 방송.
""")

doc3 = tb("""[스포츠조선닷컴 정유나 기자] '발칙한 동거 빈방있음'의 집주인 피오가 '누나 전용 안전바'로 변신했다. 김신영-홍진영과 함께한 첫 동거에서 순둥 막둥이 '피요미'로 귀여움을 한 몸에 받았던 집주인 피오가 놀이기구 위 겁에 질린 누나들을 다독이며 자신의 무서움도 잊은 채 '누나 전용 안전바'로 변신하는 '기습 심쿵 남동생 스킬'을 보여준 것. 오는 21일 방송되는 MBC 스타 리얼 동거 버라이어티 '발칙한 동거 빈방있음'(연출 최윤정/ 이하 발칙한 동거)에서는 프로 반칙러이자 겁쟁이 3인방으로 등극한 현실 삼남매 케미 3인방 피오-김신영-홍진영의 좌충우돌 놀이공원 봄 나들이 2탄이 공개된다. 지난 방송에서 무서운 놀이기구 탑승을 피하기 위해 각종 반칙을 일삼으며 웃음을 자아냈던 피오-김신영-홍진영이 또 다시 놀이기구에 탑승한 모습이 포착돼 폭소를 유발하고 있는 가운데 누나들의 마음을 심쿵하게 만드는 피오의 모습이 시선을 강탈한다. 탑승 전부터 홍진영은 "언니 때문에 지금 이지경인 거 아니야!"라며 무서움을 참지 못하고 설움을 폭발 시켰고, 김신영은 잔뜩 겁에 질린 멍한 표정으로 놀이기구에 올랐다. 공개된 사진 속에는 가장 겁이 많은 홍진영과 함께 나란히 롤러코스터에 탑승한 피오가 "내가 잡고 있어! 괜찮아!"라며 잔뜩 고개를 숙인 홍진영을 다독이며, 아찔한 높이에 롤러코스터가 도착하자 홍진영 앞으로 자신의 팔을 내밀어 '누나 전용 안전바'를 자처한 모습이 담겨 있어 보는 이들을 셀레게 만든다. 또한 먹이를 먹는 기린의 모습에 깜짝 놀란 귀여운 피오의 모습과 함께 홀로 롤러코스터에 탄 큰 누나 김신영이 세차게 부는 바람을 온 얼굴로 받아내는 모습까지 공개돼 보는 이들을 폭소케 만들고 있다. '심쿵 남동생 스킬'을 선보인 피오의 반전 매력과 언제나 흥과 에너지 넘치는 현실 삼남매 케미를 보여주고 있는 이들의 폭소 유발 현장이 담긴 놀이공원 봄 나들이 2탄은 오는 21일 방송되는 '발칙한 동거 빈방있음'을 통해 확인할 수 있다.""")
def clean_words(rm_words, query):
    wrds = query.lower()
    for word in rm_words:
        wrds = wrds.replace(word, "")
    return wrds


remove_words = [
    "harry", "potter", "ron", "weasley", "hermione", "granger", "hogwarts", ","
]
potter_years = [2011, 2010, 2009, 2007, 2005, 2004, 2002, 2001]
potter_summaries = []

for year in potter_years:
    r = requests.get('http://www.omdbapi.com/?apikey=ad492b8&t=harry+potter&y=' + str(year) + '&plot=full')
    plot = unicodedata.normalize('NFKD', r.json()["Plot"]).encode('ascii', 'ignore')
    potter_summaries.append(tb(clean_words(remove_words, plot)))

for i, blob in enumerate(potter_summaries):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, potter_summaries) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
def __init__(self, dc):
    s = requests.session()
    for d in dc.DocList:
        # break each doc into its entities, remove stopwords
        candidate = re.sub(r"[^\w]", " ", d.line).split()  # get document line by line
        ent = []
        con = []
        for c in candidate:
            if c in stopwords:
                continue
            else:
                if isConcept(c):
                    con.append(c)
                else:
                    ent.append(c)
        ent = list(set(ent))  # repetitions are not significant in short text

        uselessEnt = []
        EtoConPerDoc = []
        for e in ent:
            enTocon = getConcepts(e, s)
            # print("Great", enTocon)
            # for k, v in enTocon.items():
            #     if k in self.pecConcepts:
            #         self.pecConcepts[k].append(v)
            #     else:
            #         l = []
            #         l.append(v)
            #         self.pecConcepts.update({k: l})
            enTocon = list(enTocon.keys())
            if not enTocon:
                uselessEnt.append(e)
            else:
                EtoConPerDoc += enTocon

        # Per-document concept disambiguation
        lister = []
        for l in EtoConPerDoc:
            lister.append(l)
        Cdict = dict(collections.Counter(lister))
        Cdict = {k: v for k, v in Cdict.items()}  # if v > 1 (optional)
        EtoConPerDoc = list(Cdict.keys())
        ent = list(set(ent) - set(uselessEnt))
        EntoCon = list(set(EtoConPerDoc + list(set(con))))

        # self.E_bloblist.append(tb(' '.join(ent)))
        # self.C_bloblist.append(tb(' '.join(con)))
        # self.AllConcepts.extend(EntoCon)
        self.E_bloblist.append(tb(' '.join(ent)))
        print("Hello", self.E_bloblist)
        self.C_bloblist.append(tb(' '.join(EntoCon)))
        self.AllConcepts.extend(EntoCon)
        print(self.AllConcepts)

    for i, blob in enumerate(self.E_bloblist):
        E_scores = {
            word: tfidf(word, blob, self.E_bloblist)
            for word in blob.words
        }
        self.Entities.append(E_scores)

    # pC = {}
    # for k, v in self.pecConcepts.items():
    #     pC.update({k: sum(v) / len(v)})
    # self.pecConcepts = pC

    for blob in self.C_bloblist:
        C_scores = {
            word: tfidf(word, blob, self.C_bloblist)
            for word in blob.words
        }
        # C_scores_update = {k: v * self.pecConcepts[k] for k, v in C_scores.items() if k in self.pecConcepts}
        # amplifying the unambiguous concepts
        # C_scores.update(C_scores_update)
        self.Concepts.append(C_scores)
def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))


# product of tf and idf, which computes the TF-IDF score
def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)


# load credit card usage categories data
df = pd.read_csv('data/infected/infected.csv')

# store the data in a bloblist
bloblist = []
for index, row in df.iterrows():
    bloblist.append(tb(str(row[3]) + str(row[4])))

# for each customer, calculate the most important words based on the credit card usage category list
df_result = pd.DataFrame(columns=['word', 'score'])
for i, blob in enumerate(bloblist):
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    res_str = ''
    for word, score in sorted_words:
        # tmp_str = " | %s:%s" % (word, round(score, 4))
        # res_str += tmp_str
        row = {'word': word, 'score': score}
        # print(row)
        df_result = df_result.append(row, ignore_index=True)
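# Note: DataFrame.append was removed in pandas 2.0. On newer pandas, an
# equivalent sketch collects the rows in a list and builds the frame once
# (an assumption about the environment, not part of the original script):
rows = []
for i, blob in enumerate(bloblist):
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    for word, score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
        rows.append({'word': word, 'score': score})
df_result = pd.DataFrame(rows, columns=['word', 'score'])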