def summaryGenerator(class_name, tweets, folder_mode, ranker):
    """Builds an abstractive summary for one class of tweets: cleans the
    tweets, generates candidate sentences from the word graph, loads the
    per-word scores, and selects the final sentences via ILP."""
    tweets = tweetCleaner(tweets)  # Some cleaning
    #print "Set of Tweets=>", len(tweets)
    #tweetlist=[tweet for tweet in tweets]
    #print "List of tweets", tweetlist
    genSentences = wg.retrieveNewSentences(tweets, stopwords)
    wordScores = createDict(mainDatafolder + "/" + folder_mode['Extract'] + "/"
                            + class_name + "/" + class_name + "_weight.txt")
    #emptysentences=[sent for sent in genSentences if len(sent.strip())==0]
    #print "EMPTY::::", len(emptysentences)
    # This is where the ILP works to select the best sentences and form the summary
    finalSentencesRetained = wg.solveILP(genSentences, wordScores, lm, stopwords,
                                         ranker, intraGenSimThreshold=0.25, l_max=200)
    return finalSentencesRetained
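# Usage sketch (illustrative, not part of the original script): summarize a
# hypothetical AIDR class. The class name and tweets below are placeholders;
# the call assumes the corresponding <class>_weight.txt file exists under
# mainDatafolder, and that folder_mode / rankingModes are the module-level
# dicts defined further down.
def _summaryGenerator_demo():
    sample_tweets = [u"Bridge collapsed near downtown #quake",
                     u"Rescue teams deployed to the affected area"]
    summary = summaryGenerator("infrastructure", sample_tweets,
                               folder_mode, rankingModes["C"])
    print " ".join(summary)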
print "Project dir", PROJECT_DIR RESOURCES_DIR=PROJECT_DIR+"/"+"resources/" ### THE actual work happens here##### def sentenceCapitalize(sent): sentences = sent.split(". ") sentences2 = [sentence[0].capitalize() + sentence[1:] for sentence in sentences] string2 = '. '.join(sentences2) return string2 english_postagger = POSTagger(RESOURCES_DIR+'jars/english-left3words-distsim.tagger',RESOURCES_DIR+'jars/stanford-postagger.jar', encoding='utf-8') #langModel=ARPALanguageModel("resources/lm_giga_20k_nvp_3gram.arpa") stopwords=WGGraph.load_stopwords(RESOURCES_DIR+"resources/stopwords.en.dat") lm = kenlm.LanguageModel(RESOURCES_DIR+'resources/lm-3g.klm') #numClusters=[25,30,35,40,45,50] numClusters=[1] allEvents=os.listdir(RESOURCES_DIR+"old_Clusters/"+"Cluster_Data") absdir="abstracts" if not os.path.exists(absdir): os.makedirs(absdir) for eventfile in allEvents: #if eventfile !="kuwait_number_Cluster.txt": # continue if os.path.isdir(RESOURCES_DIR+"old_Clusters/"+"Cluster_Data/"+eventfile): continue
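# Illustrative check of sentenceCapitalize (made-up input): only the first
# character of each ". "-separated sentence is upper-cased; the rest of
# each sentence is left untouched.
def _sentenceCapitalize_demo():
    print sentenceCapitalize("the bridge fell. rescue is underway")
    # -> "The bridge fell. Rescue is underway"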
import os, re, sys
import kenlm, codecs
from sentenceRanker import createDict
# Assumed import: "wg" is the word-graph module (WGGraph) used below; the
# original import statement is not part of this fragment.
import WGGraph as wg

reload(sys)
sys.setdefaultencoding('utf8')

PROJECT_DIR = os.path.dirname(__file__) + "/../../"
print "Project dir", PROJECT_DIR
RESOURCES_DIR = PROJECT_DIR + "resources/"
mainDatafolder = RESOURCES_DIR + "Summarization/"

### THE actual work happens here#####
#english_postagger = POSTagger(RESOURCES_DIR+'jars/english-left3words-distsim.tagger',RESOURCES_DIR+'jars/stanford-postagger.jar', encoding='utf-8')
#langModel=ARPALanguageModel("resources/lm_giga_20k_nvp_3gram.arpa")
stopwords = wg.load_stopwords(RESOURCES_DIR + "resources/stopwords.en.dat")
lm = kenlm.LanguageModel(RESOURCES_DIR + 'resources/lm-3g.klm')

# Folder layout for the AIDR data and the available sentence rankers.
folder_mode = {'Extract': 'AIDR_Extract', 'Original': 'AIDR_Original'}
rankingModes = {"C": "Centroid", "TR": "textrank", "CW": "contentWeighing"}


def sentenceCapitalize(sent):
    sentences = sent.split(". ")
    # Guard against empty fragments before indexing the first character.
    sentences2 = [sentence[0].capitalize() + sentence[1:] if sentence else sentence
                  for sentence in sentences]
    string2 = '. '.join(sentences2)
    return string2


def tweetCleaner(tweets):
    p = re.compile(r'http?:\/\/.*[\s\r\n]*', re.DOTALL)  # Regex to remove http URLs from tweets
    p2 = re.compile(r'(^|\s)#.+?\s', re.DOTALL)  # Regex to remove hashtags
    # Assumed completion (the original body is truncated here): apply both
    # patterns and return the cleaned tweets.
    return [p2.sub(' ', p.sub('', tweet)).strip() for tweet in tweets]
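# Quick demo of tweetCleaner on a made-up tweet; the output shown follows
# from the assumed completion above. The trailing URL and the leading
# hashtag are stripped, leaving the plain text.
def _tweetCleaner_demo():
    print tweetCleaner([u"#urgent Power outage downtown http://t.co/abc"])
    # -> [u"Power outage downtown"]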