Example No. 1
def summaryGenerator(class_name, tweets, folder_mode, ranker):
    tweets=tweetCleaner(tweets) #Some cleaning
    #print "Set of Tweets=>", len(tweets)
    #tweetlist=[tweet for tweet in tweets]
    #print "List of tweets", tweetlist
    genSentences=wg.retrieveNewSentences(tweets, stopwords)
    wordScores=createDict(mainDatafolder+"/"+folder_mode['Extract']+"/"+class_name+"/"+class_name+"_weight.txt")

    #emptysentences=[sent for sent in genSentences if len(sent.strip())==0]
    #print "EMPTY::::", len(emptysentences)
    '''
    This is where the ILP works to select the best sentences and form the summary
    '''
    finalSentencesRetained=wg.solveILP(genSentences,wordScores,
                                            lm, 
                                            stopwords, 
                                            ranker,
                                            intraGenSimThreshold=0.25, 
                                            l_max=200
                                            )
    
    return finalSentencesRetained
Example No. 2
def summaryGenerator(class_name, tweets, folder_mode, ranker):
    tweets = tweetCleaner(tweets)  #Some cleaning
    #print "Set of Tweets=>", len(tweets)
    #tweetlist=[tweet for tweet in tweets]
    #print "List of tweets", tweetlist
    genSentences = wg.retrieveNewSentences(tweets, stopwords)
    wordScores = createDict(mainDatafolder + "/" + folder_mode['Extract'] +
                            "/" + class_name + "/" + class_name +
                            "_weight.txt")

    #emptysentences=[sent for sent in genSentences if len(sent.strip())==0]
    #print "EMPTY::::", len(emptysentences)
    '''
    This is where the ILP works to select the best sentences and form the summary
    '''
    finalSentencesRetained = wg.solveILP(genSentences,
                                         wordScores,
                                         lm,
                                         stopwords,
                                         ranker,
                                         intraGenSimThreshold=0.25,
                                         l_max=200)

    return finalSentencesRetained
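Both listings above show the same entry point: the tweets are cleaned, the word-graph module (wg) generates candidate sentences, per-class word weights are loaded with createDict, and wg.solveILP selects the subset that forms the summary (intraGenSimThreshold limits redundancy between generated sentences, l_max caps the summary length). A minimal driver sketch, assuming solveILP returns a list of sentence strings; the input file, class name and ranker value are illustrative assumptions, not part of the original listings:

# Hypothetical driver; file path, class name and ranking mode are assumptions.
import codecs

def summarizeEvent(class_name, tweet_file, ranker="Centroid"):
    # One tweet per line is assumed for the input file.
    with codecs.open(tweet_file, "r", encoding="utf-8") as fp:
        tweets = [line.strip() for line in fp if line.strip()]
    folder_mode = {'Extract': 'AIDR_Extract', 'Original': 'AIDR_Original'}
    sentences = summaryGenerator(class_name, tweets, folder_mode, ranker)
    return " ".join(sentences)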
Example No. 3
print "Project dir", PROJECT_DIR
RESOURCES_DIR=PROJECT_DIR+"/"+"resources/"
### THE actual work happens here#####
def sentenceCapitalize(sent):
    sentences = sent.split(". ")
    sentences2 = [sentence[0].capitalize() + sentence[1:] for sentence in sentences]
    string2 = '. '.join(sentences2)
    return string2


english_postagger = POSTagger(RESOURCES_DIR+'jars/english-left3words-distsim.tagger',RESOURCES_DIR+'jars/stanford-postagger.jar', encoding='utf-8')
#langModel=ARPALanguageModel("resources/lm_giga_20k_nvp_3gram.arpa")

stopwords=WGGraph.load_stopwords(RESOURCES_DIR+"resources/stopwords.en.dat")  
lm = kenlm.LanguageModel(RESOURCES_DIR+'resources/lm-3g.klm')

#numClusters=[25,30,35,40,45,50]
numClusters=[1]
allEvents=os.listdir(RESOURCES_DIR+"old_Clusters/"+"Cluster_Data")
absdir="abstracts"

if not os.path.exists(absdir):
    os.makedirs(absdir)
for eventfile in allEvents:
    #if eventfile !="kuwait_number_Cluster.txt":
    #    continue
    
    if os.path.isdir(RESOURCES_DIR+"old_Clusters/"+"Cluster_Data/"+eventfile):
        continue
Example No. 4
def sentenceCapitalize(sent):
    sentences = sent.split(". ")
    sentences2 = [
        sentence[0].capitalize() + sentence[1:] for sentence in sentences
    ]
    string2 = '. '.join(sentences2)
    return string2
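The helper above only upper-cases the first character of each period-delimited piece; it implicitly assumes every piece is non-empty (a string ending in ". " would raise an IndexError). An illustrative call, using an invented input string:

# Illustrative only; the sample text is an assumption.
print sentenceCapitalize("the bridge collapsed. rescue teams are on site.")
# prints: The bridge collapsed. Rescue teams are on site.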


english_postagger = POSTagger(RESOURCES_DIR +
                              'jars/english-left3words-distsim.tagger',
                              RESOURCES_DIR + 'jars/stanford-postagger.jar',
                              encoding='utf-8')
#langModel=ARPALanguageModel("resources/lm_giga_20k_nvp_3gram.arpa")

stopwords = WGGraph.load_stopwords(RESOURCES_DIR +
                                   "resources/stopwords.en.dat")
lm = kenlm.LanguageModel(RESOURCES_DIR + 'resources/lm-3g.klm')

#numClusters=[25,30,35,40,45,50]
numClusters = [1]
allEvents = os.listdir(RESOURCES_DIR + "old_Clusters/" + "Cluster_Data")
absdir = "abstracts"

if not os.path.exists(absdir):
    os.makedirs(absdir)
for eventfile in allEvents:
    #if eventfile !="kuwait_number_Cluster.txt":
    #    continue

    if os.path.isdir(RESOURCES_DIR + "old_Clusters/" + "Cluster_Data/" +
                     eventfile):
        continue  # skip sub-directories, as in the unformatted listing above
Example No. 5
import os, re, sys
import kenlm, codecs
from sentenceRanker import createDict

reload(sys)  
sys.setdefaultencoding('utf8')


PROJECT_DIR=os.path.dirname(__file__)+"/../../"
print "Project dir", PROJECT_DIR
RESOURCES_DIR=PROJECT_DIR+"resources/"

mainDatafolder=RESOURCES_DIR+"Summarization/"
### THE actual work happens here#####
#english_postagger = POSTagger(RESOURCES_DIR+'jars/english-left3words-distsim.tagger',RESOURCES_DIR+'jars/stanford-postagger.jar', encoding='utf-8')
#langModel=ARPALanguageModel("resources/lm_giga_20k_nvp_3gram.arpa")
stopwords=wg.load_stopwords(RESOURCES_DIR+"resources/stopwords.en.dat")  
lm = kenlm.LanguageModel(RESOURCES_DIR+'resources/lm-3g.klm')

folder_mode = {'Extract':'AIDR_Extract', 'Original':'AIDR_Original'}
rankingModes={"C":"Centroid","TR":"textrank", "CW":"contentWeighing"}


def sentenceCapitalize(sent):
    sentences = sent.split(". ")
    sentences2 = [sentence[0].capitalize() + sentence[1:] for sentence in sentences]
    string2 = '. '.join(sentences2)
    return string2

def tweetCleaner(tweets):
    p=re.compile(r'https?:\/\/.*[\s\r\n]*', re.DOTALL) #Regex to remove http(s) links from tweets
    p2=re.compile(r'(^|\s)#.+?\s', re.DOTALL) #Regex to remove hashtags
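The listing cuts off inside tweetCleaner, so only the two compiled patterns are visible: p is meant to strip links and p2 to strip hashtag tokens. A minimal sketch of how such patterns are typically applied, offered as an assumption rather than the original function body:

# Hypothetical continuation of tweetCleaner; the real body is not shown in the listing.
def cleanOneTweet(tweet):
    tweet = p.sub(' ', tweet)        # drop links
    tweet = p2.sub(' ', tweet)       # drop hashtag tokens
    return ' '.join(tweet.split())   # collapse whitespace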
Example No. 6
import os, re, sys
import kenlm, codecs
from sentenceRanker import createDict

reload(sys)
sys.setdefaultencoding('utf8')

PROJECT_DIR = os.path.dirname(__file__) + "/../../"
print "Project dir", PROJECT_DIR
RESOURCES_DIR = PROJECT_DIR + "resources/"

mainDatafolder = RESOURCES_DIR + "Summarization/"
### THE actual work happens here#####
#english_postagger = POSTagger(RESOURCES_DIR+'jars/english-left3words-distsim.tagger',RESOURCES_DIR+'jars/stanford-postagger.jar', encoding='utf-8')
#langModel=ARPALanguageModel("resources/lm_giga_20k_nvp_3gram.arpa")
stopwords = wg.load_stopwords(RESOURCES_DIR + "resources/stopwords.en.dat")
lm = kenlm.LanguageModel(RESOURCES_DIR + 'resources/lm-3g.klm')

folder_mode = {'Extract': 'AIDR_Extract', 'Original': 'AIDR_Original'}
rankingModes = {"C": "Centroid", "TR": "textrank", "CW": "contentWeighing"}


def sentenceCapitalize(sent):
    sentences = sent.split(". ")
    sentences2 = [
        sentence[0].capitalize() + sentence[1:] for sentence in sentences
    ]
    string2 = '. '.join(sentences2)
    return string2
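Every listing loads the same KenLM 3-gram model into lm before handing it to the ILP solver, which presumably uses it to score the fluency of generated sentences. For reference, a minimal scoring call with the kenlm Python bindings; the model path and test sentence are assumptions:

# Illustrative only; kenlm scores are log10 probabilities.
import kenlm
model = kenlm.LanguageModel('resources/lm-3g.klm')
print model.score('rescue teams are on site')  # higher (less negative) means more fluent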