# Single-argument form so the statement parses under both Python 2 and
# Python 3; the printed text is unchanged ("Project dir <path>").
print("Project dir %s" % PROJECT_DIR)
RESOURCES_DIR = PROJECT_DIR + "/" + "resources/"


### THE actual work happens here#####
def sentenceCapitalize(sent):
    """Capitalize the first character of every ". "-separated piece of sent.

    The text is split on the literal ". " delimiter, the first character of
    each piece is run through ``capitalize()``, the remainder of the piece is
    left untouched, and the pieces are joined back together with ". ".

    Empty pieces (an empty input string, or text ending in ". ") are passed
    through unchanged instead of raising IndexError.

    :param sent: text to process.
    :return: the text with each piece's first character capitalized.
    """
    pieces = sent.split(". ")
    # Slice instead of index: sentence[:1] is "" for an empty piece, whereas
    # sentence[0] would raise IndexError.
    capitalized = [sentence[:1].capitalize() + sentence[1:] for sentence in pieces]
    return '. '.join(capitalized)


# POS tagger built from the model file and jar under RESOURCES_DIR/jars.
english_postagger = POSTagger(RESOURCES_DIR + 'jars/english-left3words-distsim.tagger',
                              RESOURCES_DIR + 'jars/stanford-postagger.jar',
                              encoding='utf-8')
#langModel=ARPALanguageModel("resources/lm_giga_20k_nvp_3gram.arpa")
stopwords = WGGraph.load_stopwords(RESOURCES_DIR + "resources/stopwords.en.dat")
lm = kenlm.LanguageModel(RESOURCES_DIR + 'resources/lm-3g.klm')
#numClusters=[25,30,35,40,45,50]
numClusters = [1]
allEvents = os.listdir(RESOURCES_DIR + "old_Clusters/" + "Cluster_Data")

# Make sure the output directory exists before the loop starts.
absdir = "abstracts"
if not os.path.exists(absdir):
    os.makedirs(absdir)

for eventfile in allEvents:
    #if eventfile !="kuwait_number_Cluster.txt":
    #    continue
    # Only plain entries are processed; sub-directories are skipped.
    if os.path.isdir(RESOURCES_DIR + "old_Clusters/" + "Cluster_Data/" + eventfile):
        continue
import kenlm
import codecs
from sentenceRanker import createDict

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

PROJECT_DIR = os.path.dirname(__file__) + "/../../"
# Single-argument form so the statement parses under both Python 2 and
# Python 3; the printed text is unchanged ("Project dir <path>").
print("Project dir %s" % PROJECT_DIR)
RESOURCES_DIR = PROJECT_DIR + "resources/"
mainDatafolder = RESOURCES_DIR + "Summarization/"

### THE actual work happens here#####
#english_postagger = POSTagger(RESOURCES_DIR+'jars/english-left3words-distsim.tagger',RESOURCES_DIR+'jars/stanford-postagger.jar', encoding='utf-8')
#langModel=ARPALanguageModel("resources/lm_giga_20k_nvp_3gram.arpa")
stopwords = wg.load_stopwords(RESOURCES_DIR + "resources/stopwords.en.dat")
lm = kenlm.LanguageModel(RESOURCES_DIR + 'resources/lm-3g.klm')

folder_mode = {'Extract': 'AIDR_Extract', 'Original': 'AIDR_Original'}
rankingModes = {"C": "Centroid", "TR": "textrank", "CW": "contentWeighing"}


def sentenceCapitalize(sent):
    """Capitalize the first character of every ". "-separated piece of sent.

    The text is split on the literal ". " delimiter, the first character of
    each piece is run through ``capitalize()``, the remainder of the piece is
    left untouched, and the pieces are joined back together with ". ".

    Empty pieces (an empty input string, or text ending in ". ") are passed
    through unchanged instead of raising IndexError.

    :param sent: text to process.
    :return: the text with each piece's first character capitalized.
    """
    pieces = sent.split(". ")
    # Slice instead of index: sentence[:1] is "" for an empty piece, whereas
    # sentence[0] would raise IndexError.
    capitalized = [sentence[:1].capitalize() + sentence[1:] for sentence in pieces]
    return '. '.join(capitalized)


def tweetCleaner(tweets):
    # NOTE(review): only the first two statements of this function are
    # visible here; they are reproduced unchanged and the rest is not shown.
    # NOTE(review): in `http?` only the final "p" is optional, so the pattern
    # matches "http://..." but not "https://..." -- confirm that is intended.
    p = re.compile(r'http?:\/\/.*[\s\r\n]*', re.DOTALL)  #Regex to remove http from tweets
    # Matches a '#'-prefixed token (at string start or after whitespace) up to
    # the next whitespace character.
    p2 = re.compile(r'(^|\s)#.+?\s', re.DOTALL)  #Regex
def sentenceCapitalize(sent): sentences = sent.split(". ") sentences2 = [ sentence[0].capitalize() + sentence[1:] for sentence in sentences ] string2 = '. '.join(sentences2) return string2 english_postagger = POSTagger(RESOURCES_DIR + 'jars/english-left3words-distsim.tagger', RESOURCES_DIR + 'jars/stanford-postagger.jar', encoding='utf-8') #langModel=ARPALanguageModel("resources/lm_giga_20k_nvp_3gram.arpa") stopwords = WGGraph.load_stopwords(RESOURCES_DIR + "resources/stopwords.en.dat") lm = kenlm.LanguageModel(RESOURCES_DIR + 'resources/lm-3g.klm') #numClusters=[25,30,35,40,45,50] numClusters = [1] allEvents = os.listdir(RESOURCES_DIR + "old_Clusters/" + "Cluster_Data") absdir = "abstracts" if not os.path.exists(absdir): os.makedirs(absdir) for eventfile in allEvents: #if eventfile !="kuwait_number_Cluster.txt": # continue if os.path.isdir(RESOURCES_DIR + "old_Clusters/" + "Cluster_Data/" + eventfile):
import re
import sys
import kenlm
import codecs
from sentenceRanker import createDict

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

PROJECT_DIR = os.path.dirname(__file__) + "/../../"
# Single-argument form so the statement parses under both Python 2 and
# Python 3; the printed text is unchanged ("Project dir <path>").
print("Project dir %s" % PROJECT_DIR)
RESOURCES_DIR = PROJECT_DIR + "resources/"
mainDatafolder = RESOURCES_DIR + "Summarization/"

### THE actual work happens here#####
#english_postagger = POSTagger(RESOURCES_DIR+'jars/english-left3words-distsim.tagger',RESOURCES_DIR+'jars/stanford-postagger.jar', encoding='utf-8')
#langModel=ARPALanguageModel("resources/lm_giga_20k_nvp_3gram.arpa")
stopwords = wg.load_stopwords(RESOURCES_DIR + "resources/stopwords.en.dat")
lm = kenlm.LanguageModel(RESOURCES_DIR + 'resources/lm-3g.klm')

folder_mode = {'Extract': 'AIDR_Extract', 'Original': 'AIDR_Original'}
rankingModes = {"C": "Centroid", "TR": "textrank", "CW": "contentWeighing"}


def sentenceCapitalize(sent):
    """Capitalize the first character of every ". "-separated piece of sent.

    The text is split on the literal ". " delimiter, the first character of
    each piece is run through ``capitalize()``, the remainder of the piece is
    left untouched, and the pieces are joined back together with ". ".

    Empty pieces (an empty input string, or text ending in ". ") are passed
    through unchanged instead of raising IndexError.

    :param sent: text to process.
    :return: the text with each piece's first character capitalized.
    """
    pieces = sent.split(". ")
    # Slice instead of index: sentence[:1] is "" for an empty piece, whereas
    # sentence[0] would raise IndexError.
    capitalized = [sentence[:1].capitalize() + sentence[1:] for sentence in pieces]
    return '. '.join(capitalized)