def getSalientSets(self, lang, mxSetSize=1, AFreqP=0.20, OFreqP=0.0625):
    """Return the words of this article that are salient for the language.

    A word is considered salient when it is rare in the language model:
    its article count is below ``AFreqP * total articles`` AND its overall
    frequency is below ``OFreqP * the language's maximum word frequency``.

    Parameters:
        lang      -- language code; kept for interface compatibility, but the
                     language actually used is ``self.language_model.lang``
        mxSetSize -- maximum word-set size passed to ``self.genWordSets()``
        AFreqP    -- article-count threshold, as a fraction of the language's
                     total article count
        OFreqP    -- frequency threshold, as a fraction of the language's
                     maximum word frequency

    Returns ``[ret, self]`` where ``ret`` is a list of
    ``[word, articleSaliencyScore]`` pairs.
    """
    # NOTE(review): the original body also built a punctuation-stripped
    # ``words`` list and a ``setlen`` counter that were never read anywhere;
    # that dead code has been removed. The loop below is driven entirely by
    # self.genWordSets().
    linfo = LanguageInfoModel_Mongo().getLanguage(self.language_model.lang)
    AFreq = linfo["articleCount"] * AFreqP
    OFreq = linfo["maxFreq"] * OFreqP
    ret = []
    # Hoisted out of the loop: vocabulary size is invariant while iterating.
    vocab_size = float(len(self.words))
    for w in self.genWordSets(mxSetSize):
        wdata = self.language_model.getWord(w[0])
        if wdata is not None:  # 'is not None' rather than '!= None'
            wAFreq = wdata.articleCount()
            wOFreq = wdata.getFreq()
            # Higher score = rarer within this article.
            articleSaliencyScore = 1 - float(self.words[w[0]].freq) / vocab_size
            if wOFreq < OFreq and wAFreq < AFreq:
                ret.append([w[0], articleSaliencyScore])
    return [ret, self]
def generate_model(lang, sites, mxParse=-1, mxSetSize=3): model = LanguageModel(lang) mongo = LanguageModel_Mongo("", lang, None) parsed = 0 articleDB = ArticleDB() while (parsed < mxParse or (mxParse == -1 and parsed < articleDB.count())): a = articleDB.get(index=parsed) txt = "" #' '.join(a.get('text','')) adate = ' '.join(a.get('time', '')) url = "" #''.join(a.get('url','')) atitle = "" if isinstance(a.get('url', []), list): url = ' '.join(a.get('url', '')) elif isinstance(a.get('url', ""), basestring): url = a.get('url', "") if isinstance(a.get('text', []), list): txt = ' '.join(a.get('text', '')) elif isinstance(a.get('text', ""), basestring): txt = a.get('text', "") if isinstance(a.get('title', []), list): atitle = ' '.join(a.get('title', '')) elif isinstance(a.get('title', ""), basestring): atitle = a.get('title', "") for s in sites: if s in url: a = Article(text=txt, title=atitle, src=url, date=adate, nid=a['_id'], language_model=model) a.analyze(mxSetSize) parsed += 1 print "Parsed ", parsed, " Articles. Inserting into Database" mongo.collection.drop() for k, w in model.words.iteritems(): mongo.__process_word__(w) #Update Language Info langInfo = LanguageInfoModel_Mongo() keys = sorted(model.words.keys()) freq = model.getWordsByFrequency() langInfo.updateLanguage(lang, parsed, len(model.words.keys()), sorted(freq.keys())[len(freq) - 1], sites) return mongo
def generate_model(lang, sites, mxParse=-1, mxSetSize=3): model = LanguageModel(lang) mongo = LanguageModel_Mongo("", lang, None) parsed = 0 articleDB = ArticleDB() while (parsed < mxParse or (mxParse == -1 and parsed < articleDB.count())): a = articleDB.get(index=parsed) txt = ""#' '.join(a.get('text','')) adate = ' '.join(a.get('time','')) url = ""#''.join(a.get('url','')) atitle = "" if isinstance(a.get('url', []), list): url = ' '.join(a.get('url','')) elif isinstance(a.get('url', ""), basestring): url = a.get('url', "") if isinstance(a.get('text', []), list): txt = ' '.join(a.get('text','')) elif isinstance(a.get('text', ""), basestring): txt = a.get('text', "") if isinstance(a.get('title', []), list): atitle = ' '.join(a.get('title','')) elif isinstance(a.get('title', ""), basestring): atitle = a.get('title', "") for s in sites: if s in url: a = Article(text=txt, title=atitle, src=url, date=adate, nid=a['_id'], language_model=model) a.analyze(mxSetSize) parsed += 1 print "Parsed ", parsed, " Articles. Inserting into Database" mongo.collection.drop() for k, w in model.words.iteritems(): mongo.__process_word__(w) #Update Language Info langInfo = LanguageInfoModel_Mongo() keys = sorted(model.words.keys()) freq = model.getWordsByFrequency() langInfo.updateLanguage(lang, parsed, len(model.words.keys()), sorted(freq.keys())[len(freq)-1], sites) return mongo