def normGoogleWall(jresult):
    """Normalize a Google+ activities result (a list of pages) into post records."""
    posts = list()
    page_count = 0
    if isinstance(jresult, list):
        for page in jresult:
            if page_count > 10:  # revise the size in the future
                break
            for post in page["items"]:
                published_time = formatGoogleTime(post["published"])
                place = formatGooglePlace(post.get("location", ""), 2)
                info = post.get("object", "")
                if info != "":
                    text = info.get("content", "")
                    urls = getGoogleUrls(info.get("attachments", ""))
                    lang = ut.detectLang(text)
                    text_en = ut.translate(text, lang)    # translate to English
                    sentiment = ut.getSentiment(text_en)  # sentiment on the translation
                    topic_distri = ut.getTopic(text_en)   # topic distribution
                    tf = ut.wordProcess(text, lang)       # term frequencies
                    posts.append(getPost(text, text_en, published_time, place, urls,
                                         lang, sentiment, topic_distri, tf))
            page_count += 1
    return posts
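# A minimal usage sketch for normGoogleWall, assuming the Google+ activities
# shape implied by the code above: a list of result pages, each holding an
# "items" list whose entries carry "published", an optional "location", and an
# "object" with "content" and "attachments". The sample values are hypothetical.
sample_pages = [{
    "items": [{
        "published": "2014-03-01T12:00:00.000Z",
        "location": "",
        "object": {
            "content": "Hello from Google+!",
            "attachments": [{"url": "http://example.com/article"}],
        },
    }],
}]
# posts = normGoogleWall(sample_pages)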
def vsm_main(fact, query, k, disable_corrector=False):
    """Query the vector space model and print the top-k matching files."""
    if query:
        words = wordProcess(query)
        if not disable_corrector:
            # spell-correct each query term before scoring
            words = [fact.corrector.correct(w) for w in words]
        qvector = fact.vsm.query_vector(words)
        result = (fact.vsm.get_topK_list(qvector, k) if k and k > 0
                  else fact.vsm.get_sorted_scores_list(qvector))
        for item in result:
            print(item[1], fact.filedict[item[0]])
        print('\033[1;35mTotal\033[0m:', len(result))
    else:
        print('Missing query keywords')
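# Usage sketch for vsm_main. From the call sites above, `fact` is assumed to
# bundle the index artifacts: `fact.vsm` (query_vector / get_topK_list /
# get_sorted_scores_list), `fact.corrector` (a spell corrector with .correct),
# and `fact.filedict` (doc id -> file name). The loader name is hypothetical:
#
#   fact = load_index_factory("index.pkl")      # hypothetical loader
#   vsm_main(fact, "informaton retrieval", 10)  # misspelling fixed by the corrector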
def normTwitterWall(wall):
    """Normalize a list of raw tweets into post records."""
    posts = list()
    for post in wall:
        text = post.get("text", "")
        published_time = formatTwitterTime(post.get("created_at"))
        place = formatTwitterPlace(post["geo"], 2)
        urls = getTwitterUrls(post)
        lang = post.get("lang", "")
        if lang == "":
            lang = ut.detectLang(text)
        # translate text, then derive sentiment/topic features from the English version
        text_en = ut.translate(text, lang)
        sentiment = ut.getSentiment(text_en)
        topic_distri = ut.getTopic(text_en)
        tf = ut.wordProcess(text, lang)
        posts.append(getPost(text, text_en, published_time, place, urls, lang,
                             sentiment, topic_distri, tf))
    return posts
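# A minimal usage sketch for normTwitterWall. Each tweet dict needs "text",
# "created_at", a "geo" key (indexed directly, so it must exist, even if None),
# and optionally "lang". The sample values are hypothetical placeholders.
sample_wall = [{
    "text": "Bonjour le monde",
    "created_at": "Sat Mar 01 12:00:00 +0000 2014",
    "geo": None,
    "lang": "",  # empty string triggers language detection on the text
}]
# posts = normTwitterWall(sample_wall)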
def Splite(self, fileString, fileNo):
    """Tokenize `fileString` and build a term -> posting-list dictionary for file `fileNo`."""
    try:
        all_text = fileString
        lowerWords = utility.wordProcess(all_text)
        dictionary = {}
        address = 0  # token position within the document
        for lowerWord in lowerWords:
            # skip empty tokens, deleted characters, and stopwords
            if (lowerWord == '' or lowerWord in utility.deleteset
                    or lowerWord in utility.stopset):
                address += 1
                continue
            if lowerWord not in dictionary:
                dictionary[lowerWord] = singleList(fileNo, [address])
            else:
                dictionary[lowerWord].shows.append(address)
            address += 1
        return dictionary
    except Exception as E:
        print(time.strftime('%Y-%m-%d %H:%M:%S--', time.localtime(time.time())), Exception, ":", E)
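# Splite builds an in-memory inverted index: each term maps to a posting entry
# that records the file number and the token positions where the term occurs.
# singleList is defined elsewhere in the repo; this stand-in is a sketch that
# matches how Splite uses it (constructor arguments and a `.shows` list):
class singleList(object):
    def __init__(self, fileNo, shows):
        self.fileNo = fileNo  # document this posting belongs to
        self.shows = shows    # token positions (addresses) of the term in it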
def getStringTag(string):
    """Return the tokens of `string` after language detection and word processing."""
    tokens = list(ut.wordProcess(string, ut.detectLang(string)).keys())
    return tokens
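# Illustrative call (exact tokens depend on ut.wordProcess's stoplist and the
# detected language; the output below is a plausible example, not guaranteed):
#
#   getStringTag("machine learning is fun")  # -> ['machine', 'learning', 'fun']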