Code Example #1
File: preprocess.py  Project: sychen1121/sn_crawler
def normGoogleWall(jresult):
    """Normalize a list of Google+ activity pages into post records."""
    posts = list()
    page_count = 0
    if isinstance(jresult, list):
        for page in jresult:
            if page_count > 10:
                # cap the number of pages processed; revise the size in the future
                break
            for post in page["items"]:
                published_time = formatGoogleTime(post["published"])
                place = formatGooglePlace(post.get("location", ""), 2)
                info = post.get("object", "")
                # skip posts that carry no object payload
                if info != "":
                    text = info.get("content", "")
                    urls = getGoogleUrls(info.get("attachments", ""))
                    lang = ut.detectLang(text)
                    text_en = ut.translate(text, lang)
                    sentiment = ut.getSentiment(text_en)
                    topic_distri = ut.getTopic(text_en)
                    tf = ut.wordProcess(text, lang)
                    posts.append(
                        getPost(text, text_en, published_time, place, urls,
                                lang, sentiment, topic_distri, tf))
            page_count += 1
    return posts
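For orientation, here is a minimal, self-contained sketch of the paged activities structure normGoogleWall walks, inferred from the key lookups above. All values are fabricated placeholders, not real API output.

# Hypothetical input shape for normGoogleWall; values are placeholders.
sample_pages = [
    {
        "items": [
            {
                "published": "2015-03-01T12:00:00.000Z",
                "location": "Taipei",  # optional; the code defaults it to ""
                "object": {
                    "content": "hello world",
                    "attachments": [{"url": "http://example.com"}],  # shape assumed
                },
            }
        ]
    }
]

for page in sample_pages:
    for post in page["items"]:
        info = post.get("object", "")
        if info != "":
            print(info.get("content", ""))  # -> hello world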
Code Example #2
File: preprocess.py  Project: imsorry1121/sn_crawler
(Same normGoogleWall implementation as Example #1; this fork differs only in indentation style.)
Code Example #3
def vsm_main(fact, query, k, disable_corrector=False):
    """Run a vector-space-model query and print the ranked results."""
    if query:
        words = wordProcess(query)
        if not disable_corrector:
            # optionally spell-correct each query term first
            words = [fact.corrector.correct(w) for w in words]
        qvector = fact.vsm.query_vector(words)
        if k and k > 0:
            result = fact.vsm.get_topK_list(qvector, k)
        else:
            result = fact.vsm.get_sorted_scores_list(qvector)
        for item in result:
            # item is a (doc_id, score) pair; filedict maps doc ids to file names
            print(item[1], fact.filedict[item[0]])
        print('\033[1;35mTotal\033[0m:', len(result))
    else:
        print('Missing query keywords')
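The function only needs `fact` to expose a corrector, a VSM index, and a doc-id-to-filename map. Below is a hypothetical, self-contained stand-in for that interface, useful for exercising vsm_main without the real index; every class, score, and the toy wordProcess here are assumptions, not the project's actual implementation.

# Hypothetical stubs satisfying the interface vsm_main relies on.
def wordProcess(query):
    return query.lower().split()  # toy tokenizer stand-in

class FakeVSM:
    def query_vector(self, words):
        return words  # placeholder query vector
    def get_topK_list(self, qvector, k):
        return [(0, 0.9), (1, 0.4)][:k]  # (doc_id, score) pairs
    def get_sorted_scores_list(self, qvector):
        return [(0, 0.9), (1, 0.4)]

class FakeCorrector:
    def correct(self, w):
        return w  # identity "correction"

class FakeFact:
    vsm = FakeVSM()
    corrector = FakeCorrector()
    filedict = {0: "doc0.txt", 1: "doc1.txt"}

vsm_main(FakeFact(), "Hello World", k=1)  # prints "0.9 doc0.txt", then the total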
Code Example #4
File: preprocess.py  Project: imsorry1121/sn_crawler
def normTwitterWall(wall):
    """Normalize a list of tweets into post records."""
    posts = list()
    for post in wall:
        text = post.get("text", "")
        # renamed from `time` to avoid shadowing the time module
        published_time = formatTwitterTime(post.get("created_at"))
        place = formatTwitterPlace(post["geo"], 2)
        urls = getTwitterUrls(post)
        lang = post.get("lang", "")
        if lang == "":
            # fall back to language detection when Twitter omits the lang field
            lang = ut.detectLang(text)
        # translate to English before sentiment and topic analysis
        text_en = ut.translate(text, lang)
        sentiment = ut.getSentiment(text_en)
        topic_distri = ut.getTopic(text_en)
        tf = ut.wordProcess(text, lang)
        posts.append(getPost(text, text_en, published_time, place, urls,
                             lang, sentiment, topic_distri, tf))
    return posts
Code Example #5
File: preprocess.py  Project: sychen1121/sn_crawler
(Same normTwitterWall implementation as Example #4; the two forks differ only in whitespace and line wrapping.)
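For reference, a minimal sketch of the tweet fields normTwitterWall reads, inferred from the lookups above; the values are fabricated placeholders, not real API output.

# Hypothetical minimal tweet shape for normTwitterWall.
sample_tweet = {
    "text": "hello from the crawler",
    "created_at": "Mon Mar 02 10:00:00 +0000 2015",
    "geo": None,               # accessed directly, so the key must exist
    "lang": "",                # empty string triggers the ut.detectLang fallback
    "entities": {"urls": []},  # assumption: getTwitterUrls likely reads entities
}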
Code Example #6
def Splite(self, fileString, fileNo):
    """Tokenize a file's text and build a positional index for it.

    Method excerpt from an indexing class; utility and singleList come
    from the surrounding module.
    """
    try:
        all_text = fileString
        lowerWords = utility.wordProcess(all_text)
        dictionary = {}
        address = 0  # token position within the file
        offset = 0   # running character offset (computed but never used)
        for lowerWord in lowerWords:
            if (lowerWord == '' or lowerWord in utility.deleteset
                    or lowerWord in utility.stopset):
                address += 1
                continue
            if lowerWord not in dictionary:
                dictionary[lowerWord] = singleList(fileNo, [address])
            else:
                dictionary[lowerWord].shows.append(address)
            offset += len(lowerWord)
            address += 1
        return dictionary
    except Exception as E:
        print(time.strftime('%Y-%m-%d %H:%M:%S--',
                            time.localtime(time.time())), Exception, ":", E)
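The method builds a classic positional inverted index: each token maps to a posting that carries the file number plus every position where the token appears. Below is a self-contained sketch of the same idea; SingleList, build_index, and the stopword set are hypothetical stand-ins for the project's singleList and utility module.

# Self-contained sketch of the positional index Splite builds.
class SingleList:
    def __init__(self, file_no, shows):
        self.file_no = file_no
        self.shows = shows  # token positions within the file

def build_index(tokens, file_no, stopwords=frozenset()):
    index = {}
    for pos, tok in enumerate(tokens):
        if not tok or tok in stopwords:
            continue  # skip empties and stopwords, but the position is still consumed
        if tok not in index:
            index[tok] = SingleList(file_no, [pos])
        else:
            index[tok].shows.append(pos)
    return index

idx = build_index(["to", "be", "or", "not", "to", "be"], file_no=7, stopwords={"or"})
print(idx["be"].shows)  # -> [1, 5]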
Code Example #7
File: preprocess.py  Project: imsorry1121/sn_crawler
def getStringTag(string):
    # wordProcess returns a term-frequency mapping; its keys serve as the tags
    tokens = list(ut.wordProcess(string, ut.detectLang(string)).keys())
    return tokens
Code Example #8
File: preprocess.py  Project: sychen1121/sn_crawler
(Same getStringTag implementation as Example #7; the forks differ only in indentation.)
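Since wordProcess evidently returns a term-to-frequency dict (it is assigned to tf in the wall normalizers above), the tags are simply its keys. A toy stand-in to illustrate the shape, not the project's real tokenizer:

# Hypothetical stand-in for ut.wordProcess: lowercase, split, count.
def word_process(text, lang):
    counts = {}
    for w in text.lower().split():
        counts[w] = counts.get(w, 0) + 1
    return counts

print(list(word_process("big data big ideas", "en").keys()))  # -> ['big', 'data', 'ideas']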