コード例 #1
0
ファイル: preprocess.py プロジェクト: imsorry1121/sn_crawler
def normGoogleWall(jresult):
    """Normalize a Google+ wall API response into a list of post dicts.

    *jresult* is expected to be a list of result pages, each with an
    "items" list of raw posts.  Any non-list input yields [].

    For each post we extract the published time, place, text and URLs,
    detect the language, translate to English, then run sentiment,
    topic and term-frequency analysis before packing everything via
    getPost().

    Returns:
        list: normalized post dicts produced by getPost().
    """
    posts = []
    # isinstance is the idiomatic type check (also handles subclasses).
    if isinstance(jresult, list):
        for page_count, page in enumerate(jresult):
            # Process pages 0..10 inclusive, matching the original
            # counter logic.  TODO: revise the size in the future.
            if page_count > 10:
                break
            for post in page["items"]:
                published_time = formatGoogleTime(post["published"])
                place = formatGooglePlace(post.get("location", ""), 2)
                # "object" holds the post payload; "" marks its absence.
                info = post.get("object", "")
                if info != "":
                    text = info.get("content", "")
                    urls = getGoogleUrls(info.get("attachments", ""))
                    lang = ut.detectLang(text)
                    text_en = ut.translate(text, lang)
                    sentiment = ut.getSentiment(text_en)
                    topic_distri = ut.getTopic(text_en)
                    tf = ut.wordProcess(text, lang)
                    posts.append(getPost(text, text_en, published_time,
                                         place, urls, lang, sentiment,
                                         topic_distri, tf))
    return posts
コード例 #2
0
ファイル: preprocess.py プロジェクト: sychen1121/sn_crawler
def normGoogleWall(jresult):
    """Normalize a Google+ wall API response into a list of post dicts.

    *jresult* is expected to be a list of result pages, each with an
    "items" list of raw posts.  Any non-list input yields [].

    For each post we extract the published time, place, text and URLs,
    detect the language, translate to English, then run sentiment,
    topic and term-frequency analysis before packing everything via
    getPost().

    Returns:
        list: normalized post dicts produced by getPost().
    """
    posts = []
    # isinstance is the idiomatic type check (also handles subclasses).
    if isinstance(jresult, list):
        for page_count, page in enumerate(jresult):
            # Process pages 0..10 inclusive, matching the original
            # counter logic.  TODO: revise the size in the future.
            if page_count > 10:
                break
            for post in page["items"]:
                published_time = formatGoogleTime(post["published"])
                place = formatGooglePlace(post.get("location", ""), 2)
                # "object" holds the post payload; "" marks its absence.
                info = post.get("object", "")
                if info != "":
                    text = info.get("content", "")
                    urls = getGoogleUrls(info.get("attachments", ""))
                    lang = ut.detectLang(text)
                    text_en = ut.translate(text, lang)
                    sentiment = ut.getSentiment(text_en)
                    topic_distri = ut.getTopic(text_en)
                    tf = ut.wordProcess(text, lang)
                    posts.append(getPost(text, text_en, published_time,
                                         place, urls, lang, sentiment,
                                         topic_distri, tf))
    return posts
コード例 #3
0
ファイル: preprocess.py プロジェクト: imsorry1121/sn_crawler
def normTwitterWall(wall):
    """Normalize a list of raw Twitter posts into getPost() dicts.

    For each post we extract text, creation time, place and URLs,
    detect the language when Twitter did not supply one, translate to
    English, then run sentiment, topic and term-frequency analysis.

    Args:
        wall: iterable of raw Twitter post dicts.

    Returns:
        list: normalized post dicts produced by getPost().
    """
    posts = []
    for post in wall:
        text = post.get("text", "")
        # Renamed from `time` to avoid shadowing the stdlib time module
        # and to match the naming used in normGoogleWall.
        published_time = formatTwitterTime(post.get("created_at"))
        place = formatTwitterPlace(post["geo"], 2)
        urls = getTwitterUrls(post)
        lang = post.get("lang", "")
        if lang == "":
            # Fall back to our own detection when Twitter omits the
            # language field.
            lang = ut.detectLang(text)
        text_en = ut.translate(text, lang)
        sentiment = ut.getSentiment(text_en)
        topic_distri = ut.getTopic(text_en)
        tf = ut.wordProcess(text, lang)
        posts.append(getPost(text, text_en, published_time, place, urls,
                             lang, sentiment, topic_distri, tf))
    return posts
コード例 #4
0
ファイル: preprocess.py プロジェクト: sychen1121/sn_crawler
def normTwitterWall(wall):
    """Normalize a list of raw Twitter posts into getPost() dicts.

    For each post we extract text, creation time, place and URLs,
    detect the language when Twitter did not supply one, translate to
    English, then run sentiment, topic and term-frequency analysis.

    Args:
        wall: iterable of raw Twitter post dicts.

    Returns:
        list: normalized post dicts produced by getPost().
    """
    posts = []
    for post in wall:
        text = post.get("text", "")
        # Renamed from `time` to avoid shadowing the stdlib time module
        # and to match the naming used in normGoogleWall.
        published_time = formatTwitterTime(post.get("created_at"))
        place = formatTwitterPlace(post["geo"], 2)
        urls = getTwitterUrls(post)
        lang = post.get("lang", "")
        if lang == "":
            # Fall back to our own detection when Twitter omits the
            # language field.
            lang = ut.detectLang(text)
        text_en = ut.translate(text, lang)
        sentiment = ut.getSentiment(text_en)
        topic_distri = ut.getTopic(text_en)
        tf = ut.wordProcess(text, lang)
        posts.append(getPost(text, text_en, published_time, place, urls,
                             lang, sentiment, topic_distri, tf))
    return posts