import util


def makeDoc2vecFile(pos_tweets, outfile, merge):
    """Write one whitespace-separated line of content tokens per tweet to outfile."""
    stopwords = util.txtTolist("lists/stopwords.txt")
    y = []
    with open(outfile, "a+") as f:
        for tweet in pos_tweets:
            # Keep adjectives, common nouns, proper nouns, verbs, at-mentions and
            # hashtags (ARK TweetNLP tags), stripping leading @/# characters.
            x = [tok[0].strip("@#") for tok in tweet
                 if tok[1] in ("A", "N", "^", "V", "@", "#")]
            x = [tok for tok in x if tok not in stopwords]
            y.append(x)
            f.write(" ".join(x) + "\n")
    if merge:
        # Flatten the per-tweet lists into a single deduplicated token list
        # (the dedup check must run against the merged result, not y itself).
        merged = []
        for sublist in y:
            for item in sublist:
                if item not in merged:
                    merged.append(item)
        y = merged
    return y
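The file written by makeDoc2vecFile holds one whitespace-tokenized tweet per line, which is exactly the format gensim's TaggedLineDocument consumes. A minimal training sketch under that assumption (the file name and hyperparameters below are illustrative, not from the original project):

from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

# Each line of the token file becomes one document; its tag is the line number.
docs = TaggedLineDocument("doc2vec_tweets.txt")  # illustrative path

# Modest dimensionality and extra epochs suit a small tweet corpus; tune as needed.
model = Doc2Vec(docs, vector_size=50, min_count=2, epochs=20)

vec = model.dv[0]  # vector for the first tweet in the file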
"""
print "\n  EXTRACTING TWITTER DATA   \n"
util.createFilePath(PATH_TO_RAW_DATA)
extract.getTweets(topics,PATH_TO_RAW_DATA,5000)
"""
print("\n  PROCESSING DATA - CLEANING AND POS TAGGING  \n")
util.createFilePath(PATH_TO_CLEAN_DATA)
clean.process(PATH_TO_RAW_DATA, PATH_TO_CLEAN_DATA, topics)

print("\n EXTRACTING TOP-K TWEETS FROM DATA  \n")

for topic in topics:
    print(topic)
    dataFile = PATH_TO_CLEAN_DATA + "/data_" + topic + ".txt"
    tweets = util.txtTolist(dataFile)

    k = [5, 10, 25, 50, 100]

    for ki in k:
        topk = "TOP_" + str(ki)
        CosineSimilarityVSM = []

        method = "ALL_TWEETS"
        outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method
        util.createFilePath(outPath)
        val = measures.entropy(tweets)
        print(topic + " " + topk + " " + method + " Entropy: " + str(val))

        method = "RANDOM_TWEETS"
        outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method
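measures.entropy is not shown in this listing; given that it is called on the raw tweet list, one plausible reading is the Shannon entropy of the corpus's token distribution. A minimal sketch under that assumption (the tokenization by str.split is a guess):

import math
from collections import Counter

def entropy(tweets):
    # Shannon entropy (in bits) of the token frequency distribution,
    # assuming each tweet is a whitespace-separable string.
    counts = Counter(tok for tweet in tweets for tok in tweet.split())
    total = sum(counts.values())
    return -sum((c / total) * math.log2(c / total) for c in counts.values())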
def readInData(filename):
    # Thin wrapper: load a file into a list of lines via util.txtTolist.
    data = util.txtTolist(filename)
    return data
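Both readInData and the snippets above rely on util.txtTolist, whose implementation is not shown. Its call sites (a stopword list, one tweet per line) suggest it reads a text file into a list of stripped lines; a sketch under that assumption:

def txtTolist(filename):
    # Read a text file into a list, one entry per line, trailing newline removed.
    with open(filename) as f:
        return [line.rstrip("\n") for line in f]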