import util


def makeDoc2vecFile(pos_tweets, outfile, merge):
    """Write one space-separated, POS-filtered tweet per line to outfile.

    Keeps content-bearing tokens by ARK-style Twitter POS tag (A adjective,
    N noun, ^ proper noun, V verb, @ mention, # hashtag), strips the @/#
    prefixes, and drops stopwords.
    """
    stopwords = util.txtTolist("lists/stopwords.txt")
    keep_tags = {"A", "N", "^", "V", "@", "#"}
    y = []
    # Open once in append mode instead of re-opening per tweet.
    with open(outfile, "a+") as f:
        for tweet in pos_tweets:
            # Each token is a (word, POS-tag) pair.
            x = [tok[0].strip("@#") for tok in tweet if tok[1] in keep_tags]
            x = [tok for tok in x if tok not in stopwords]
            y.append(x)
            for xi in x:
                f.write(str(xi) + " ")
            f.write("\n")
    if merge:
        # Flatten into a single, order-preserving list of unique tokens.
        y = list(dict.fromkeys(item for sublist in y for item in sublist))
    return y
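
# The file written by makeDoc2vecFile (one space-separated document per
# line) matches gensim's TaggedLineDocument format. A minimal training
# sketch, assuming gensim is installed; the path "doc2vec_input.txt" and
# the hyperparameters are illustrative, not part of the original code:
#
#   from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument
#
#   docs = TaggedLineDocument("doc2vec_input.txt")  # one tweet per line
#   model = Doc2Vec(docs, vector_size=100, min_count=2, epochs=20)
#   vec = model.infer_vector(["great", "game", "tonight"])  # embed new text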
print "\n EXTRACTING TWITTER DATA \n" util.createFilePath(PATH_TO_RAW_DATA) extract.getTweets(topics,PATH_TO_RAW_DATA,5000) """ print("\n PROCESSING DATA - CLEANING AND POS TAGGING \n") util.createFilePath(PATH_TO_CLEAN_DATA) clean.process(PATH_TO_RAW_DATA,PATH_TO_CLEAN_DATA,topics) print("\n EXTRACTING TOP-K TWEETS FROM DATA \n") for topic in topics : print topic dataFile=PATH_TO_CLEAN_DATA+"/data_"+topic+".txt" tweets=util.txtTolist(dataFile) k=[5,10,25,50,100] for ki in k : topk="TOP_"+str(ki) CosineSimilarityVSM=[] method="ALL_TWEETS" outPath=PATH_TO_RESULTS+"/"+topic+"/"+topk+"/"+method util.createFilePath(outPath) val=measures.entropy(tweets) print(topic+topk+method+" Entropy : "+str(val)) method="RANDOM_TWEETS" outPath=PATH_TO_RESULTS+"/"+topic+"/"+topk+"/"+method
""" print "\n EXTRACTING TWITTER DATA \n" util.createFilePath(PATH_TO_RAW_DATA) extract.getTweets(topics,PATH_TO_RAW_DATA,5000) """ print("\n PROCESSING DATA - CLEANING AND POS TAGGING \n") util.createFilePath(PATH_TO_CLEAN_DATA) clean.process(PATH_TO_RAW_DATA, PATH_TO_CLEAN_DATA, topics) print("\n EXTRACTING TOP-K TWEETS FROM DATA \n") for topic in topics: print topic dataFile = PATH_TO_CLEAN_DATA + "/data_" + topic + ".txt" tweets = util.txtTolist(dataFile) k = [5, 10, 25, 50, 100] for ki in k: topk = "TOP_" + str(ki) CosineSimilarityVSM = [] method = "ALL_TWEETS" outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method util.createFilePath(outPath) val = measures.entropy(tweets) print(topic + topk + method + " Entropy : " + str(val)) method = "RANDOM_TWEETS" outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method

def readInData(filename):
    data = util.txtTolist(filename)
    return data