Example #1
import csv
import os

from gensim.models import doc2vec

# `tagger` (the ARK TweetNLP wrapper) and makeDoc2vecFile are project-local.


def writeDoc2vecSimMatrix(outfile, allTweets, results, create):
    if create:
        # Tokenize/POS-tag the tweets and write them one per line in the
        # plain-token format doc2vec expects.
        outfile1 = os.path.dirname(outfile) + "/Doc2vecModelTokens.txt"
        pos_tweets = tagger.runtagger_parse(allTweets)
        tokens = makeDoc2vecFile(pos_tweets, outfile1, False)
        # TaggedLineDocument treats each line of the file as one document.
        sentence = doc2vec.TaggedLineDocument(outfile1)
        # Train the doc2vec model (gensim < 4.0 API; `size` is called
        # `vector_size` from gensim 4.0 on).
        model = doc2vec.Doc2Vec(sentence,
                                size=100,
                                window=300,
                                min_count=10,
                                workers=4)
        model_name = os.path.dirname(outfile) + "/Doc2vecModel.txt"
        model.save(model_name)
    else:
        # Reuse the previously trained model.
        model_name = os.path.dirname(outfile) + "/Doc2vecModel.txt"
        model = doc2vec.Doc2Vec.load(model_name)
    # One CSV row per tweet: its similarity to each tweet in `results`.
    result_indices = [allTweets.index(result) for result in results]
    with open(outfile, "a", newline="") as f:  # newline="" per the csv docs
        writeFile = csv.writer(f)
        for i in range(len(allTweets)):
            # model.docvecs was renamed model.dv in gensim >= 4.0.
            x = [str(model.docvecs.similarity(i, k)) for k in result_indices]
            writeFile.writerow(x)
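A minimal usage sketch for the function above; the tweet list, results subset, and output path are hypothetical, and in practice allTweets needs to be a reasonably large corpus, since min_count=10 discards rare tokens:

allTweets = ["great phone battery", "camera is blurry", "battery lasts long"]
results = ["great phone battery"]  # subset of allTweets to rank against
# create=True trains and saves the model; create=False reloads it.
writeDoc2vecSimMatrix("results/Doc2vecSim.csv", allTweets, results, True)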
Example #2
# util, tagger, and the cleaning helpers below are project-local modules.


def process(inPath, outPath, topics):
    for topic in topics:
        # Each csv row comes back as a one-element list, so strip the
        # "['...']" wrapper to recover the raw tweet text.
        inFile = inPath + '/' + topic + ".csv"
        tweets = util.csvTolist(inFile)
        tweets = [str(tweet).strip("[']") for tweet in tweets]

        print("No. of Tweets extracted " + str(topic) + "\t\t\t" + str(len(tweets)))

        # Clean the tweets: lowercase, remove repetition and newlines, and
        # filter against the topic string.
        tweets = make_lowercase(tweets)
        tweets = remove_repetition(tweets)
        tweets = remove_newline(tweets)
        tweets = if_not_topic(tweets, topic.lower())

        # POS-tag the tweets; output shape:
        # [[[tw1_token1, postag, confidence], [tw1_token2, postag, confidence]],
        #  [[tw2_token1, postag, confidence]]]
        # then normalize everything except URLs and re-tag the result.
        pos_tweets = tagger.runtagger_parse(tweets)
        tweets = common_except_url(pos_tweets)
        pos_tweets = tagger.runtagger_parse(tweets)

        print("No. of Tweets after cleaning: " + str(topic) + "\t\t\t" + str(len(tweets)))

        # Persist the cleaned tweets and their POS tags.
        outFile = outPath + '/data_' + topic + ".txt"
        util.listTotxt(outFile, tweets, "w+")
        outFile = outPath + '/POS_' + topic + ".csv"
        util.listTocsv(outFile, pos_tweets, "w+")
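A sketch of driving `process`; the directory names and topic list are assumptions, and it expects `<inPath>/<topic>.csv` inputs from this project's extraction step:

topics = ["iphone", "samsung"]  # hypothetical topic names
process("data/raw", "data/clean", topics)
# writes data/clean/data_<topic>.txt and data/clean/POS_<topic>.csv per topic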
Example #4
# util, ranking, error, and algo are project-local modules.


def GreedyAspectRanking(outfile, tweets, topic, k):
    # POS-tag the tweets and extract the aspects mentioned in each one,
    # e.g. [[asp1, asp2], [], [asp1]].
    pos_tweets = tagger.runtagger_parse(tweets)
    aspects_tweet = get_aspect(pos_tweets)

    # Earlier frequency-based aspect selection, kept for reference:
    # aspect_freq = ranking.get_freq(aspects_tweet)
    # aspect_freq = sorted(aspect_freq, key=lambda x: int(x[1]), reverse=True)
    # aspect_freq = error.correct(aspect_freq)
    # aspects_sel = util.filter_rlist(aspect_freq, 10, 1)
    # util.listTocsv(outfile1, aspects_sel)
    # aspects = util.listfromlist(aspects_sel, 0)
    # aspect_hits = ranking.pmi_list(aspects, topic, "results/pmi_" + topic + ".csv")

    # Rank aspects by their precomputed PMI scores, filter the ranked
    # list, and take the aspect names (column 0).
    aspect_hits = util.csvTolist("results/pmi_" + topic + ".csv")
    aspect_hits = sorted(aspect_hits, key=lambda x: float(x[1]), reverse=True)
    # util.listTocsv(outfile, aspect_hits)
    asp_hits = util.filter_rlist(aspect_hits, 6, 1)
    aspects1 = util.listfromlist(asp_hits, 0)

    # Greedily pick k tweets covering the selected aspects.
    results = algo.GreedyNormal(outfile, aspects_tweet, aspects1, tweets, k)
    return results
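A sketch of calling `GreedyAspectRanking`; the tweets and paths are hypothetical, and as written the function expects a precomputed results/pmi_<topic>.csv file of (aspect, PMI score) rows:

tweets = ["battery life is great", "screen cracks too easily", "love the camera"]
summary = GreedyAspectRanking("results/greedy_iphone.csv", tweets, "iphone", 2)
print(summary)  # the k tweets chosen to cover the top-ranked aspects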