def writeDoc2vecSimMatrix(outfile, allTweets, results, create):
    """Append a doc2vec similarity matrix to *outfile* as CSV rows.

    One row is written per tweet in ``allTweets``; each column is the
    doc2vec similarity between that tweet and one of the tweets in
    ``results``.

    Args:
        outfile: Path of the CSV file to append rows to; the model files
            are stored next to it (same directory).
        allTweets: Full list of tweet strings (row order of the matrix).
        results: Subset of ``allTweets`` selecting the matrix columns.
        create: If True, train and save a new Doc2Vec model from
            ``allTweets``; otherwise load the previously saved model.
    """
    model_name = os.path.dirname(outfile) + "/Doc2vecModel.txt"
    if create:
        outfile1 = os.path.dirname(outfile) + "/Doc2vecModelTokens.txt"
        pos_tweets = tagger.runtagger_parse(allTweets)  # tokenizer and POS-tagger
        makeDoc2vecFile(pos_tweets, outfile1, False)  # writes one tweet per line
        sentence = doc2vec.TaggedLineDocument(outfile1)  # imports in doc2vec format
        # NOTE(review): `size` was renamed `vector_size` in gensim >= 4.0 —
        # keep the old keyword here to match whatever gensim this project pins.
        model = doc2vec.Doc2Vec(sentence, size=100, window=300,
                                min_count=10, workers=4)  # makes doc2vec model
        model.save(model_name)
    else:
        model = doc2vec.Doc2Vec.load(model_name)
    # The column indices do not depend on the row, so resolve the O(n)
    # list.index() lookups once instead of once per row.
    result_ids = [allTweets.index(result) for result in results]
    # Open the output file once rather than re-opening it for every row.
    with open(outfile, "a+") as f:
        writer = csv.writer(f)
        for i in range(len(allTweets)):
            writer.writerow(
                [str(model.docvecs.similarity(i, k)) for k in result_ids])
def process(inPath, outPath, topics):
    """Clean and POS-tag the raw tweet CSV for each topic.

    For every topic, reads ``<inPath>/<topic>.csv``, normalises the
    tweets, POS-tags them, and writes the cleaned text to
    ``<outPath>/data_<topic>.txt`` and the tags to
    ``<outPath>/POS_<topic>.csv``.
    """
    for topic in topics:
        rows = util.csvTolist(inPath + '/' + topic + ".csv")
        cleaned = [str(row).strip("[']") for row in rows]
        print("No. of Tweets extracted " + str(topic) + "\t\t\t" + str(len(cleaned)))
        # Normalisation pipeline, applied in this exact order.
        for step in (make_lowercase, remove_repetition, remove_newline):
            cleaned = step(cleaned)
        cleaned = if_not_topic(cleaned, topic.lower())
        # First tagging pass feeds the URL-aware cleanup; the second pass
        # tags the final cleaned text.
        # Tag shape: [[[tw1_token1,postag,confidence],...],[[tw2_token1,...]]]
        tagged = tagger.runtagger_parse(cleaned)
        cleaned = common_except_url(tagged)
        tagged = tagger.runtagger_parse(cleaned)
        print("No. of Tweets after cleaning :" + str(topic) + "\t\t\t" + str(len(cleaned)))
        util.listTotxt(outPath + '/data_' + topic + ".txt", cleaned, "w+")
        util.listTocsv(outPath + '/POS_' + topic + ".csv", tagged, "w+")
def writeDoc2vecSimMatrix(outfile, allTweets, results, create):
    """Append a doc2vec similarity matrix to *outfile* as CSV rows.

    NOTE(review): this is a byte-for-byte duplicate definition of the
    function earlier in this file; being defined later, it shadows that
    one. Consider deleting one of the two copies.

    Args:
        outfile: Path of the CSV file to append rows to; model files are
            stored in the same directory.
        allTweets: Full list of tweet strings (row order of the matrix).
        results: Subset of ``allTweets`` selecting the matrix columns.
        create: If True, train and save a new Doc2Vec model; otherwise
            load the previously saved one.
    """
    model_name = os.path.dirname(outfile) + "/Doc2vecModel.txt"
    if create:
        outfile1 = os.path.dirname(outfile) + "/Doc2vecModelTokens.txt"
        pos_tweets = tagger.runtagger_parse(allTweets)  # tokenizer and POS-tagger
        makeDoc2vecFile(pos_tweets, outfile1, False)  # writes one tweet per line
        sentence = doc2vec.TaggedLineDocument(outfile1)  # imports in doc2vec format
        # NOTE(review): `size` became `vector_size` in gensim >= 4.0; kept
        # as-is to match the gensim version this project pins.
        model = doc2vec.Doc2Vec(sentence, size=100, window=300,
                                min_count=10, workers=4)  # makes doc2vec model
        model.save(model_name)
    else:
        model = doc2vec.Doc2Vec.load(model_name)
    # Column indices are loop-invariant: resolve each O(n) list.index()
    # once, not once per row.
    column_ids = [allTweets.index(result) for result in results]
    # Open the output once instead of re-opening it for every row.
    with open(outfile, "a+") as f:
        writer = csv.writer(f)
        for i in range(len(allTweets)):
            writer.writerow(
                [str(model.docvecs.similarity(i, k)) for k in column_ids])
def GreedyAspectRanking(outfile, tweets, topic, k):
    """Greedily select ``k`` representative tweets by aspect coverage.

    POS-tags the tweets, extracts per-tweet aspects, ranks candidate
    aspects by their precomputed PMI scores (read from
    ``results/pmi_<topic>.csv``), and hands the top aspects to the greedy
    selection algorithm.

    Args:
        outfile: Output path forwarded to ``algo.GreedyNormal``.
        tweets: List of tweet strings to select from.
        topic: Topic name used to locate the PMI score file.
        k: Number of tweets to select.

    Returns:
        Whatever ``algo.GreedyNormal`` returns (the selected tweets).
    """
    pos_tweets = tagger.runtagger_parse(tweets)
    aspects_tweet = get_aspect(pos_tweets)  # tweetwise aspects [[asp1,asp2],[],[asp1]]
    # NOTE(review): a frequency-based aspect-selection path (and a live
    # PMI computation via ranking.pmi_list) used to be inlined here as a
    # dead triple-quoted string; removed in favour of the precomputed PMI
    # scores loaded below.
    aspect_hits = util.csvTolist("results/pmi_" + topic + ".csv")
    aspect_hits = sorted(aspect_hits, key=lambda row: float(row[1]), reverse=True)
    # Presumably keeps the highest-scoring aspect rows (threshold/count 6,
    # column 1) — confirm against util.filter_rlist.
    asp_hits = util.filter_rlist(aspect_hits, 6, 1)
    top_aspects = util.listfromlist(asp_hits, 0)  # aspect names only
    return algo.GreedyNormal(outfile, aspects_tweet, top_aspects, tweets, k)
def GreedyAspectRanking(outfile, tweets, topic, k):
    """Greedily pick ``k`` representative tweets via aspect coverage.

    Tags the tweets, extracts each tweet's aspects, ranks aspects by
    precomputed PMI scores from ``results/pmi_<topic>.csv``, and runs the
    greedy selection over the top-ranked aspects.
    """
    tagged = tagger.runtagger_parse(tweets)
    per_tweet_aspects = get_aspect(tagged)  # tweetwise aspects [[asp1,asp2],[],[asp1]]
    # Disabled frequency-based selection path, kept verbatim (no-op string).
    """ aspect_freq=ranking.get_freq(aspects_tweet) aspect_freq=sorted(aspect_freq,key=lambda x: int(x[1]),reverse=True) aspect_freq=error.correct(aspect_freq) aspects_sel=util.filter_rlist(aspect_freq,10,1) util.listTocsv(outfile1,aspects_sel) aspects=util.listfromlist(aspects_sel,0) #aspect_hits=ranking.pmi_list(aspects,topic,"results/pmi_"+topic+".csv") """
    scored = util.csvTolist("results/pmi_" + topic + ".csv")
    scored.sort(key=lambda row: float(row[1]), reverse=True)
    #util.listTocsv(outfile,aspect_hits)
    kept = util.filter_rlist(scored, 6, 1)
    aspect_names = util.listfromlist(kept, 0)
    return algo.GreedyNormal(outfile, per_tweet_aspects, aspect_names, tweets, k)