def main(retrain=False):
    """Run the classifier over every document in the Wikipedia data folder.

    Creates ``data/sources/wikipedia`` if it is missing. When that folder
    contains at least one sub-directory, a model is obtained and
    ``classifier.run`` is called on the contents of every non-hidden file
    found while walking the folder. When there are no sub-directories the
    function returns without touching the classifier.

    Args:
        retrain: When true, sample the data folder, build and train a new
            model and save it to ``classifier/models``. When false (the
            default, and the value that used to be hard-coded) the last
            entry of the sorted ``classifier/models`` listing is loaded.

    Returns:
        None.

    NOTE(review): the original formatting of this function was lost. The
    nesting below is the reading in which ``model`` is always bound before
    it is used -- confirm against version control.
    """
    data_folder = 'data/sources/wikipedia'
    models_folder = 'classifier/models'

    if not os.path.exists(data_folder):
        os.makedirs(data_folder)

    # The first item yielded by os.walk is (root, dirs, files) for the top
    # level; an empty ``dirs`` means there is nothing to work on yet.
    if not next(os.walk(data_folder))[1]:
        return

    if retrain:
        samples, target, classes = data.sample(data_folder)
        model = classifier.build(samples.shape, target.shape)
        classifier.train(model, samples, target)
        classifier.save(models_folder, model, classes)
    else:
        # Last name in sorted order -- presumably the most recent model if
        # saved names sort chronologically; verify the naming scheme.
        model_name = sorted(os.listdir(models_folder))[-1]
        model, classes = classifier.load(models_folder, model_name)

    for root, _dirs, files in os.walk(data_folder):
        for filename in files:
            if filename.startswith('.'):
                # Skip hidden files.
                continue
            with open(os.path.join(root, filename)) as f:
                matrix = data.str2mat(f.read())
            # The result is not used here, exactly as in the original code.
            classifier.run(model, matrix)
def retrieve_tweets():
    """Train the tweet classifier, then stream tweets and store tourism ones.

    Calls ``train()`` once, registers a receiver on
    ``streamer.tweet_retrieved`` and then calls ``streamer.stream()``.
    For every received tweet that carries coordinates, the receiver
    classifies the text; tweets classified as ``'tourism'`` are given a
    sentiment label, printed, and stored through ``Tweet.objects.create``.

    Returns:
        None.

    NOTE(review): the original formatting of this function was lost.
    Persistence is nested inside the tourism branch because ``sentiment``
    is only assigned there -- confirm against version control.
    """
    classif = train()

    # called when a new tweet is retrieved from streaming
    @receiver(streamer.tweet_retrieved)
    def my_callback(sender, **kwargs):
        tweet = kwargs['tweet']
        if not tweet['coordinates']:
            return

        tweet_id = tweet['id']
        user = tweet['user']['name'].encode('utf-8')
        # Index 1 is used as latitude and index 0 as longitude.
        lat = tweet['coordinates']['coordinates'][1]
        lng = tweet['coordinates']['coordinates'][0]
        text = tweet['text'].encode('utf-8')

        # Compare coordinates against None rather than by truthiness: 0.0 is
        # a valid latitude/longitude and must not cause the tweet to be
        # dropped.
        if not (tweet_id and user and text) or lat is None or lng is None:
            return

        # classify tweets (tourism or nontourism)
        classification = classif.classify(
            feature_extractor_lda_tripadvisor_top_words_weights(text))

        # for tourism-related tweets, classify as being positive or negative
        if classification == 'tourism':
            sentiment = sentiment_analyzer.classify(feature_extractor(tweet))
            # Single-argument call form prints the same on Python 2 and 3.
            print('Label: {} || Tweet: {}'.format(sentiment, text))

            # create Tweet object and save to db
            tweet_obj = Tweet.objects.create(
                tweet_id=tweet_id,
                user=user,
                lat=lat,
                lng=lng,
                text=text,
                classification=classification,
                sentiment=sentiment,
            )
            # NOTE(review): if Tweet is a standard Django model, create()
            # has already saved the row and this save() is a redundant
            # write -- confirm before removing.
            tweet_obj.save()

    # call streamer
    streamer.stream()
    return None
def retrieve_tweets():
    """Train the tweet classifier, then stream tweets and store tourism ones.

    Calls ``train()`` once, registers a receiver on
    ``streamer.tweet_retrieved`` and then calls ``streamer.stream()``.
    For every received tweet that carries coordinates, the receiver
    classifies the text; tweets classified as ``'tourism'`` are given a
    sentiment label, printed, and stored through ``Tweet.objects.create``.

    Returns:
        None.

    NOTE(review): the original formatting of this function was lost.
    Persistence is nested inside the tourism branch because ``sentiment``
    is only assigned there -- confirm against version control.
    """
    classif = train()

    # called when a new tweet is retrieved from streaming
    @receiver(streamer.tweet_retrieved)
    def my_callback(sender, **kwargs):
        tweet = kwargs['tweet']
        if not tweet['coordinates']:
            return

        tweet_id = tweet['id']
        user = tweet['user']['name'].encode('utf-8')
        # Index 1 is used as latitude and index 0 as longitude.
        lat = tweet['coordinates']['coordinates'][1]
        lng = tweet['coordinates']['coordinates'][0]
        text = tweet['text'].encode('utf-8')

        # Compare coordinates against None rather than by truthiness: 0.0 is
        # a valid latitude/longitude and must not cause the tweet to be
        # dropped.
        if not (tweet_id and user and text) or lat is None or lng is None:
            return

        # classify tweets (tourism or nontourism)
        classification = classif.classify(
            feature_extractor_lda_tripadvisor_top_words_weights(text))

        # for tourism-related tweets, classify as being positive or negative
        if classification == 'tourism':
            sentiment = sentiment_analyzer.classify(feature_extractor(tweet))
            # Single-argument call form prints the same on Python 2 and 3.
            print('Label: {} || Tweet: {}'.format(sentiment, text))

            # create Tweet object and save to db
            tweet_obj = Tweet.objects.create(
                tweet_id=tweet_id,
                user=user,
                lat=lat,
                lng=lng,
                text=text,
                classification=classification,
                sentiment=sentiment,
            )
            # NOTE(review): if Tweet is a standard Django model, create()
            # has already saved the row and this save() is a redundant
            # write -- confirm before removing.
            tweet_obj.save()

    # call streamer
    streamer.stream()
    return None