def sutime_function(text):
    # Translate the Romanian input to English so SUTime can parse it.
    translator = Translator()
    traducere = translator.translate(text, src='ro', dest='en').text

    java_target = "java\\target"
    jar_files = os.path.join(os.path.dirname(__file__), java_target)
    sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    ttext = []
    ttype = []
    tmpdictionar = {}

    # Collect the surface text and temporal type of each annotation,
    # translating both back to Romanian.
    for annotation in sutime.parse(traducere):
        for key, value in annotation.items():
            if key == "text":
                ttext.append(convert_to_romana(value))
            elif key == "type":
                ttype.append(convert_to_romana(value))

    # Group the extracted texts by their temporal type.
    for i in range(len(ttext)):
        try:
            tmpdictionar[ttype[i]].append(ttext[i])
        except KeyError:
            tmpdictionar[ttype[i]] = [ttext[i]]
    return tmpdictionar

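For reference, sutime.parse returns a list of annotation dicts, so tmpdictionar ends up grouping the re-translated surface forms by temporal type; the shapes below are illustrative, not actual output:

# sutime.parse('...') -> [{'text': 'tomorrow', 'type': 'DATE',
#                          'value': '2024-05-14', 'start': 8, 'end': 16}, ...]
# sutime_function(...) -> {'DATE': ['...'], 'TIME': ['...']}  # values illustrative
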
def extract_years(snippet, output):
    """Extract dates from the snippet and pair them with the confidence
    scores computed by extract_entities_textrazor.

    :param snippet: text to scan for temporal expressions
    :param output: entity dictionary whose 'Y' entry is refined
    :return: the updated output dictionary
    """
    jar_files = os.path.join(os.path.dirname(__file__), 'jars')
    sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    # Scan the serialized JSON for '"value": "..."' fields.
    res = json.dumps(sutime.parse(snippet), sort_keys=True, indent=4)
    dates_list = []
    for i in range(len(res)):
        if res[i:i + 5] == 'value':
            j = i + 9
            while res[j] != '"':
                j += 1
            dates_list.append(res[i + 9:j])

    dic_year = output['Y']
    dates_list_new = {'entity': [], 'confidenceScore': []}
    for i in range(len(dic_year['entity'])):
        for ele in dates_list:
            if dic_year['entity'][i][0] in ele:
                if ele not in dates_list_new['entity']:
                    dates_list_new['entity'].append(ele)
                    dates_list_new['confidenceScore'].append(dic_year['confidenceScore'][i])
    output['Y'] = dates_list_new
    return output

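Scanning the pretty-printed JSON for 'value' substrings is fragile: a quote inside a value, or any other key containing the letters 'value', breaks the hard-coded offsets. Since sutime.parse already returns a list of dicts, the same dates can be read off directly; a minimal alternative sketch (function name is hypothetical):

def extract_date_values(snippet, sutime):
    # Each annotation dict carries 'text', 'type', 'value', 'start', 'end'.
    return [span['value'] for span in sutime.parse(snippet) if 'value' in span]
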
def __init__(self, path):
    # Initialize SUTime; the jars are expected next to the given path.
    jar_files = os.path.join(os.path.dirname(path), 'jars')
    self.sutime = SUTime(jars=jar_files, mark_time_ranges=False, include_range=True)

def __init__(self):
    self.api = self.setup_auth()
    self.stream_listener = StreamListener(self)
    self.tz = pytz.timezone('US/Pacific')
    jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
    self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

def __init__(self, classifier_path=None, ner_path=None, sutime_jar_path=None):
    # Change the paths according to your system (raw strings keep the
    # Windows backslashes from being read as escape sequences).
    if classifier_path is None:
        classifier_path = r"C:\stanford_corenlp\stanford-ner-2018-02-27\stanford-ner-2018-02-27\classifiers\english.muc.7class.distsim.crf.ser.gz"
    if ner_path is None:
        ner_path = r"C:\stanford_corenlp\stanford-ner-2018-02-27\stanford-ner-2018-02-27\stanford-ner.jar"
    if sutime_jar_path is None:
        sutime_jar_path = r"C:\stanford_corenlp\stanford-corenlp-full-2018-02-27\stanford-corenlp-full-2018-02-27"

    self.stanford_classifier = classifier_path
    self.stanford_ner_path = ner_path
    self.sutime_path = sutime_jar_path

    # Creating tagger and SUTime objects
    self.st = StanfordNERTagger(self.stanford_classifier, self.stanford_ner_path)
    self.su = SUTime(jars=self.sutime_path, mark_time_ranges=True, include_range=True)

    self.weather_terms = ["weather", "climate", "precipitation", "sun", "rain",
                          "cloud", "snow", "hot", "humid", "cold", "sunny",
                          "windy", "cloudy", "rainy", "snowy", "misty", "foggy",
                          "colder", "hotter", "warmer", "pleasant"]
    self.greet_terms = ["hello", "hey", "howdy", "hi", "yo", "yaw"]
    self.closure_terms = ["no", "nope", "thank you", "bye", "tata", "thanks",
                          "that will be all", "that's it", "that'll be all"]
    self.day_terms = ["dawn", "dusk", "morning", "evening", "noon", "afternoon",
                      "night", "tonight", "midnight", "midday"]  # , "hours"]
    self.date_terms = ["today", "tomorrow", "yesterday"]

def __init__(self):
    # Needs an outgoing config object to check against.
    db_utils.setup_outgoing_config()
    self.api = self.setup_auth()
    self.stream_listener = StreamListener(self)
    jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
    self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)
    self.slacker = Slacker(s.SLACK_TOKEN)

def __init__(self):
    # Twitter API setup
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    self.api = tweepy.API(auth)
    self.tweet_list = []
    self.relevance_scores = []

    # bad words
    response = requests.get(BAD_WORDS_URL)
    self.bad_words = response.text.split('\n')

    # stop words
    self.stopwords = list(stopwords.words('english'))

    # sutime
    jar_files = os.environ.get('JAR_FILES', '/webapps/hackor/hackor/python-sutime/jars')
    self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    # nltk data append
    nltk.data.path.append(os.environ.get('NLTK_CORPUS', '/webapps/hackor/hackor/nltk_data'))

def extract_entitites(snippet):
    """Extract person names, organizations, and years from a snippet.

    :param snippet: a snippet in English
    :return: a dictionary (output) with the extracted person names ('RN'),
        organizations ('U'), and years ('Y')
    """
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(snippet, properties={
        'annotators': 'ner',  # 'sutime'
        'outputFormat': 'json',
        # 'timeout': 1000,
    })

    output = {'RN': [], 'U': [], 'Y': []}

    # Extract the university and person names.
    for sent in range(len(res['sentences'])):
        for element in res['sentences'][sent]['tokens']:
            if element['ner'] == 'PERSON':
                output['RN'].append(element['word'])
            if element['ner'] == 'ORGANIZATION':  # or element['ner'] == 'LOCATION':
                output['U'].append(element['word'])

    # Extract the years by scanning SUTime's serialized JSON output.
    jar_files = os.path.join(os.path.dirname(__file__), 'jars')
    sutime = SUTime(jars=jar_files, mark_time_ranges=True)
    res = json.dumps(sutime.parse(snippet), sort_keys=True, indent=4)
    for i in range(len(res)):
        if res[i:i + 5] == 'value':
            j = i + 9
            while res[j] != '"':
                j += 1
            output['Y'].append(res[i + 9:j])
    return output

class NLUWrapper(object):
    def __init__(self, host='localhost', port=5001, **kwargs):
        self.host, self.port = host, port
        self.sutime = SUTime(jars=os.path.join(os.path.dirname(__file__),
                                               'python-sutime',
                                               'jars'),
                             mark_time_ranges=True)
        print('Initialized with {}:{}'.format(self.host, self.port))

    def annotate(self, in_utterance, modules=()):
        sutime_response = None
        try:
            # SUTime runs locally; strip it from the modules sent to the service.
            if 'SUTime' in modules:
                sutime_response = self.sutime.parse(in_utterance)
                modules = [module for module in modules if module != 'SUTime']
            response = requests.post('http://{}:{}/annotate'.format(self.host, self.port),
                                     json={'state': {'utterance': in_utterance},
                                           'modules': modules},
                                     timeout=5)
        except requests.Timeout:
            return {}
        assert response.status_code == 200, 'Error calling the NLU service'
        result = response.json()
        if sutime_response is not None:
            result['annotations']['SUTime'] = sutime_response
        return result

    def annotate_sentiment(self, in_utterance):
        response = self.annotate(in_utterance, modules=['Preprocessor', 'VaderNLTK'])
        return response['annotations']['sentiment']

    def annotate_ner(self, in_utterance):
        response = self.annotate(in_utterance, modules=['Preprocessor', 'StanfordNER'])
        return response['annotations'].get('ner', {})

    def annotate_pos(self, in_utterance):
        response = self.annotate(in_utterance, modules=['Preprocessor', 'MorphoTagger'])
        return response['annotations'].get('postag', [])

    def annotate_abuse(self, in_utterance):
        response = self.annotate(in_utterance, modules=['Preprocessor', 'AlanaAbuseDetector'])
        return response['annotations'].get('abuse', {})

class DateLinker(BasePipeline):
    def __init__(self, resource_folder=None):
        self.annotator_name = 'Date_Linker'
        if resource_folder is None:
            resource_folder = os.path.join(os.path.dirname(__file__), '../resources/sutime/')
        self.resource_folder = resource_folder
        self.sutime = SUTime(jars=self.resource_folder)

    def run(self, document):
        dates = self.sutime.parse(document.text)
        pattern = re.compile(r"^-*\d*-*\d*-*\d*-*$")
        for date in dates:
            if date["type"] == "DATE" and pattern.match(date["value"]):
                val = date["value"]
                # Pad partial dates (year or year-month, possibly negative)
                # out to a full xsd:dateTime literal.
                digits = val[1:] if val.startswith('-') else val
                if len(digits) == 4:
                    stdform = val + '-00-00T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime'
                elif len(digits) == 7:
                    stdform = val + '-00T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime'
                elif len(digits) == 10:
                    stdform = val + 'T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime'
                else:
                    stdform = val + '^^<http://www.w3.org/2001/XMLSchema#dateTime>'
                start = date["start"]
                end = date["end"]
                entity = Entity(uri=stdform,
                                boundaries=(start, end),
                                surfaceform=document.text[start:end],
                                annotator=self.annotator_name)
                document.entities.append(entity)
        return document

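A quick sketch of what the padding above produces for each partial-date width (example values assumed, following the length checks in run):

# "1990"       -> "1990-00-00T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime"
# "1990-05"    -> "1990-05-00T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime"
# "1990-05-12" -> "1990-05-12T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime"
# any other width keeps its value and gets "^^<...XMLSchema#dateTime>" appended
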
class timeDelta:
    def __init__(self, path):
        # Initialize SUTime; the jars are expected next to the given path.
        jar_files = os.path.join(os.path.dirname(path), 'jars')
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=False, include_range=True)

    def get_times(self, text):
        # Collect all time values found by SUTime.
        parsed = self.sutime.parse(text)
        values = []
        for dic in parsed:
            values.append(dic['value'])
        return values

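A usage sketch for the class above; the path is hypothetical (it only anchors the jar lookup) and the printed values are illustrative:

td = timeDelta('/path/to/project/main.py')  # jars expected in /path/to/project/jars
print(td.get_times('The meeting runs from 2pm to 3pm tomorrow'))
# e.g. ['2024-05-14T14:00', ...] -- actual values depend on the reference date
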
def sutime_with_mark_time_ranges():
    return SUTime(
        jars=os.path.join(*[os.path.dirname(__file__), os.pardir, os.pardir, 'jars']),
        mark_time_ranges=True)

def sutime_with_mark_time_ranges():
    return SUTime(mark_time_ranges=True)

from flask import Flask
from flask import request
import os
import json

from sutime import SUTime

app = Flask(__name__)

jar_files = os.path.join(os.path.dirname(__file__), 'jars')
sutime = SUTime(jars=jar_files, mark_time_ranges=False)


@app.route('/')
def homepage():
    q = request.args.get('q')
    return json.dumps(parse(q))


def parse(s):
    return sutime.parse(s)


if __name__ == '__main__':
    app.run(debug=True, use_reloader=True)

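A hypothetical client call against the endpoint above, assuming the app is running locally on Flask's default port:

import requests

resp = requests.get('http://127.0.0.1:5000/', params={'q': 'lunch at noon tomorrow'})
print(resp.json())  # list of annotation dicts: 'text', 'type', 'value', ...
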
def loadSUtime():
    # Hard-code the project location so the jar lookup works from anywhere.
    __file__ = "/Users/harsha/Documents/cse635_AIR/Project/Main/Code/python-sutime-master/"
    jar_files = os.path.join(os.path.dirname(__file__), 'jars')
    sutime = SUTime(jars=jar_files, mark_time_ranges=True)
    return sutime

def get_sutime():
    # Lazily create a single SUTime instance; start-up (JVM plus model
    # loading) is slow, so the instance is reused across calls.
    global sutime
    if sutime is None:
        sutime = SUTime(jars=jar_path, mark_time_ranges=True)
    return sutime

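get_sutime above relies on two module-level globals; a minimal sketch of that surrounding state, with an assumed jar location:

import os
from sutime import SUTime

jar_path = os.path.join(os.path.dirname(__file__), 'jars')  # illustrative
sutime = None  # filled in lazily by the first get_sutime() call
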
def sutime_with_jvm_flags():
    return SUTime(
        jars=os.path.join(*[os.path.dirname(__file__), os.pardir, os.pardir, "jars"]),
        jvm_flags=("-Xms256m",),
    )

past = ['was', 'had', 'did']
set_past = set(past)

file = open('task4_cases.txt', 'r').read().split('\n')
# fname = 'task4_cases.txt'
# with open(fname) as f:
#     content = f.readlines()

x = "relevant positive"
y = "relevant negative"
z = "not relevant"

stemmer = PorterStemmer()

# path_jar = "/home/kanv/python-sutime/"
jar_files = os.path.join(os.path.dirname(__file__), 'jars')
sutime = SUTime(jars=jar_files, mark_time_ranges=True)

for line in file:
    # line = line.encode('utf-8')
    line = line.encode('ascii', 'ignore')
    sent_tokenize_list = sent_tokenize(line)
    # print(sent_tokenize_list)
    print("-------------------------------")
    res = []
    dt = []
    for sent in sent_tokenize_list:
        sent_list = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)(\s|[A-Z].*)', sent)
        print(sent_list)
        for sent_new in sent_list:
            # print(sent_new)

class Streambot:
    """Stream Twitter and look for tweets that contain targeted words.
    When tweets are found, look for a datetime and room; if present, save
    the tweet to the OutgoingTweet model.

    Ex. bot = Streambot()
    # to run a stream looking for tweets about PyCon
    bot.run_stream(["PyCon"])
    """

    def __init__(self):
        self.api = self.setup_auth()
        self.stream_listener = StreamListener(self)
        jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)
        self.slacker = Slacker(s.SLACK_TOKEN)

    def setup_auth(self):
        """Set up auth for the API and return a tweepy API object"""
        auth = tweepy.OAuthHandler(s.openspaces["CONSUMER_KEY"],
                                   s.openspaces["CONSUMER_SECRET"])
        auth.set_access_token(s.openspaces["ACCESS_TOKEN"],
                              s.openspaces["ACCESS_TOKEN_SECRET"])
        api = tweepy.API(auth)
        return api

    def run_stream(self, search_list=None):
        """Start the stream; when a matching tweet is found, on_status is called.

        search_list arg is a list of terms that will be looked for in tweets
        """
        if search_list is None:
            raise ValueError("Need a list of search terms as arg to run_stream")
        stream = tweepy.Stream(auth=self.api.auth, listener=self.stream_listener)
        stream.filter(track=search_list)

    def send_mention_tweet(self, screen_name):
        """Mention a user in a tweet from the bot letting them know that their
        tweet has been received and that we will send out reminders about
        their event.
        """
        hours_mins = time_utils.get_local_clock_time()
        mention = "@{} just saw your Open Spaces tweet at {}."
        mention += " Pending approval we'll retweet a reminder before your event!"
        mention = mention.format(screen_name, hours_mins)
        try:
            self.api.update_status(status=mention)
        except tweepy.TweepError:
            # If the same user tweets a valid openspaces tweet at the exact
            # same clock time, it causes a duplicate tweet the bot can't send.
            loggly.info("duplicate tweet by openspaces bot in send_mention_tweet")

    def send_slack_message(self, channel, message):
        """Send a slack message to a channel.

        channel options: #outgoing_tweets #need_review #event_conflict
        """
        self.slacker.chat.post_message(channel, message)

    def parse_time_room(self, tweet):
        """Get time and room number from a tweet using SUTime and tweet_utils"""
        extracted_time = self.sutime.parse(tweet)
        time_and_room = tweet_utils.get_time_and_room(tweet, extracted_time)
        return time_and_room

    def value_check(self, time_room_obj):
        """Return a tuple with the counts of values extracted from a tweet in
        the parse_time_room method. This tuple is used to decide how the bot
        will respond to the tweet.
        """
        num_room_values = len(time_room_obj["room"])
        num_time_values = len(time_room_obj["date"])
        return (num_room_values, num_time_values)

    def retweet_logic(self, tweet, tweet_id, screen_name, user_id):
        """Use SUTime to try to parse a datetime out of a tweet; if successful,
        save the tweet to OutgoingTweet to be retweeted.
        """
        # use SUTime to parse a datetime out of tweet
        time_room = self.parse_time_room(tweet)

        # make sure both time and room extracted and only one val each
        val_check = self.value_check(time_room)

        if val_check == (1, 1):
            room = time_room["room"][0]
            date_mention = tweet_utils.check_date_mention(tweet)
            converted_time = time_utils.convert_to_utc(time_room["date"][0], date_mention)

            # check for a time and room conflict, only 1 set of retweets per event;
            # default time range that a room is reserved for is -15 +30 mins
            conflict = db_utils.check_time_room_conflict(converted_time, room)

            if not conflict:
                # send message to slack when a tweet is scheduled to go out
                slack_message = "{} From: {}, id: {}".format(tweet, screen_name, user_id)
                self.send_slack_message('#outgoing_tweets', slack_message)
                self.send_mention_tweet(screen_name)

                # This record lets us check that retweets are not for the same event
                db_utils.create_event(description=tweet,
                                      start=converted_time,
                                      location=room,
                                      creator=screen_name)
                tweet_utils.schedule_tweets(screen_name, tweet, tweet_id, converted_time)
                loggly.info("scheduled this tweet for retweet: {}".format(tweet))
            else:
                message = """Tweet received for an event the bot is already scheduled
                to retweet about. Sender: {}, room: {}, time: {}, tweet: {}
                tweet_id: {}
                """
                message = message.format(screen_name, room, converted_time, tweet, tweet_id)
                self.send_slack_message("#event_conflict", message)
                loggly.info(message)
        elif val_check == (0, 0):
            # tweet found but without valid time or room extracted, ignore
            pass
        else:
            # tweet with relevant information but not exactly 1 time & 1 room
            message = """Tweet found that needs review: {}
            tweet_id: {} screen_name: {}, user_id: {}
            """
            message = message.format(tweet, tweet_id, screen_name, user_id)
            self.send_slack_message("#need_review", message)

def sutime_spanish():
    return SUTime(language='spanish')

class Streambot:
    """Stream Twitter and look for tweets that contain targeted words.
    When tweets are found, look for a datetime and room; if present, save
    the tweet to the OutgoingTweet model.

    Ex. bot = Streambot()
    # to run a stream looking for tweets about PyCon
    bot.run_stream(["PyCon"])
    """

    def __init__(self):
        self.api = self.setup_auth()
        self.stream_listener = StreamListener(self)
        jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)
        self.slacker = Slacker(s.SLACK_TOKEN)

    def setup_auth(self):
        """Set up auth for the API and return a tweepy API object"""
        auth = tweepy.OAuthHandler(s.sender["CONSUMER_KEY"], s.sender["CONSUMER_SECRET"])
        auth.set_access_token(s.sender["ACCESS_TOKEN"], s.sender["ACCESS_TOKEN_SECRET"])
        api = tweepy.API(auth)
        return api

    def run_stream(self, search_list=None):
        """Start the stream; when a matching tweet is found, on_status is called.

        search_list arg is a list of terms that will be looked for in tweets
        """
        if search_list is None:
            raise ValueError("Need a list of search terms as arg to run_stream")
        stream = tweepy.Stream(auth=self.api.auth, listener=self.stream_listener)
        stream.filter(track=search_list)

    def send_mention_tweet(self, screen_name, room, time):
        """Mention a user in a tweet from the bot letting them know that their
        tweet has been received and that we will send out reminders about
        their event.
        """
        mention = "@{} saw your openspaces tweet for: room {} at {}. Times should be relative to US/Pacific"
        mention = mention.format(screen_name, room, time)
        self.api.update_status(status=mention)

    def parse_time_room(self, tweet):
        """Get time and room number from a tweet using SUTime and tweet_utils"""
        extracted_time = self.sutime.parse(tweet)
        time_and_room = tweet_utils.get_time_and_room(tweet, extracted_time)
        return time_and_room

    def loadtest_logic(self, tweet, tweet_id, screen_name, user_id):
        """Logic similar to what is being used in the real bot so that we can
        load test how much volume it can handle before Twitter kicks it off.
        """
        # use SUTime to parse a datetime out of tweet
        time_room = self.parse_time_room(tweet)

        # fake time in the future that imitates an event's start time
        local_tz = pytz.timezone('US/Pacific')
        sample_time = datetime.datetime.now(local_tz) + datetime.timedelta(minutes=10)
        sample_time = sample_time.strftime("%Y-%m-%d %H:%M:%S")
        converted_time = time_utils.convert_to_utc(sample_time)
        room = "r123"

        # check for a time and room conflict, only 1 set of retweets per event
        conflict = db_utils.check_time_room_conflict(converted_time, room)

        # send message to slack when a tweet is scheduled to go out
        slack_message = "{} From: {}, id: {}".format(tweet, screen_name, user_id)
        self.slacker.chat.post_message('#loadtest_tweets', slack_message)

        # This record lets us check that retweets are not for the same event
        db_utils.create_event(description=tweet,
                              start=converted_time,
                              location=room,
                              creator=screen_name)
        tweet_utils.loadtest_schedule_tweets(screen_name, tweet, tweet_id, converted_time)
        print("tweet scheduled for retweet: {}".format(tweet))

def __init__(self, resource_folder=None):
    self.annotator_name = 'Date_Linker'
    if resource_folder is None:
        resource_folder = os.path.join(os.path.dirname(__file__), '../resources/sutime/')
    self.resource_folder = resource_folder
    self.sutime = SUTime(jars=self.resource_folder)

def sutime():
    return SUTime()

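These no-argument factories read like pytest fixtures whose decorators were stripped. A sketch of how such a fixture would be used, assuming pytest and a python-sutime version recent enough to locate its bundled jars when no jars argument is given:

import pytest
from sutime import SUTime

@pytest.fixture
def sutime():
    return SUTime()

def test_parse_finds_a_temporal_span(sutime):
    result = sutime.parse('lunch tomorrow at noon')
    assert any(span['type'] in ('DATE', 'TIME') for span in result)
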
class Streambot:
    """Stream Twitter and look for tweets that contain targeted words.
    When tweets are found, look for a datetime and room; if present, save
    the tweet to the OutgoingTweet model.

    Ex. bot = Streambot()
    # to run a stream looking for tweets about PyCon
    bot.run_stream(["PyCon"])
    """

    def __init__(self):
        self.api = self.setup_auth()
        self.stream_listener = StreamListener(self)
        self.tz = pytz.timezone('US/Pacific')
        jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    def setup_auth(self):
        """Set up auth for the API and return a tweepy API object"""
        auth = tweepy.OAuthHandler(s.listener["CONSUMER_KEY"], s.listener["CONSUMER_SECRET"])
        auth.set_access_token(s.listener["ACCESS_TOKEN"], s.listener["ACCESS_TOKEN_SECRET"])
        api = tweepy.API(auth)
        return api

    def run_stream(self, search_list=None):
        """Start the stream; when a matching tweet is found, on_status in
        StreamListener is called.

        search_list arg is a list of terms that will be looked for in tweets
        """
        if not search_list:
            raise ValueError("Need a list of search terms as arg to run_stream")
        stream = tweepy.Stream(auth=self.api.auth, listener=self.stream_listener)
        stream.filter(track=search_list)

    def convert_to_utc(self, talk_time):
        """Convert the datetime string we get from SUTime to UTC"""
        # get correct local year, month, day
        local_date = datetime.now(self.tz)
        local_date_str = datetime.strftime(local_date, "%Y %m %d")
        year, month, day = local_date_str.split(" ")

        # get SUTime parsed talk time and extract hours, mins
        dt_obj = parse(talk_time)
        local_time_str = datetime.strftime(dt_obj, "%H %M")
        hours, mins = local_time_str.split(" ")

        # build up correct datetime obj, normalize & localize, switch to utc
        correct_dt = datetime(int(year), int(month), int(day), int(hours), int(mins))
        tz_aware_local = self.tz.normalize(self.tz.localize(correct_dt))
        local_as_utc = tz_aware_local.astimezone(pytz.utc)
        return local_as_utc

    def schedule_tweets(self, screen_name, tweet, tweet_id, talk_time):
        """Take a tweet and datetime, schedule a number of reminder tweets
        at set intervals.
        """
        # check config table to see if autosend is on
        config_obj = models.AppConfig.objects.latest("id")
        approved = 1 if config_obj.auto_send else 0

        tweet_url = "https://twitter.com/{name}/status/{tweet_id}"
        embeded_tweet = tweet_url.format(name=screen_name, tweet_id=tweet_id)

        # set num of reminder tweets and interval in mins that tweets are sent;
        # num_tweets = 2 & interval = 15 sends 2 tweets 30 & 15 mins before
        num_tweets = 2
        interval = 1

        for mins in range(interval, (num_tweets * interval + 1), interval):
            remind_time = talk_time - timedelta(minutes=mins)
            message = "Coming up in {} minutes!\n{}".format(mins, embeded_tweet)
            print("message should be saved!!!")

            # saving the tweet to the OutgoingTweet table triggers celery stuff
            tweet_obj = models.Tweets(tweet=message,
                                      approved=approved,
                                      scheduled_time=remind_time)
            tweet_obj.save()

    def retweet_logic(self, tweet, tweet_id, screen_name):
        """Use SUTime to try to parse a datetime out of a tweet; if successful,
        save the tweet to OutgoingTweet to be retweeted.
        """
        print(tweet, tweet_id)
        time_room = self.get_time_and_room(tweet)

        # check to make sure both time and room extracted and only one val for each
        val_check = [val for val in time_room.values() if len(val) == 1]

        if len(val_check) == 2:
            # way to mention a user after a valid tweet is received
            # time_stamp = datetime.datetime.utcnow()
            # mention = "@{} We saw your openspaces tweet!{}".format(screen_name, time_stamp)
            # self.api.update_status(status=mention)

            # need to make time from SUTime match time Django is using
            sutime_stuff = time_room["date"][0]
            print("sutime_stuff: {}".format(sutime_stuff))
            talk_time = self.convert_to_utc(time_room["date"][0])
            print("result from convert_to_utc: {}".format(talk_time))
            self.schedule_tweets(screen_name, tweet, tweet_id, talk_time)

    def get_time_and_room(self, tweet):
        """Get time and room number from a tweet.

        Written by Santi @ https://github.com/adavanisanti
        """
        result = {}
        result["date"] = []
        result["room"] = []

        time_slots = self.sutime.parse(tweet)
        tweet_without_time = tweet

        for time_slot in time_slots:
            tweet_without_time = tweet_without_time.replace(time_slot.get("text"), "")
            result["date"].append(time_slot.get("value"))

        # filter_known_words = [word.lower() for word in word_tokenize(tweet_without_time)
        #                       if word.lower() not in (self.stopwords + nltk.corpus.words.words())]
        filter_known_words = [word.lower() for word in word_tokenize(tweet_without_time)]

        # regular expression for room
        room_re = re.compile(r"([a-zA-Z](\d{3})[-+]?(\d{3})?)")

        for word in filter_known_words:
            if room_re.match(word):
                result["room"].append(room_re.match(word).group())

        return result

def sutime_spanish():
    return SUTime(
        jars=os.path.join(*[os.path.dirname(__file__), os.pardir, os.pardir, "jars"]),
        language="spanish",
    )

def sutime():
    return SUTime(jars=os.path.join(*[os.path.dirname(__file__), os.pardir, os.pardir, 'jars']))

def __init__(self):
    self.api = self.setup_auth()
    self.stream_listener = StreamListener(self)
    jar_files = os.path.join(BASE_DIR, "python-sutime", "jars")
    self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)
    self.slacker = Slacker(s.SLACK_TOKEN)

class RetweetBot:
    def __init__(self):
        # Twitter API setup
        auth = tweepy.OAuthHandler(os.environ.get('CONSUMER_KEY'),
                                   os.environ.get('CONSUMER_SECRET'))
        auth.set_access_token(os.environ.get('ACCESS_TOKEN'),
                              os.environ.get('ACCESS_TOKEN_SECRET'))
        self.api = tweepy.API(auth)
        self.tweet_list = []
        self.relevance_scores = []

        # bad words
        response = requests.get(BAD_WORDS_URL)
        self.bad_words = response.text.split('\n')

        # stop words
        self.stopwords = list(stopwords.words('english'))

        # sutime
        jar_files = os.environ.get('JAR_FILES', '../python-sutime/jars')
        self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

        # nltk data append
        nltk.data.path.append(os.environ.get('NLTK_CORPUS', '/webapps/hackor/hackor/nltk_data'))

    def get_tweets(self, topic="#pycon", quantity=10, result_type="recent,popular"):
        """Get all tweets"""
        tweet_list = self.api.search(q=topic, count=quantity, lang='en',
                                     result_type=result_type)
        print("Retrieved {} candidate tweets.".format(len(tweet_list)))
        self.tweet_list += tweet_list

    def clear_tweets(self):
        self.tweet_list = []
        self.relevance_scores = []

    def score(self, tweet):
        """Defining relevance score as the importance of the user tweeting.

        Features:
            tweeter followers, friends, ratio
            number of hashtags in the tweet (smaller the better)
            (PageRank?)
        Remove tweets that have any bad words.
        """
        if not self.isSafe(tweet.text):
            return MAX_NEGATIVE
        if tweet.text.startswith('RT'):
            return MAX_NEGATIVE

        # influencer ratio
        influencer_ratio = 0
        if tweet.user.friends_count:
            influencer_ratio = tweet.user.followers_count / tweet.user.friends_count

        # number of hashtags
        hashtags = tweet.text.count('#')

        # hashtag word length
        hashtagcount = 0
        for word in tweet.text.split():
            if word.startswith('#'):
                hashtagcount += len(word)

        final_score = influencer_ratio * (hashtagcount / 140) * 1.0 / (1 + hashtags) * tweet.favorite_count
        # NOTE: the computed score is immediately overridden here, so every
        # safe, non-retweet tweet currently scores 1.0.
        final_score = 1.0
        return final_score

    def compute_relevance_scores(self):
        """Computing relevance for all tweets"""
        for _id, tweet in enumerate(self.tweet_list):
            if self.score(tweet) > 0.0:
                self.relevance_scores.append((_id, self.score(tweet)))
        self.relevance_scores.sort(key=lambda tup: tup[1], reverse=True)

    def compose_relevant_slack_messages(self, count=1):
        messages = []
        if self.relevance_scores:
            message = ''
            for score in self.relevance_scores[0:count]:
                tweet_score = score[1]
                print(tweet_score)
                tweet = self.tweet_list[score[0]]
                message = ("RT <https://twitter.com/" + tweet.user.screen_name + "|"
                           + tweet.user.screen_name + ">" + " " + tweet.text)
                message += ("\n <https://twitter.com/" + tweet.user.screen_name
                            + "/status/" + str(tweet.id) + "|Original Tweet>")
                messages.append(message)
        return messages

    def isSafe(self, tweet):
        result = True
        ret = tweet.replace('#', '')
        for word in self.bad_words:
            regex = r"\b(?=\w)" + re.escape(word) + r"\b(?!\w)"
            if re.search(regex, ret, re.IGNORECASE):
                result = False
                break
        return result

    def get_time_and_room(self, tweet):
        """Get time and room number from a tweet"""
        result = {}
        result['date'] = []
        result['room'] = []

        time_slots = self.sutime.parse(tweet)
        tweet_without_time = tweet

        for time_slot in time_slots:
            tweet_without_time = tweet_without_time.replace(time_slot.get('text'), '')
            result['date'].append(time_slot.get('value'))

        filter_known_words = [
            word.lower() for word in word_tokenize(tweet_without_time)
            if word.lower() not in (self.stopwords + nltk.corpus.words.words())
        ]

        # regular expression for room
        room_re = re.compile(r'([a-zA-Z](\d{3})[-+]?(\d{3})?)')

        for word in filter_known_words:
            if room_re.match(word):
                result['room'].append(room_re.match(word).group())

        return result

import os
import pickle
from collections import defaultdict

import numpy as np
from nltk.tag import StanfordNERTagger
from sutime import SUTime

debug = True

# location global vars
stanford_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'stanfordjars')
st = StanfordNERTagger(os.path.join(stanford_dir, 'ner-model.ser.gz'),
                       os.path.join(stanford_dir, 'stanford-ner.jar'))
st._stanford_jar = os.path.join(stanford_dir, '*')
place_to_coords = {}
url_base = 'https://maps.googleapis.com/maps/api/place/textsearch/json'
api_key = 'AIzaSyAVat82-OUFKC9GpyOi3LNyQKwxE2KWY9U'

# time global vars
jar_files = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sutimejars')
sutime = SUTime(jars=jar_files, mark_time_ranges=True)

# FB api global vars
app_id = "1696549057338916"
app_secret = "21090405ac37194a1d4578aeb2371845"  # DO NOT SHARE WITH ANYONE!
access_token = app_id + "|" + app_secret

# classifier global vars
def unpickle():
    pickle_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'pickles')
    with open(os.path.join(pickle_dir, 'clf_driver.pkl'), 'rb') as fid:
        clf_driver = pickle.load(fid)
    with open(os.path.join(pickle_dir, 'clf_roundtrip.pkl'), 'rb') as fid:
        clf_roundtrip = pickle.load(fid)
    with open(os.path.join(pickle_dir, 'clf_relevant.pkl'), 'rb') as fid:
        clf_relevant = pickle.load(fid)

import os
import json

from sutime import SUTime

if __name__ == '__main__':
    test_case = u'I need a desk for tomorrow from 2pm to 3pm'

    # Raw strings keep the Windows backslashes from being read as escapes
    # (plain '\U...' is a SyntaxError in Python 3); the second assignment
    # deliberately overrides the first.
    jar_files = r'C:\Users\Leandra\Anaconda2\lib\site-packages\sutime\jars'
    jar_files = r'C:\Users\Leandra\Documents\Fall2016\NLP\carpool-search\jars'
    print(jar_files)

    sutime = SUTime(jars=jar_files, mark_time_ranges=True)
    print(json.dumps(sutime.parse(test_case), sort_keys=True, indent=4))

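An illustrative sketch of the output shape (actual values depend on the run date and SUTime version); with mark_time_ranges=True the 2pm-to-3pm span can come back as a single range annotation:

# [
#     {
#         "text": "tomorrow from 2pm to 3pm",
#         "type": "DURATION",
#         "start": 18,
#         "end": 42,
#         "value": {"begin": "...", "end": "..."}
#     }
# ]
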